### Scrape list of cities/URLs from 'All Cities' page (BeautifulSoup)

In [1]:
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import requests
import time, os

In [2]:
# Get complete list of cities from single page

url = 'https://www.expatistan.com/cost-of-living/all-cities' 

response = requests.get(url)
response.status_code  # 2** = successful read

200

In [3]:
page = response.text
soup = BeautifulSoup(page, "lxml")
# print(soup.prettify())

In [4]:
# Returns lists of cities/urls only


list_name = []
list_url = []

for entry in soup.find_all('div', id='all-cities'):        # entire table
    for city in entry.find_all('li'):                      # only cities
        for link in city.find_all('a'):                    # grab links
            list_name.append(link.text)
            list_url.append(link.get('href'))

# list_name
# list_url

In [5]:
# Get corresponding list of countries

current_country = ''
list_country = []

for entry in soup.find_all('div', id='all-cities'):         # entire table
     for link in entry.find_all('a'):                       # grab links
            url = link.get('href')
            name = link.text
            
            if '/country/' in url:                          # common feature to country URLs
                current_country = name                      # update 'Country' label
                pass                                        # not new entry
            else:
                list_country.append(current_country)        # if URL not country, it's a city to label

# list_country

In [6]:
# Create dataframe from three lists

list_tuples1 = list(zip(list_name, list_country, list_url))
df1 = pd.DataFrame(list_tuples1, columns = ['City', 'Country', 'URL'])


In [7]:
# Remove cities lacking any user-contributed data

df1 = df1[~df1["URL"].str.contains('/rate/')]          # cities with /rate/ in URL are placeholders without contributions
df1.reset_index(drop=True, inplace=True)             

# 2,475 eligible cities (Afghanistan>Kabul ... Zimbabwe>Zvishavane District) 

df1

Unnamed: 0,City,Country,URL
0,Kabul,Afghanistan,https://www.expatistan.com/cost-of-living/kabul
1,Mariehamn,Aland Islands,https://www.expatistan.com/cost-of-living/mari...
2,Durrës,Albania,https://www.expatistan.com/cost-of-living/durres
3,Elbasan,Albania,https://www.expatistan.com/cost-of-living/elbasan
4,Korçë,Albania,https://www.expatistan.com/cost-of-living/korce
...,...,...,...
2470,Kadoma,Zimbabwe,https://www.expatistan.com/cost-of-living/kadoma
2471,Marondera,Zimbabwe,https://www.expatistan.com/cost-of-living/maro...
2472,Masvingo,Zimbabwe,https://www.expatistan.com/cost-of-living/masv...
2473,Mutare,Zimbabwe,https://www.expatistan.com/cost-of-living/mutare


### Collect target (cost of living), feature, and contribution info from each city page  

In [70]:
# For cleaning cost-of-living string

def COL_to_float(string):
    
    str_list = string.split(" ")
    cost_raw = str_list[0]
    cost_str = cost_raw.replace('$', '').replace(',', '')
    cost = float(cost_str)
    
    return cost

In [71]:
# Instantiate empty list for each variable

rent_85m2_expens = []
rent_85m2_normal = []
rent_45m2_expens = []
rent_45m2_normal = []
eatout_lunch = []
eatout_dinner = []
taxi_5mi = []
gas_liter = []
pubtrans_monthly = []
internet_monthly = []
TV_40in = []
cappuccino = []
mobile_wifi_128gb = []

list_COL_fam4 = []
list_COL_sin1 = []

list_n_prices = []
list_n_people = []

In [72]:
# Lists of features for looping

list_feature_texts = ['Monthly rent for 85 m2 (900 sqft) furnished accommodation in expensive area',
                 'Monthly rent for 85 m2 (900 sqft) furnished accommodation in normal area',
                 'Monthly rent for a 45 m2 (480 sqft) furnished studio in expensive area',
                 'Monthly rent for a 45 m2 (480 sqft) furnished studio in normal area',
                 'Basic lunchtime menu (including a drink) in the business district',
                 'Basic dinner out for two in neighborhood pub',
                 'Taxi trip on a business day, basic tariff, 8 km. (5 miles)',
                 '1 liter (1/4 gallon) of gas',
                 'Monthly ticket public transport',
                 'Internet 8 mbps (1 month)',
                 '40” flat screen tv',                     # special quotation mark! (”)
                 'Cappuccino in expat area of the city',
                 'Ipad wi-fi 128gb']

list_feature_names = ['rent_85m2_expens',
                      'rent_85m2_normal',
                      'rent_45m2_expens',
                      'rent_45m2_normal',
                      'eatout_lunch',
                      'eatout_dinner',
                      'taxi_5mi',
                      'gas_liter',
                      'pubtrans_monthly',
                      'internet_monthly',
                      'TV_40in',
                      'cappuccino',
                      'mobile_wifi_128gb']


list_of_lists = [rent_85m2_expens,
                      rent_85m2_normal,
                      rent_45m2_expens,
                      rent_45m2_normal,
                      eatout_lunch,
                      eatout_dinner,
                      taxi_5mi,
                      gas_liter,
                      pubtrans_monthly,
                      internet_monthly,
                      TV_40in,
                      cappuccino,
                      mobile_wifi_128gb]

list_of_urls = list(df1["URL"])

In [73]:
# Scraping tool...be sure lists in cell above are empty!

import random
from fake_useragent import UserAgent

for url in list_of_urls[1781:]:  # from Al Mubarraz, Saudi Arabia
    
    new_url = url + '?currency=USD'  # generate page version with USD
    
    ua = UserAgent()
    user_agent = {'User-agent': ua.random}                       # new user_agent every iteration
    response = requests.get(new_url, headers = user_agent)                       
    page = response.text
    soup_usd = BeautifulSoup(page, "lxml")
    
    # 1. Get price features
    
    for i, feature in enumerate(list_feature_texts):
        
        chunk1 = soup_usd.find('a', text=feature)
        check = chunk1.findNext().findNext()            
        
        if check.name == 'tr':    # 'tr' if country uses USD only
            feature_value = chunk1.findNext().text[2:-2]
        else:                     # 'li' if USD is not only currency
            feature_value = chunk1.findNext().findNext().text[2:-2]

        if feature_value[0] != '$':  # no data for feature
           feature_value = np.nan
        else:
            fv_str = feature_value.replace('$', '').replace(',', '')
            feature_value = float(fv_str)
        
        list_of_lists[i].append(feature_value)
    
    # 2. Get cost of living estimates
    
    if soup_usd.find_all('div', class_='cost-of-living-summary centered right-widget'):    # Summary window must be present
        chunk2 = soup_usd.find('div', class_='cost-of-living-summary centered right-widget')
        COL_fam4_raw = chunk2.find_all('span', class_='price')[0].text
        COL_fam4 = COL_to_float(COL_fam4_raw)
        COL_sin1_raw = chunk2.find_all('span', class_='price')[1].text
        COL_sin1 = COL_to_float(COL_sin1_raw)
    else:
        COL_fam4, COL_sin1 = np.nan, np.nan
    
    list_COL_sin1.append(COL_sin1)
    list_COL_fam4.append(COL_fam4)
    
    # 3. Get contribution info
    
    chunk3 = soup_usd.find_all('div', class_='accuracy-report')[-1].text       # evaluation always last on page
    sentences = chunk3.split('.')                                              # split into sentences
    sentence_raw = sentences[2].strip()                                        # select 3rd sentence in accuracy-report
    sentence = sentence_raw.replace('less than ', '-')                         # flag cities with scarce data as negative for now
    words = sentence.split(' ')
    # no_prices, no_people = words[4], words[8]
    no_prices, no_people = int(words[4].replace(',', '')), int(words[8].replace(',', '')) 
    
    list_n_prices.append(no_prices)                               
    list_n_people.append(no_people)
    
    # 4. Pause like a human
    
    wait = .5 + 10 * random.random()
    time.sleep(wait)
    print(f'{url[41:]}, {no_prices}, ${COL_sin1}, waited {wait:0.4} sec.')

/al-mubarraz, -10, $1514.0, waited 2.391 sec.
/al-qurayyat, -10, $nan, waited 2.861 sec.
/arar, -10, $920.0, waited 7.16 sec.
/buraidah, 12, $933.0, waited 5.214 sec.
/dammam, 74, $1218.0, waited 5.32 sec.
/jeddah, 126, $1062.0, waited 7.589 sec.
/jizan, -10, $1212.0, waited 8.114 sec.
/jubail, -10, $1275.0, waited 5.163 sec.
/khamis-mushait, -10, $862.0, waited 9.105 sec.
/mecca, -10, $1181.0, waited 4.197 sec.
/medina, 22, $1243.0, waited 7.489 sec.
/najran, -10, $nan, waited 6.287 sec.
/riyadh, 270, $1741.0, waited 9.969 sec.
/sakaka, -10, $739.0, waited 5.726 sec.
/at-ta-if, -10, $nan, waited 1.71 sec.
/tabuk, -10, $1064.0, waited 6.955 sec.
/yanbu-al-bahr, -10, $nan, waited 8.462 sec.
/dakar, 43, $1964.0, waited 1.007 sec.
/grand-dakar, -10, $nan, waited 4.803 sec.
/kolda, -10, $nan, waited 2.787 sec.
/saint-louis, -10, $nan, waited 9.573 sec.
/ziguinchor, -10, $nan, waited 5.599 sec.
/belgrade, 261, $1168.0, waited 8.485 sec.
/kraljevo, -10, $896.0, waited 6.518 sec.
/krusevac, 3

/uppsala, 31, $2020.0, waited 8.791 sec.
/visby, -10, $1755.0, waited 2.602 sec.
/vasteras, -10, $1780.0, waited 1.155 sec.
/vaxjo, -10, $1557.0, waited 1.483 sec.
/orebro, -10, $1719.0, waited 4.401 sec.
/aarau, -10, $3163.0, waited 5.362 sec.
/basel, 61, $3233.0, waited 1.457 sec.
/bern, 126, $3198.0, waited 8.358 sec.
/geneva, 144, $4717.0, waited 0.612 sec.
/lausanne, 96, $3365.0, waited 10.03 sec.
/lugano, 52, $3430.0, waited 9.593 sec.
/neuchatel, -10, $2898.0, waited 8.79 sec.
/schaffhausen, -10, $2968.0, waited 3.636 sec.
/sion, 42, $2575.0, waited 2.602 sec.
/st-gallen, 14, $2825.0, waited 2.105 sec.
/zurich, 510, $4151.0, waited 6.291 sec.
/aleppo, -10, $215.0, waited 9.57 sec.
/damascus, -10, $220.0, waited 2.93 sec.
/homs, -10, $735.0, waited 7.621 sec.
/palmyra, -10, $nan, waited 9.795 sec.
/hsinchu-city, -10, $1373.0, waited 6.6 sec.
/hualian, -10, $nan, waited 4.548 sec.
/kao-hsiung, -10, $1232.0, waited 6.018 sec.
/kaohsiung-city, -10, $1397.0, waited 2.962 sec.
/taichu

/ras-al-khaymahs-al-khaymah, 58, $1549.0, waited 3.344 sec.
/sharjah, 143, $1572.0, waited 7.969 sec.
/aberdeen, -10, $2051.0, waited 2.413 sec.
/aberystwyth, 10, $2282.0, waited 9.661 sec.
/belfast, 92, $1900.0, waited 5.683 sec.
/birmingham-united-kingdom, 122, $2074.0, waited 1.15 sec.
/brighton-and-hove, 111, $3125.0, waited 2.605 sec.
/bristol, 105, $2650.0, waited 0.5503 sec.
/cambridge, 94, $2714.0, waited 1.514 sec.
/cardiff, 35, $2028.0, waited 10.32 sec.
/coventry, 23, $2468.0, waited 3.604 sec.
/dundee, -10, $1767.0, waited 9.588 sec.
/edinburgh, 330, $2716.0, waited 5.994 sec.
/exeter, -10, $2058.0, waited 2.371 sec.
/glasgow, 126, $1967.0, waited 10.02 sec.
/gloucester, 11, $2252.0, waited 5.648 sec.
/inverness, -10, $2742.0, waited 5.868 sec.
/ipswich, -10, $2164.0, waited 1.278 sec.
/kingston-upon-hull, 46, $2200.0, waited 10.08 sec.
/leeds, 147, $2023.0, waited 4.516 sec.
/leicester, 53, $1778.0, waited 9.14 sec.
/liverpool, 203, $1839.0, waited 3.447 sec.
/london, 1289

/reno, 192, $2236.0, waited 8.715 sec.
/richmond-virginia, 133, $1887.0, waited 4.59 sec.
/riverside, 56, $2510.0, waited 4.089 sec.
/roanoke-virginia, 27, $1762.0, waited 8.482 sec.
/rochester-new-york, 116, $1931.0, waited 0.6479 sec.
/rockford-illinois, 31, $nan, waited 9.542 sec.
/sacramento, 221, $2684.0, waited 4.588 sec.
/salt-lake-city, 230, $2308.0, waited 7.833 sec.
/san-antonio-california, -10, $1832.0, waited 5.724 sec.
/san-antonio-texas, 395, $2191.0, waited 2.237 sec.
/san-bernardino-california, 54, $2752.0, waited 4.219 sec.
/san-diego, 434, $3602.0, waited 6.079 sec.
/san-francisco, 278, $4474.0, waited 1.824 sec.
/san-jose-california, 284, $3863.0, waited 5.198 sec.
/san-luis-obispo-california, 92, $3086.0, waited 7.655 sec.
/santa-ana, 149, $2877.0, waited 3.283 sec.
/santa-barbara-california, 79, $3626.0, waited 4.602 sec.
/santa-cruz-california, 65, $3517.0, waited 8.089 sec.
/santa-fe-new-mexico, 19, $2397.0, waited 2.953 sec.
/savannah-georgia, 82, $2043.0, waite

In [74]:
# Error occurred mid-page if all four not same length -> re-scrape

print(len(rent_85m2_expens), len(mobile_wifi_128gb), len(list_COL_sin1), len(list_n_prices))

694 694 694 694


In [75]:
# Concatenate into full COL dataset

list_tuples2 = list(zip(rent_85m2_expens,
                      rent_85m2_normal,
                      rent_45m2_expens,
                      rent_45m2_normal,
                      eatout_lunch,
                      eatout_dinner,
                      taxi_5mi,
                      gas_liter,
                      pubtrans_monthly,
                      internet_monthly,
                      TV_40in,
                      cappuccino,
                      mobile_wifi_128gb))

list_tuples3 = list(zip(list_COL_sin1,
                       list_COL_fam4))

list_tuples4 = list(zip(list_n_prices,
                      list_n_people))

df2 = pd.DataFrame(list_tuples2, columns = list_feature_names)
df3 = pd.DataFrame(list_tuples3, columns = ['COL (Family of 4)',
                                            'COL (Single of 1)'])
df4 = pd.DataFrame(list_tuples4, columns = ['No. Prices',
                                            'No. Contributors'])

In [77]:
# df1_split = df1[1781:].reset_index(drop=True)  #resetting renames
# df1_split

Unnamed: 0,City,Country,URL
0,Al Mubarraz,Saudi Arabia,https://www.expatistan.com/cost-of-living/al-m...
1,Al Qurayyat,Saudi Arabia,https://www.expatistan.com/cost-of-living/al-q...
2,Arar,Saudi Arabia,https://www.expatistan.com/cost-of-living/arar
3,Buraidah,Saudi Arabia,https://www.expatistan.com/cost-of-living/bura...
4,Dammam,Saudi Arabia,https://www.expatistan.com/cost-of-living/dammam
...,...,...,...
689,Kadoma,Zimbabwe,https://www.expatistan.com/cost-of-living/kadoma
690,Marondera,Zimbabwe,https://www.expatistan.com/cost-of-living/maro...
691,Masvingo,Zimbabwe,https://www.expatistan.com/cost-of-living/masv...
692,Mutare,Zimbabwe,https://www.expatistan.com/cost-of-living/mutare


In [78]:
# check df1 name before joining

df_cities_test = pd.concat([df1_split, df2, df3, df4], axis=1, ignore_index=True)
df_cities_test

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,Al Mubarraz,Saudi Arabia,https://www.expatistan.com/cost-of-living/al-m...,1200.0,533.0,800.0,400.0,13.00,27.0,5.83,0.13,240.0,27.0,533.0,3.73,,1514.0,3444.0,-10,-5
1,Al Qurayyat,Saudi Arabia,https://www.expatistan.com/cost-of-living/al-q...,320.0,267.0,,,3.38,16.0,,,,,294.0,1.87,,,,-10,-5
2,Arar,Saudi Arabia,https://www.expatistan.com/cost-of-living/arar,754.0,302.0,307.0,213.0,4.35,9.0,16.00,0.52,,28.0,233.0,2.13,133.0,920.0,2101.0,-10,-5
3,Buraidah,Saudi Arabia,https://www.expatistan.com/cost-of-living/bura...,517.0,468.0,365.0,232.0,7.00,,,0.33,,38.0,312.0,4.56,666.0,933.0,2316.0,12,-5
4,Dammam,Saudi Arabia,https://www.expatistan.com/cost-of-living/dammam,927.0,548.0,627.0,367.0,9.00,23.0,19.00,0.42,34.0,49.0,293.0,3.62,491.0,1218.0,2666.0,74,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,Kadoma,Zimbabwe,https://www.expatistan.com/cost-of-living/kadoma,,,,,,,,,,,,,,,,-10,-5
690,Marondera,Zimbabwe,https://www.expatistan.com/cost-of-living/maro...,,,,,,,,,,,,,,,,-10,-5
691,Masvingo,Zimbabwe,https://www.expatistan.com/cost-of-living/masv...,,,,,,,,,,,,,,,,-10,-5
692,Mutare,Zimbabwe,https://www.expatistan.com/cost-of-living/mutare,,,,,,,,,,,,,,,,-10,-5


In [80]:
df_cities = pd.concat([df1_split, df2, df3, df4], axis=1, ignore_index=True)
# df_allcities = pd.concat([df1, df2, df3, df4], axis=1) # once 
df_cities.tail(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
674,Aden,Yemen,https://www.expatistan.com/cost-of-living/aden,1116.0,778.0,491.0,317.0,8.0,11.0,,0.82,77.0,79.0,691.0,1.86,,1639.0,3657.0,26,-5
675,Al Mukalla,Yemen,https://www.expatistan.com/cost-of-living/al-m...,411.0,,598.0,,3.73,,,1.25,,143.0,957.0,1.2,,,,-10,-5
676,Ibb,Yemen,https://www.expatistan.com/cost-of-living/ibb,,,,,,,,,,,,,,,,-10,-5
677,San‘a’,Yemen,https://www.expatistan.com/cost-of-living/sana,571.0,576.0,1178.0,418.0,10.0,17.0,,0.82,26.0,82.0,293.0,3.3,120.0,1153.0,2735.0,75,-5
678,Chingola,Zambia,https://www.expatistan.com/cost-of-living/chin...,606.0,454.0,485.0,242.0,43.0,18.0,,0.67,,,333.0,,,,,-10,-5
679,Chipata,Zambia,https://www.expatistan.com/cost-of-living/chipata,606.0,273.0,,,1.51,4.24,,0.81,,3.03,91.0,1.51,,,,-10,-5
680,Kabwe,Zambia,https://www.expatistan.com/cost-of-living/kabwe,1090.0,909.0,1212.0,454.0,2.83,24.0,,0.77,24.0,12.0,182.0,3.03,55.0,1323.0,2227.0,-10,-5
681,Kasama,Zambia,https://www.expatistan.com/cost-of-living/kasama,,,,,,9.0,,,,,,,,,,-10,-5
682,Livingstone,Zambia,https://www.expatistan.com/cost-of-living/livi...,417.0,,,,,,,,73.0,,97.0,,,,,-10,-5
683,Lusaka,Zambia,https://www.expatistan.com/cost-of-living/lusaka,803.0,598.0,576.0,360.0,8.0,17.0,16.0,1.04,23.0,24.0,345.0,1.77,287.0,1049.0,2061.0,54,14


In [87]:
# Save your progress to disk!!

# df_cities1781_2474 = df_cities.copy()
# df_cities1781_2474.to_pickle('df_cities1781_2474')

# For quick recovery

# df_cities = pd.read_pickle(df_cities981_1780.pkl)

In [101]:
print(df_cities000_980.shape,  # last city: Pontianak, Indonesia
    df_cities981_1780.shape,   # last city: Sao Tome, ST&P
    df_cities1781_2474.shape)  # last city: Zvishavane District, Zimbabwe

(981, 20) (800, 20) (694, 20)


In [103]:
df_allcities

Unnamed: 0,City,Country,URL,rent_85m2_expens,rent_85m2_normal,rent_45m2_expens,rent_45m2_normal,eatout_lunch,eatout_dinner,taxi_5mi,gas_liter,pubtrans_monthly,internet_monthly,TV_40in,cappuccino,mobile_wifi_128gb,COL (Family of 4),COL (Single of 1),No. Prices,No. Contributors
0,Kabul,Afghanistan,https://www.expatistan.com/cost-of-living/kabul,429.0,254.0,295.0,213.0,4.41,9.0,4.12,0.49,31.0,37.0,229.0,2.01,259.0,619.0,1438.0,206,13
1,Mariehamn,Aland Islands,https://www.expatistan.com/cost-of-living/mari...,1174.0,998.0,910.0,587.0,15.00,41.0,23.00,1.80,121.0,32.0,587.0,4.74,,1934.0,4549.0,-10,-5
2,Durrës,Albania,https://www.expatistan.com/cost-of-living/durres,263.0,725.0,491.0,288.0,30.00,24.0,,1.66,14.0,18.0,654.0,1.66,,995.0,2516.0,-10,-5
3,Elbasan,Albania,https://www.expatistan.com/cost-of-living/elbasan,275.0,198.0,154.0,103.0,3.39,19.0,,1.60,,10.0,387.0,1.30,,628.0,1916.0,31,-5
4,Korçë,Albania,https://www.expatistan.com/cost-of-living/korce,2073.0,343.0,820.0,375.0,6.00,24.0,20.00,1.63,85.0,13.0,458.0,1.67,,1888.0,3964.0,-10,-5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
689,Kadoma,Zimbabwe,https://www.expatistan.com/cost-of-living/kadoma,,,,,,,,,,,,,,,,-10,-5
690,Marondera,Zimbabwe,https://www.expatistan.com/cost-of-living/maro...,,,,,,,,,,,,,,,,-10,-5
691,Masvingo,Zimbabwe,https://www.expatistan.com/cost-of-living/masv...,,,,,,,,,,,,,,,,-10,-5
692,Mutare,Zimbabwe,https://www.expatistan.com/cost-of-living/mutare,,,,,,,,,,,,,,,,-10,-5


In [97]:
# # Needed to run b/c column names were dropped for 2nd and 3rd sets

# df_unlabeledcities = pd.concat([df_cities981_1780, 
#                           df_cities1781_2474], axis=0) # once

# df_unlabeledcities.columns = ['City', 'Country', 'URL', 
#                         'rent_85m2_expens', 'rent_85m2_normal',
#                         'rent_45m2_expens', 'rent_45m2_normal',
#                         'eatout_lunch', 'eatout_dinner',
#                         'taxi_5mi', 'gas_liter',
#                         'pubtrans_monthly', 'internet_monthly',
#                         'TV_40in', 'cappuccino', 'mobile_wifi_128gb',
#                         'COL (Family of 4)', 'COL (Single of 1)',
#                         'No. Prices', 'No. Contributors']

# df_allcities = pd.concat([df_cities000_980, 
#                           df_unlabeledcities], axis=0)

# df_allcities.shape

(2475, 20)

In [99]:
# Save your work

df_allcities.to_pickle('df_allcities')
df_allcities.to_csv('allcitiescsvcopy', index=False)

# For quick recovery

# df_allcities = pd.read_pickle(df_allcities.pkl)
# df_allcities = pd.read_csv('allcitiescsvcopy')

### (Not needed)

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [None]:
def currency_to_usd(url):
    chromedriver = "/Applications/chromedriver" 
    os.environ["webdriver.chrome.driver"] = chromedriver
    
    driver = webdriver.Chrome(chromedriver)
    driver.get(url)
    time.sleep(20)
    
    currency_dropdown = driver.find_element_by_id('change_currency')
    currency_dropdown.click()
    time.sleep(2)

    currency_dropdown.send_keys("USD - $ (Dollar)")
    time.sleep(2)

    soup_usd = BeautifulSoup(driver.page_source, 'html.parser')
    return soup_usd

In [None]:
# Raw HTML for navigation (currency unconverted)
# Continue testing with Kabul

driver = webdriver.Chrome(chromedriver)
driver.get('https://www.expatistan.com/cost-of-living/kabul')
time.sleep(2)

currency_dropdown = driver.find_element_by_id('change_currency')
currency_dropdown.click()
time.sleep(2)

currency_dropdown.send_keys("USD - $ (Dollar)")
time.sleep(2)

soup_usd = BeautifulSoup(driver.page_source, 'html.parser')
soup_usd.prettify

# May not need Selenium for Expatistan...

In [None]:
# Get cost of living for single person and family of four

# chunk = soup_usd.find('div', class_='cost-of-living-summary centered right-widget')
# COL_fam4_str = chunk.find_all('span', class_='price')[0].text
# COL_sin1_str = chunk.find_all('span', class_='price')[1].text

# print(str_COL_fam4, type(str_COL_fam4))

In [None]:
## Step by step for single city
# 1. Import HTML from URL as BS object 

url = 'https://www.expatistan.com/cost-of-living/guayaquil'      # Guayaquil, Ecuador

response = requests.get(url)                              
# response.status_code
page = response.text
soup_usd = BeautifulSoup(page, "lxml")

# 2. Find corresponding text in BeatifulSoup object

feature = 'Basic lunchtime menu (including a drink) in the business district'

chunk = soup_usd.find('a', text=feature)
check = chunk.findNext().findNext()
    if check.name == 'tr':
        feature_value = chunk.findNext().text[2:-2]
    else:
        feature_value = chunk1.findNext().findNext().text[2:-2]

In [None]:
url = 'https://www.expatistan.com/cost-of-living/buenos-aires?currency=USD'      # Guayaquil, Ecuador

response = requests.get(url)                              
# response.status_code
page = response.text
soup_usd = BeautifulSoup(page, "lxml")

# 2. Find corresponding text in BeatifulSoup object

chunk1 = soup_usd.find('a', text='Basic lunchtime menu (including a drink) in the business district')
check = chunk1.findNext().findNext()
check.name

In [None]:
feature_value = chunk1.findNext().findNext()
        
        if feature_value[0] != '$':                              # causes errors in countries 
           feature_value = np.nan
        else:
            fv_str = feature_value.replace('$', '').replace(',', '')
            feature_value = float(fv_str)
        
        list_of_lists[i].append(feature_value)

# 3. Append to corresponding lists

list_n_prices = []
list_n_people = []

n_prices = contribution_tuple[0]
n_people = contribution_tuple[1]

list_n_prices.append(n_prices)
list_n_people.append(n_people)

print(list_n_prices, list_n_people)    # should be ['206'] ['13'] for Kabul

In [None]:
# # Define function for getting numbers of contributions/contributors.

# import random
# from fake_useragent import UserAgent

# def get_contribution_info(url):
    
#     ua = UserAgent()
#     user_agent = {'User-agent': ua.random}                                   # new user_agent every iteration
    
#     response = requests.get(url, headers = user_agent)                       
#     page = response.text
#     soup = BeautifulSoup(page, "lxml")
    
#     if soup.find_all('div', class_='cost-of-living-summary centered right-widget'):    # Summary window must be present
#         chunk = soup.find_all('div', class_='accuracy-report')[-1].text                # evaluation always last on page
#         sentences = chunk.split('.')                                                   # split into sentences
#         sentence_raw = sentences[2].strip()                                            # select 3rd sentence in accuracy-report
#         sentence = sentence_raw.replace('less than ', '-')                             # flag cities with scarce data as negative for now
#         words = sentence.split(' ')
#         no_prices, no_people = words[4], words[8]
#     else:
#         no_prices, no_people = 'ERROR', 'ERROR'                        # flagged to filter out later

#     return no_prices, no_people

In [None]:
# # Scrape contributor info with delay
# # Run overnight

# list_n_prices = []
# list_n_people = []

# for url in list_url2:
    
#     contribution_tuple = get_contribution_info(url)
#     n_prices = contribution_tuple[0]
#     n_people = contribution_tuple[1]
    
#     list_n_prices.append(n_prices)                               
#     list_n_people.append(n_people)
    
#     wait = .5 + 10 * random.random()
#     time.sleep(wait)
#     print(f'{url[41:]}, {n_prices}, {n_people}, waited {wait:0.4} sec.')