1. Scrape list of cities with hyperlinks from 'All Cities' page (BeautifulSoup)

In [1]:
import pandas as pd

from bs4 import BeautifulSoup
import requests
import time, os

In [3]:
# Complete list of cities (provided by site manager)

url = 'https://www.expatistan.com/cost-of-living/all-cities' 

response = requests.get(url)
#response.text
response.status_code  # 2** = successful read

200

In [4]:
page = response.text
soup = BeautifulSoup(page, "lxml")
# print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <meta content="text/html; charset=utf-8" http-equiv="content-type"/>
  <link href="https://www.expatistan.com/cost-of-living/all-cities" hreflang="en" rel="alternate"/>
  <link href="https://www.expatistan.com/es/costo-de-vida/todas-las-ciudades" hreflang="es" rel="alternate"/>
  <link href="https://www.expatistan.com/pt/custo-de-vida/todas-as-cidades" hreflang="pt" rel="alternate"/>
  <title>
   Cost of Living Comparisons, 2021 data.
  </title>
  <meta content="Cost of Living comparisons for thousands of cities. Fully up-to-date cost of living comparisons, including prices of 52 products and services." name="description"/>
  <meta content="Cost of Living Comparisons, 2021 data." property="og:title"/>
  <meta content="Cost of Living comparisons for thousands of cities. Fully up-to-date cost of living comparisons, including prices of 52 products and services." property="og:description"/>
  <meta content="website" property="og:type"/>
  <meta co

In [None]:
# # Toolbox

# %%script false

# for link in soup.find_all('a')[:50]:
#     print(link, '\n')

# for element in soup.find_all(class_='mojo-navigation-tab'):
#     print(element.prettify())

# for link in soup.find_all('div', id='three-columns')[:50]:
#     print(link, '\n')

# soup.find_all('li').size

In [None]:
# # Returned list includes countries...not ideal but useful later?

# list_name = []
# list_url = []

# for entry in soup.find_all('div', id='all-cities'):
#     for link in entry.find_all('a'):
#         list_name.append(link.text)
#         list_url.append(link.get('href'))
        
# list_url

In [5]:
# Returns lists of cities/urls only


list_name = []
list_url = []

for entry in soup.find_all('div', id='all-cities'):        # entire table
    for city in entry.find_all('li'):                      # only cities
        for link in city.find_all('a'):                    # grab links
            list_name.append(link.text)
            list_url.append(link.get('href'))

# list_name
# list_url

In [6]:
# Get corresponding list of countries

current_country = ''
list_country = []

for entry in soup.find_all('div', id='all-cities'):         # entire table
     for link in entry.find_all('a'):                       # grab links
            url = link.get('href')
            name = link.text
            
            if '/country/' in url:                          # common feature to country URLs
                current_country = name                      # update 'Country' label
                pass                                        # not new entry
            else:
                list_country.append(current_country)        # if URL not country, it's a city to label

# list_country

In [7]:
# Create dataframe from three lists

list_tuples = list(zip(list_name, list_country, list_url))
df = pd.DataFrame(list_tuples, columns = ['City', 'Country', 'URL'])


In [9]:
# Remove cities lacking any user-contributed data

df_clean = df[~df["URL"].str.contains('/rate/')]          # cities with /rate/ in URL are placeholders without contributions
df_clean.reset_index(drop=True, inplace=True)             

# 2,475 eligible cities (Afghanistan>Kabul ... Zimbabwe>Zvishavane District) 

df_clean

Unnamed: 0,City,Country,URL
0,Kabul,Afghanistan,https://www.expatistan.com/cost-of-living/kabul
1,Mariehamn,Aland Islands,https://www.expatistan.com/cost-of-living/mari...
2,Durrës,Albania,https://www.expatistan.com/cost-of-living/durres
3,Elbasan,Albania,https://www.expatistan.com/cost-of-living/elbasan
4,Korçë,Albania,https://www.expatistan.com/cost-of-living/korce
...,...,...,...
2470,Kadoma,Zimbabwe,https://www.expatistan.com/cost-of-living/kadoma
2471,Marondera,Zimbabwe,https://www.expatistan.com/cost-of-living/maro...
2472,Masvingo,Zimbabwe,https://www.expatistan.com/cost-of-living/masv...
2473,Mutare,Zimbabwe,https://www.expatistan.com/cost-of-living/mutare


2. 1. Collect A) number of entries and B) number of contributors from each city page  
   2. Sort list of cities descending by A, B
   3. Filter list to top 1000 cities

In [10]:
# For cleaning cost-of-living string

def COL_to_float(string):
    
    str_list = string.split(" ")
    cost_raw = str_list[0]
    cost_str = cost_raw.replace('$', '').replace(',', '')
    cost = float(cost_str)
    
    return cost

In [14]:
# Instantiate empty list for each feature

rent_85m2_expens = []
rent_85m2_normal = []
rent_45m2_expens = []
rent_45m2_normal = []
eatout_lunch = []
eatout_dinner = []
taxi_5mi = []
gas_liter = []
pubtrans_monthly = []
internet_monthly = []
TV_40in = []
cappuccino = []
mobile_wifi_128gb = []

list_n_prices = []
list_n_people = []

list_COL_fam4 = []
list_COL_sin1 = []

# Lists of features for looping

list_feature_texts = ['Monthly rent for 85 m2 (900 sqft) furnished accommodation in expensive area',
                 'Monthly rent for 85 m2 (900 sqft) furnished accommodation in normal area',
                 'Monthly rent for a 45 m2 (480 sqft) furnished studio in expensive area',
                 'Monthly rent for a 45 m2 (480 sqft) furnished studio in normal area',
                 'Basic lunchtime menu (including a drink) in the business district',
                 'Basic dinner out for two in neighborhood pub',
                 'Taxi trip on a business day, basic tariff, 8 km. (5 miles)',
                 '1 liter (1/4 gallon) of gas',
                 'Monthly ticket public transport',
                 'Internet 8 mbps (1 month)',
                 '40” flat screen tv',                     # special quotation mark! (”)
                 'Cappuccino in expat area of the city',
                 'Ipad wi-fi 128gb']

list_feature_names = ['rent_85m2_expens',
                      'rent_85m2_normal',
                      'rent_45m2_expens',
                      'rent_45m2_normal',
                      'eatout_lunch',
                      'eatout_dinner',
                      'taxi_5mi',
                      'gas_liter',
                      'pubtrans_monthly',
                      'internet_monthly',
                      'TV_40in',
                      'cappuccino',
                      'mobile_wifi_128gb']


list_of_lists = [rent_85m2_expens,
                      rent_85m2_normal,
                      rent_45m2_expens,
                      rent_45m2_normal,
                      eatout_lunch,
                      eatout_dinner,
                      taxi_5mi,
                      gas_liter,
                      pubtrans_monthly,
                      internet_monthly,
                      TV_40in,
                      cappuccino,
                      mobile_wifi_128gb]

list_of_urls = list(df_clean["URL"])

In [15]:
import random
from fake_useragent import UserAgent

for url in list_of_urls:
    
    new_url = url + '?currency=USD'  # generate page version with USD
    
    ua = UserAgent()
    user_agent = {'User-agent': ua.random}                       # new user_agent every iteration
    
    response = requests.get(new_url, headers = user_agent)                       
    page = response.text
    soup_usd = BeautifulSoup(page, "lxml")
    
    # 1. Get price features
    
    for i, feature in enumerate(list_feature_texts):
        chunk1 = soup_usd.find('a', text=feature)
        feature_value = chunk1.findNext().findNext().text[2:-2]
        # if feature_value[0] != '$':                              # causes errors in countries 
        #    feature_value = 'Null'
        
        list_of_lists[i].append(feature_value)
    
    # 2. Get cost of living estimates
    
    if soup_usd.find_all('div', class_='cost-of-living-summary centered right-widget'):    # Summary window must be present
        chunk2 = soup_usd.find('div', class_='cost-of-living-summary centered right-widget')
        COL_fam4_raw = chunk2.find_all('span', class_='price')[0].text
        COL_fam4 = COL_to_float(COL_fam4_raw)
        COL_sin1_raw = chunk2.find_all('span', class_='price')[1].text
        COL_sin1 = COL_to_float(COL_sin1_raw)
    else:
        COL_fam4, COL_sin1 = 'NO_COL', 'NO_COL'
    
    list_COL_sin1.append(COL_sin1)
    list_COL_fam4.append(COL_fam4)
    
    # 3. Get contribution info
    
    chunk3 = soup_usd.find_all('div', class_='accuracy-report')[-1].text       # evaluation always last on page
    sentences = chunk3.split('.')                                              # split into sentences
    sentence_raw = sentences[2].strip()                                        # select 3rd sentence in accuracy-report
    sentence = sentence_raw.replace('less than ', '-')                         # flag cities with scarce data as negative for now
    words = sentence.split(' ')
    no_prices, no_people = words[4], words[8]
    
    list_n_prices.append(no_prices)                               
    list_n_people.append(no_people)
    
    # 4. Pause like a human
    
    wait = .5 + 10 * random.random()
    time.sleep(wait)
    print(f'{url[41:]}, {no_prices}, ${COL_sin1}, waited {wait:0.4} sec.')

/kabul, 206, $619.0, waited 5.96 sec.
/mariehamn, -10, $1930.0, waited 2.812 sec.
/durres, -10, $990.0, waited 6.723 sec.
/elbasan, 31, $624.0, waited 7.856 sec.
/korce, -10, $1877.0, waited 8.016 sec.
/shkoder, -10, $583.0, waited 9.224 sec.
/tirana, 222, $905.0, waited 8.006 sec.
/vlore, -10, $577.0, waited 1.446 sec.
/ain-temouchentain-temouchent, -10, $NO_COL, waited 1.697 sec.
/algiers, 304, $951.0, waited 0.555 sec.
/batna-city, -10, $440.0, waited 7.365 sec.
/bechar, -10, $447.0, waited 3.96 sec.
/bejaia, -10, $NO_COL, waited 2.345 sec.
/constantine, -10, $538.0, waited 3.511 sec.
/drean, -10, $NO_COL, waited 1.67 sec.
/el-eulma, -10, $NO_COL, waited 7.176 sec.
/khemis-miliana, -10, $689.0, waited 9.36 sec.
/laghouat, -10, $NO_COL, waited 2.592 sec.
/medea, -10, $NO_COL, waited 8.944 sec.
/metlili-chaamba, -10, $835.0, waited 1.737 sec.
/oran, 81, $708.0, waited 3.544 sec.
/oum-el-bouaghi, -10, $NO_COL, waited 6.533 sec.
/sidi-bel-abbes, -10, $NO_COL, waited 6.001 sec.
/skikda, 

/divinopolis, 14, $602.0, waited 7.136 sec.
/duque-de-caxias, 95, $657.0, waited 3.253 sec.
/feira-de-santana, 38, $463.0, waited 3.277 sec.
/florianopolis, 301, $927.0, waited 3.489 sec.
/fortaleza, 475, $692.0, waited 2.183 sec.
/foz-do-iguacu, 93, $573.0, waited 1.473 sec.
/franca, 55, $470.0, waited 8.774 sec.
/goiania, 309, $725.0, waited 3.923 sec.
/governador-valadares, 172, $502.0, waited 4.015 sec.
/gramado, 32, $786.0, waited 4.366 sec.
/guaruja, -10, $682.0, waited 2.264 sec.
/guarulhos, 18, $691.0, waited 4.904 sec.
/imperatriz, 51, $761.0, waited 8.849 sec.
/ipatinga, 33, $451.0, waited 8.677 sec.
/itabuna, -10, $575.0, waited 7.67 sec.
/itajuba, -10, $517.0, waited 7.138 sec.
/jaboatao-brazil, 27, $NO_COL, waited 7.733 sec.
/jaboatao-dos-guararapes, -10, $NO_COL, waited 8.233 sec.
/joinville, 311, $621.0, waited 6.954 sec.
/joao-pessoa, 165, $670.0, waited 5.888 sec.
/juazeiro-do-norte, -10, $564.0, waited 6.886 sec.
/juiz-de-fora, 149, $543.0, waited 7.808 sec.
/jundiai,

/changzhou, -10, $1949.0, waited 8.867 sec.
/chaozhou, -10, $NO_COL, waited 8.594 sec.
/chengde, -10, $NO_COL, waited 3.046 sec.
/chengdu, 147, $1079.0, waited 4.541 sec.
/chifeng, -10, $NO_COL, waited 9.043 sec.
/chongqing, 17, $926.0, waited 9.085 sec.
/dalian, -10, $960.0, waited 5.449 sec.
/datong, -10, $NO_COL, waited 4.347 sec.
/dezhou, -10, $671.0, waited 7.579 sec.
/dongguan, 26, $984.0, waited 7.554 sec.
/foshan, -10, $812.0, waited 6.577 sec.
/fushun, -10, $630.0, waited 1.101 sec.
/fuzhou, -10, $972.0, waited 4.729 sec.
/guangzhou, 87, $1626.0, waited 8.687 sec.
/guilin, -10, $711.0, waited 3.514 sec.
/guiyang, 18, $782.0, waited 5.448 sec.
/haikou, -10, $NO_COL, waited 4.968 sec.
/hangzhou, -10, $1412.0, waited 9.931 sec.
/harbin, 25, $1023.0, waited 7.193 sec.
/hefei, 59, $797.0, waited 9.373 sec.
/hengyang, -10, $NO_COL, waited 4.183 sec.
/heze, 89, $NO_COL, waited 3.315 sec.
/hohhot, -10, $902.0, waited 3.323 sec.
/huizhou, -10, $726.0, waited 6.511 sec.
/jiangmen, -10, 

/bonao, -10, $NO_COL, waited 5.911 sec.
/juan-dolio, -10, $NO_COL, waited 5.984 sec.
/las-guaranas, -10, $NO_COL, waited 9.203 sec.
/salvaleon-de-higuey, 60, $733.0, waited 1.138 sec.
/san-juan-de-la-maguana, 83, $681.0, waited 3.874 sec.
/santiago-de-los-caballeros, 104, $824.0, waited 7.652 sec.
/santo-domingo, 500, $959.0, waited 7.118 sec.
/dili, -10, $2793.0, waited 2.52 sec.
/babahoyo, 10, $NO_COL, waited 9.969 sec.
/cuenca, 335, $893.0, waited 8.449 sec.


IndexError: string index out of range

In [30]:
print(rent_85m2_expens[-4], mobile_wifi_128gb[-4])

Null $694


In [25]:
# Should all be same length
print(len(rent_85m2_expens), len(mobile_wifi_128gb), len(list_COL_sin1), len(list_n_prices))
# Should be length of URL list (2475)
# len(mobile_wifi_128gb)

581 580 580 580


In [None]:
# Concatenate into full COL dataset

list_tuples2 = list(zip(rent_85m2_expens,
                      rent_85m2_normal,
                      rent_45m2_expens,
                      rent_45m2_normal,
                      eatout_lunch,
                      eatout_dinner,
                      taxi_5mi,
                      gas_liter,
                      pubtrans_monthly,
                      internet_monthly,
                      TV_40in,
                      cappuccino,
                      mobile_wifi_128gb))

list_tuples3 = list(zip(list_COL_sin1,
                       list_COL_fam4))

list_tuples4 = list(zip(list_n_prices,
                      list_n_people))

df2 = pd.DataFrame(list_tuples2, columns = list_feature_names)
df3 = pd.DataFrame(list_tuples3, columns = ['COL (Family of 4)',
                                            'COL (Single of 1)'])
df4 = pd.DataFrame(list_tuples4, columns = ['No. Prices',
                                            'No. Contributors'])

df_clean2 = pd.concat([df_clean, df2, df3, df4], axis=1)
df_clean2

In [None]:
# Write df to disk

### Selenium starts here

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys

chromedriver = "/Applications/chromedriver" # path to the chromedriver executable
os.environ["webdriver.chrome.driver"] = chromedriver

In [None]:
def currency_to_usd(url):
    chromedriver = "/Applications/chromedriver" 
    os.environ["webdriver.chrome.driver"] = chromedriver
    
    driver = webdriver.Chrome(chromedriver)
    driver.get(url)
    time.sleep(20)
    
    currency_dropdown = driver.find_element_by_id('change_currency')
    currency_dropdown.click()
    time.sleep(2)

    currency_dropdown.send_keys("USD - $ (Dollar)")
    time.sleep(2)

    soup_usd = BeautifulSoup(driver.page_source, 'html.parser')
    return soup_usd

In [None]:
# Raw HTML for navigation (currency unconverted)
# Continue testing with Kabul

driver = webdriver.Chrome(chromedriver)
driver.get('https://www.expatistan.com/cost-of-living/kabul')
time.sleep(2)

currency_dropdown = driver.find_element_by_id('change_currency')
currency_dropdown.click()
time.sleep(2)

currency_dropdown.send_keys("USD - $ (Dollar)")
time.sleep(2)

soup_usd = BeautifulSoup(driver.page_source, 'html.parser')
soup_usd.prettify

In [None]:
# Get cost of living for single person and family of four

chunk = soup_usd.find('div', class_='cost-of-living-summary centered right-widget')
COL_fam4_str = chunk.find_all('span', class_='price')[0].text
COL_sin1_str = chunk.find_all('span', class_='price')[1].text

print(str_COL_fam4, type(str_COL_fam4))

In [None]:
# Features

chunk = soup_usd.find('a', text='Monthly rent for 85 m2 (900 sqft) furnished accommodation in expensive area')
rent_85m2_expensive_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Monthly rent for 85 m2 (900 sqft) furnished accommodation in normal area')
rent_85m2_normal_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Monthly rent for a 45 m2 (480 sqft) furnished studio in expensive area')
rent_45m2_expensive_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Monthly rent for a 45 m2 (480 sqft) furnished studio in normal area')
rent_45m2_normal_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Basic lunchtime menu (including a drink) in the business district')
eatout_lunch_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Basic dinner out for two in neighborhood pub')
eatout_dinner_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Taxi trip on a business day, basic tariff, 8 km. (5 miles)')
taxi_5mi_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='1 liter (1/4 gallon) of gas')
gas_liter_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Monthly ticket public transport')
pubtrans_monthly_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Internet 8 mbps')
internet_monthly_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='40\" flat screen tv')
TV_40in_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Cappucino in expat area of the city')
cappucino_str = chunk.findNext().findNext().text[2:-2]

chunk = soup_usd.find('a', text='Ipad wi-fi 128gb')
mobile_wifi_str = chunk.findNext().findNext().text[2:-2]

In [None]:
# ## Step by step for single city
# # 1. Import HTML from URL as BS object 

# kabul = 'https://www.expatistan.com/cost-of-living/kabul'      # Kabul is df_clean.URL[0]

# response = requests.get(kabul)                              
# # response.status_code

# page = response.text
# soup = BeautifulSoup(page, "lxml")

# # 2. Find corresponding text in BeatifulSoup object

# chunk = soup.find_all('div', class_='accuracy-report')[-1].text       # evaluation always last on page
# sentences = chunk.split('.')                                          # split into sentences
# sentence_raw = sentences[2].strip()                                   # select 3rd sentence in accuracy-report
# sentence = sentence_raw.replace('less than ', '-')                    # flag cities with scarce data as negative for now
# words = sentence.split(' ')
# no_prices, no_people = words[4], words[8]
# contribution_tuple = (no_prices, no_people)

# # 3. Append to corresponding lists

# list_n_prices = []
# list_n_people = []

# n_prices = contribution_tuple[0]
# n_people = contribution_tuple[1]

# list_n_prices.append(n_prices)
# list_n_people.append(n_people)

# print(list_n_prices, list_n_people)    # should be ['206'] ['13'] for Kabul

In [None]:
# # Define function for getting numbers of contributions/contributors.

# import random
# from fake_useragent import UserAgent

# def get_contribution_info(url):
    
#     ua = UserAgent()
#     user_agent = {'User-agent': ua.random}                                   # new user_agent every iteration
    
#     response = requests.get(url, headers = user_agent)                       
#     page = response.text
#     soup = BeautifulSoup(page, "lxml")
    
#     if soup.find_all('div', class_='cost-of-living-summary centered right-widget'):    # Summary window must be present
#         chunk = soup.find_all('div', class_='accuracy-report')[-1].text                # evaluation always last on page
#         sentences = chunk.split('.')                                                   # split into sentences
#         sentence_raw = sentences[2].strip()                                            # select 3rd sentence in accuracy-report
#         sentence = sentence_raw.replace('less than ', '-')                             # flag cities with scarce data as negative for now
#         words = sentence.split(' ')
#         no_prices, no_people = words[4], words[8]
#     else:
#         no_prices, no_people = 'ERROR', 'ERROR'                        # flagged to filter out later

#     return no_prices, no_people

In [None]:
# # Scrape contributor info with delay
# # Run overnight

# list_n_prices = []
# list_n_people = []

# for url in list_url2:
    
#     contribution_tuple = get_contribution_info(url)
#     n_prices = contribution_tuple[0]
#     n_people = contribution_tuple[1]
    
#     list_n_prices.append(n_prices)                               
#     list_n_people.append(n_people)
    
#     wait = .5 + 10 * random.random()
#     time.sleep(wait)
#     print(f'{url[41:]}, {n_prices}, {n_people}, waited {wait:0.4} sec.')

6. Collect city attributes from Wikipedia (population, latitude, etc.)

7. Regression analysis