# Obtaining socio-economic data
#### Using pandas read_html to collect table data from https://www.numbeo.com/quality-of-life/in/Berlin

In [1]:
import pandas as pd
import lxml

## 1. Getting city data
### 1.1 Define city list

In [2]:
# Cities to look up on Numbeo, spelled the way they are interpolated into the
# page URL (multi-word names are joined with hyphens). One row per initial letter.
city_list = [
    'Amsterdam', 'Andorra-La-Vella', 'Athens',
    'Belgrade', 'Berlin', 'Bern', 'Bratislava', 'Brussels', 'Bucharest', 'Budapest',
    'Chisinau', 'Copenhagen',
    'Dublin',
    'Helsinki',
    'Kiev',
    'Lisbon', 'Ljubljana', 'London', 'Luxembourg',
    'Madrid', 'Minsk', 'Monaco', 'Moscow',
    'Nicosia', 'Nuuk',
    'Oslo',
    'Paris', 'Podgorica', 'Prague',
    'Reykjavik', 'Riga', 'Rome',
    'San-Marino', 'Sarajevo', 'Skopje', 'Sofia', 'Stockholm',
    'Tallinn', 'Tirana',
    'Vaduz', 'Valletta', 'Vatican-City', 'Vienna', 'Vilnius',
    'Warsaw',
    'Zagreb',
]

### 1.2 Creating a function to obtain data and put it into a dataframe for processing and modelling

In [3]:
# Two-city subset for trying the scraper before running it on the full list.
test_list = ["Vienna", "Berlin"]

In [4]:
def get_city_info(city_list) -> tuple:
    """Scrape the Numbeo quality-of-life table for each city, tolerating failures.

    Parameters
    ----------
    city_list : iterable of str
        City names as they appear in the Numbeo URL
        (``https://www.numbeo.com/quality-of-life/in/<city>``).

    Returns
    -------
    tuple
        ``(df, errors)``. ``df`` is a long-format DataFrame whose scraped
        columns 0, 1 and 2 are renamed to ``category``, ``numeral`` and
        ``level``, plus a ``city`` column. ``errors`` lists the cities whose
        page could not be fetched or parsed. ``df`` is an empty DataFrame when
        no city succeeded.
    """
    tables = []
    errors = []

    # iterate over list of cities to obtain table data from Numbeo
    for city in city_list:
        try:
            # the fourth table on the page is expected to hold the index summary
            table = pd.read_html(f"https://www.numbeo.com/quality-of-life/in/{city}")[3]
            table = (table.assign(city=city)
                          .rename(columns={0: "category", 1: "numeral", 2: "level"})
                          .drop([table.index[8]]))  # row 8 is an empty separator row
            tables.append(table)
        except Exception:
            # Best effort: remember the city and carry on. Catching Exception
            # (not a bare except) keeps KeyboardInterrupt/SystemExit working.
            errors.append(city)

    # DataFrame.append was removed in pandas 2.0; concatenating once at the end
    # also avoids re-copying the accumulated frame on every iteration.
    df = pd.concat(tables, ignore_index=True) if tables else pd.DataFrame()

    return df, errors
 
    

In [5]:
# Trial run on the two-city subset; the cell output is the (DataFrame, errors) tuple.
get_city_info(test_list)

(                          category  numeral      level    city
 0           Purchasing Power Index    83.38   Moderate  Vienna
 1                     Safety Index    75.85       High  Vienna
 2                Health Care Index    78.83       High  Vienna
 3                    Climate Index    81.77  Very High  Vienna
 4             Cost of Living Index    68.69   Moderate  Vienna
 5   Property Price to Income Ratio    13.72       High  Vienna
 6       Traffic Commute Time Index    26.78        Low  Vienna
 7                  Pollution Index    18.15   Very Low  Vienna
 8         ƒ Quality of Life Index:   183.99  Very High  Vienna
 9           Purchasing Power Index    97.17       High  Berlin
 10                    Safety Index    58.92   Moderate  Berlin
 11               Health Care Index    69.68       High  Berlin
 12                   Climate Index    83.35  Very High  Berlin
 13            Cost of Living Index    67.91   Moderate  Berlin
 14  Property Price to Income Ratio     

## 2. Final data-set
### 2.1 Checking for cities not covered by Numbeo

In [6]:
# get_city_info(city_list)

In [7]:
# Cities this section identifies as not covered by Numbeo.
drop_cities = ['San-Marino', 'Vaduz', 'Vatican-City']

# Filter them out of the full list while keeping the original ordering.
update_cities = list(filter(lambda name: name not in drop_cities, city_list))

In [8]:
# Show which cities remain after removing the ones listed in drop_cities.
print(update_cities)

['Amsterdam', 'Andorra-La-Vella', 'Athens', 'Belgrade', 'Berlin', 'Bern', 'Bratislava', 'Brussels', 'Bucharest', 'Budapest', 'Chisinau', 'Copenhagen', 'Dublin', 'Helsinki', 'Kiev', 'Lisbon', 'Ljubljana', 'London', 'Luxembourg', 'Madrid', 'Minsk', 'Monaco', 'Moscow', 'Nicosia', 'Nuuk', 'Oslo', 'Paris', 'Podgorica', 'Prague', 'Reykjavik', 'Riga', 'Rome', 'Sarajevo', 'Skopje', 'Sofia', 'Stockholm', 'Tallinn', 'Tirana', 'Valletta', 'Vienna', 'Vilnius', 'Warsaw', 'Zagreb']


### 2.2 Now rerunning function with updated list

In [9]:
# get_city_info(update_cities)

### 2.3 No errors come up any more, so the function is rewritten to return only the data set

In [10]:
def get_city(city_list) -> pd.DataFrame:
    """Scrape the Numbeo quality-of-life table for each city into one long DataFrame.

    Unlike ``get_city_info`` this version has no error handling (any failure
    propagates) and keeps every scraped row, including the empty separator
    row at position 8.

    Parameters
    ----------
    city_list : iterable of str
        City names as they appear in the Numbeo URL.

    Returns
    -------
    pandas.DataFrame
        Scraped columns 0, 1 and 2 renamed to ``category``, ``numeral`` and
        ``level``, plus a ``city`` column, with a fresh 0..n-1 index. Empty
        when ``city_list`` is empty.
    """
    tables = []

    # iterate over list of cities to obtain table data from Numbeo
    for city in city_list:
        # the fourth table on the page is expected to hold the index summary
        table = pd.read_html(f"https://www.numbeo.com/quality-of-life/in/{city}")[3]
        tables.append(
            table.assign(city=city)
                 .rename(columns={0: "category", 1: "numeral", 2: "level"})
        )

    if not tables:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop.
    return pd.concat(tables, ignore_index=True)

In [11]:
# Trial run on the two-city subset (the `all_cities = ` assignment is left disabled).
get_city(test_list)

Unnamed: 0,category,numeral,level,city
0,Purchasing Power Index,83.38,Moderate,Vienna
1,Safety Index,75.85,High,Vienna
2,Health Care Index,78.83,High,Vienna
3,Climate Index,81.77,Very High,Vienna
4,Cost of Living Index,68.69,Moderate,Vienna
5,Property Price to Income Ratio,13.72,High,Vienna
6,Traffic Commute Time Index,26.78,Low,Vienna
7,Pollution Index,18.15,Very Low,Vienna
8,,,,Vienna
9,ƒ Quality of Life Index:,183.99,Very High,Vienna


In [12]:
# all_cities

### 2.4 Checking values of Index and Unnamed: 1 columns
There seems to be a city whose data was pushed back by one.

In [13]:
# all_cities["Index"].value_counts()

In [14]:
# all_cities["Unnamed: 1"].value_counts()

In [15]:
# all_cities.loc[all_cities["Unnamed: 1"] == 209.07]

The data for the city of Bern seems to have been scraped incorrectly. Let's check what data Numbeo actually has available. 

In [16]:
# pd.read_html(f"https://www.numbeo.com/quality-of-life/in/Bern")[3]

Perhaps we are reading the wrong table from Numbeo, let's try the others.

In [17]:
# pd.read_html(f"https://www.numbeo.com/quality-of-life/in/Bern")[4]

It seems there is not enough user-contributed data for Bern: the page carries an extra notice table in the HTML,
which pushes the data table to index position 4. We'll drop Bern, as it does not have enough data.

In [18]:
# Exclude Bern: its data table is not at the index position the scraper reads.
# Filtering instead of list.remove keeps this cell safe to re-run; a second
# run no longer raises ValueError because "Bern" is already gone.
update_cities = [city for city in update_cities if city != "Bern"]

In [19]:
# Confirm that "Bern" is no longer in the list.
print(update_cities)

['Amsterdam', 'Andorra-La-Vella', 'Athens', 'Belgrade', 'Berlin', 'Bratislava', 'Brussels', 'Bucharest', 'Budapest', 'Chisinau', 'Copenhagen', 'Dublin', 'Helsinki', 'Kiev', 'Lisbon', 'Ljubljana', 'London', 'Luxembourg', 'Madrid', 'Minsk', 'Monaco', 'Moscow', 'Nicosia', 'Nuuk', 'Oslo', 'Paris', 'Podgorica', 'Prague', 'Reykjavik', 'Riga', 'Rome', 'Sarajevo', 'Skopje', 'Sofia', 'Stockholm', 'Tallinn', 'Tirana', 'Valletta', 'Vienna', 'Vilnius', 'Warsaw', 'Zagreb']


In [20]:
# final_cities = get_city(update_cities)

In [21]:
def get_city(city_list) -> pd.DataFrame:
    """Scrape the Numbeo quality-of-life table for each city into one long DataFrame.

    No error handling is done here: any failure while fetching or parsing a
    page propagates to the caller.

    Parameters
    ----------
    city_list : iterable of str
        City names as they appear in the Numbeo URL.

    Returns
    -------
    pandas.DataFrame
        Scraped columns 0, 1 and 2 renamed to ``category``, ``numeral`` and
        ``level``, plus a ``city`` column, with a fresh 0..n-1 index. The
        empty separator row at position 8 of each city's table is removed.
        Empty when ``city_list`` is empty.
    """
    tables = []

    # iterate over list of cities to obtain table data from Numbeo
    for city in city_list:
        # the fourth table on the page is expected to hold the index summary
        table = pd.read_html(f"https://www.numbeo.com/quality-of-life/in/{city}")[3]
        tables.append(
            table.assign(city=city)
                 .rename(columns={0: "category", 1: "numeral", 2: "level"})
                 # removing index position 8, because it refers to a border
                 .drop([table.index[8]])
        )

    if not tables:
        return pd.DataFrame()

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop.
    return pd.concat(tables, ignore_index=True)

In [22]:
# Scrape every remaining city; get_city reads one Numbeo page per list entry.
final_cities = get_city(update_cities)

In [23]:
# Inspect the first rows of the long-format result (one row per city and category).
final_cities.head(20)

Unnamed: 0,category,numeral,level,city
0,Purchasing Power Index,80.9,Moderate,Amsterdam
1,Safety Index,67.32,High,Amsterdam
2,Health Care Index,69.45,High,Amsterdam
3,Climate Index,87.45,Very High,Amsterdam
4,Cost of Living Index,84.26,Moderate,Amsterdam
5,Property Price to Income Ratio,10.98,Moderate,Amsterdam
6,Traffic Commute Time Index,29.88,Low,Amsterdam
7,Pollution Index,30.79,Low,Amsterdam
8,ƒ Quality of Life Index:,168.08,Very High,Amsterdam
9,Purchasing Power Index,81.72,Moderate,Andorra-La-Vella


In [24]:
def pivot_cities(df):
    """Reshape the long per-category table into one row per city.

    ``df`` needs the columns ``city``, ``category``, ``numeral`` and ``level``.
    The pivoted columns are relabelled purely by position: first the nine
    ``numeral`` columns, then the nine matching ``<metric>_level`` columns.
    The result therefore relies on the pivot producing its columns in exactly
    the order of the names listed below.

    Returns a new DataFrame with ``city`` as a regular column; ``df`` itself
    is not modified.
    """
    wide = df.pivot(index="city", columns="category", values=["numeral", "level"])

    metrics = ('climate', 'cost_of_living', 'health_care', 'pollution',
               'property_income_ratio', 'purchasing_power', 'safety',
               'traffic_time', 'quality_of_life')
    # numeric values first, then the textual level for each metric
    wide.columns = [*metrics, *(f"{metric}_level" for metric in metrics)]

    return wide.reset_index()

In [25]:
# NOTE(review): this rebinds final_cities from the long to the wide layout, so
# re-running the cell fails: pivot_cities needs the "category" column, which
# the wide frame no longer has. Re-run the scraping cell first.
final_cities = pivot_cities(final_cities)

final_cities.head(20)

Unnamed: 0,city,climate,cost_of_living,health_care,pollution,property_income_ratio,purchasing_power,safety,traffic_time,quality_of_life,climate_level,cost_of_living_level,health_care_level,pollution_level,property_income_ratio_level,purchasing_power_level,safety_level,traffic_time_level,quality_of_life_level
0,Amsterdam,87.45,84.26,69.45,30.79,10.98,80.9,67.32,29.88,168.08,Very High,Moderate,High,Low,Moderate,Moderate,High,Low,Very High
1,Andorra-La-Vella,?,66.75,69.44,64.08,7.44,81.72,87.16,5.0,?,,Moderate,High,High,Low,Moderate,Very High,Very Low,
2,Athens,95.18,59.56,56.17,57.3,12.74,40.22,50.5,37.98,119.64,Very High,Low,Moderate,Moderate,High,Very Low,Moderate,Moderate,High
3,Belgrade,84.14,40.53,53.72,63.57,22.22,34.52,62.02,35.89,107.76,Very High,Very Low,Moderate,High,Very High,Very Low,High,Moderate,Moderate
4,Berlin,83.35,67.91,69.68,39.45,9.59,97.17,58.92,34.18,164.21,Very High,Moderate,High,Low,Moderate,High,Moderate,Low,Very High
5,Bratislava,80.72,50.85,57.17,41.12,13.37,61.26,68.68,30.89,147.31,Very High,Low,Moderate,Moderate,High,Low,High,Low,High
6,Brussels,83.85,74.91,74.5,62.36,6.38,87.24,48.28,37.17,142.75,Very High,Moderate,High,High,Low,Moderate,Moderate,Moderate,High
7,Bucharest,75.61,41.06,54.34,75.53,10.54,50.77,72.64,42.3,117.42,High,Very Low,Moderate,High,Moderate,Low,High,Moderate,High
8,Budapest,78.41,43.94,47.7,54.38,15.93,48.37,63.82,39.13,120.33,High,Very Low,Moderate,Moderate,High,Low,High,Moderate,High
9,Chisinau,76.91,34.82,51.93,62.24,13.26,25.83,54.82,28.0,111.91,High,Very Low,Moderate,High,High,Very Low,Moderate,Low,High


## 3. Data cleaning on final table

### 3.1 Replacing the '?' placeholders with NaN

In [27]:
import numpy as np

In [28]:
# DataFrame.replace returns a new frame, so the result must be assigned back;
# otherwise the "?" placeholders are still present when the table is exported.
# NOTE(review): the affected columns may still be object dtype afterwards —
# confirm before modelling.
final_cities = final_cities.replace("?", np.nan)
final_cities

Unnamed: 0,city,climate,cost_of_living,health_care,pollution,property_income_ratio,purchasing_power,safety,traffic_time,quality_of_life,climate_level,cost_of_living_level,health_care_level,pollution_level,property_income_ratio_level,purchasing_power_level,safety_level,traffic_time_level,quality_of_life_level
0,Amsterdam,87.45,84.26,69.45,30.79,10.98,80.9,67.32,29.88,168.08,Very High,Moderate,High,Low,Moderate,Moderate,High,Low,Very High
1,Andorra-La-Vella,,66.75,69.44,64.08,7.44,81.72,87.16,5.0,,,Moderate,High,High,Low,Moderate,Very High,Very Low,
2,Athens,95.18,59.56,56.17,57.3,12.74,40.22,50.5,37.98,119.64,Very High,Low,Moderate,Moderate,High,Very Low,Moderate,Moderate,High
3,Belgrade,84.14,40.53,53.72,63.57,22.22,34.52,62.02,35.89,107.76,Very High,Very Low,Moderate,High,Very High,Very Low,High,Moderate,Moderate
4,Berlin,83.35,67.91,69.68,39.45,9.59,97.17,58.92,34.18,164.21,Very High,Moderate,High,Low,Moderate,High,Moderate,Low,Very High
5,Bratislava,80.72,50.85,57.17,41.12,13.37,61.26,68.68,30.89,147.31,Very High,Low,Moderate,Moderate,High,Low,High,Low,High
6,Brussels,83.85,74.91,74.5,62.36,6.38,87.24,48.28,37.17,142.75,Very High,Moderate,High,High,Low,Moderate,Moderate,Moderate,High
7,Bucharest,75.61,41.06,54.34,75.53,10.54,50.77,72.64,42.3,117.42,High,Very Low,Moderate,High,Moderate,Low,High,Moderate,High
8,Budapest,78.41,43.94,47.7,54.38,15.93,48.37,63.82,39.13,120.33,High,Very Low,Moderate,Moderate,High,Low,High,Moderate,High
9,Chisinau,76.91,34.82,51.93,62.24,13.26,25.83,54.82,28.0,111.91,High,Very Low,Moderate,High,High,Very Low,Moderate,Low,High


## 4. Exporting to json format

In [30]:
# Write the wide per-city table as JSON; the path is relative to the working directory.
final_cities.to_json("../data/Socio_economic_data.json")