# Obtaining socio-economic data
#### Using pandas read_html to collect table data from https://www.numbeo.com/quality-of-life/in/Berlin

In [1]:
import pandas as pd
import lxml

## 1. Getting city data
### 1.1 Define list of the European capital cities

In [2]:
# European capitals, spelled the way they are interpolated into the Numbeo URL
# (hyphens instead of spaces). One line per initial letter.
city_list = [
    'Amsterdam', 'Andorra-La-Vella', 'Athens',
    'Belgrade', 'Berlin', 'Bern', 'Bratislava', 'Brussels', 'Bucharest', 'Budapest',
    'Chisinau', 'Copenhagen',
    'Dublin',
    'Helsinki',
    'Kiev',
    'Lisbon', 'Ljubljana', 'London', 'Luxembourg',
    'Madrid', 'Minsk', 'Monaco', 'Moscow',
    'Nicosia', 'Nuuk',
    'Oslo',
    'Paris', 'Podgorica', 'Prague',
    'Reykjavik', 'Riga', 'Rome',
    'San-Marino', 'Sarajevo', 'Skopje', 'Sofia', 'Stockholm',
    'Tallinn', 'Tirana',
    'Vaduz', 'Valletta', 'Vatican-City', 'Vienna', 'Vilnius',
    'Warsaw',
    'Zagreb',
]

### 1.2 Creating a function to obtain data and put it into a dataframe for processing and modelling

In [3]:
# Two-city sample for trying out the scraper without requesting every page
test_list = ['Vienna', 'Berlin']

In [4]:
def get_city_info(city_list) -> tuple:
    """Scrape the Numbeo quality-of-life table for each city and report failures.

    For every name in ``city_list`` the page
    ``https://www.numbeo.com/quality-of-life/in/<city>`` is parsed with
    ``pd.read_html`` and the table at position 3 is kept. Its columns 0, 1
    and 2 are renamed to ``category``, ``numeral`` and ``level``, a ``city``
    column is added, and the row at position 8 (an empty separator row) is
    dropped.

    Parameters
    ----------
    city_list : iterable of str
        City names exactly as they appear in the Numbeo URL.

    Returns
    -------
    tuple of (pandas.DataFrame, list of str)
        The stacked tables of all cities that could be scraped (fresh
        0..n-1 index; an empty frame if none succeeded) and the names of
        the cities for which fetching or reshaping raised an exception.
    """
    tables = []
    errors = []

    for city in city_list:
        try:
            table = pd.read_html(f'https://www.numbeo.com/quality-of-life/in/{city}')[3]
            table = (table.assign(city=city)
                          .rename(columns={0: 'category', 1: 'numeral', 2: 'level'})
                          .drop([table.index[8]]))
        except Exception:
            # Best effort: remember the city and carry on. ``Exception`` (not a
            # bare ``except``) lets KeyboardInterrupt/SystemExit through.
            errors.append(city)
        else:
            tables.append(table)

    # pd.concat raises on an empty list, so handle "nothing scraped" explicitly.
    if not tables:
        return pd.DataFrame(), errors

    # One concat at the end replaces the removed DataFrame.append and avoids
    # re-copying the growing frame on every iteration.
    return pd.concat(tables, ignore_index=True), errors
 
    

In [5]:
# Trial run on the two-city sample; displays the (DataFrame, errors) tuple
get_city_info(test_list)

(                          category  numeral      level    city
 0           Purchasing Power Index    87.18   Moderate  Vienna
 1                     Safety Index    75.85       High  Vienna
 2                Health Care Index    78.83       High  Vienna
 3                    Climate Index    81.77  Very High  Vienna
 4             Cost of Living Index    69.54   Moderate  Vienna
 5   Property Price to Income Ratio    13.72       High  Vienna
 6       Traffic Commute Time Index    26.78        Low  Vienna
 7                  Pollution Index    18.15   Very Low  Vienna
 8         ƒ Quality of Life Index:   185.43  Very High  Vienna
 9           Purchasing Power Index   102.27       High  Berlin
 10                    Safety Index    59.10   Moderate  Berlin
 11               Health Care Index    69.94       High  Berlin
 12                   Climate Index    83.35  Very High  Berlin
 13            Cost of Living Index    69.03   Moderate  Berlin
 14  Property Price to Income Ratio     

## 2. Final data-set
### 2.1 Checking for cities not covered by Numbeo

In [6]:
#get_city_info(city_list)

**After running the function on the whole list, San Marino, Vaduz and Vatican City appeared in the error list.**

In [7]:
# Cities that ended up in the scraper's error list
drop_cities = ['San-Marino', 'Vaduz', 'Vatican-City']

# Everything from the original list except the cities above, order preserved
update_cities = [name for name in city_list if name not in drop_cities]

In [8]:
# Show the city names that remain after removing the failed ones
print(update_cities)

['Amsterdam', 'Andorra-La-Vella', 'Athens', 'Belgrade', 'Berlin', 'Bern', 'Bratislava', 'Brussels', 'Bucharest', 'Budapest', 'Chisinau', 'Copenhagen', 'Dublin', 'Helsinki', 'Kiev', 'Lisbon', 'Ljubljana', 'London', 'Luxembourg', 'Madrid', 'Minsk', 'Monaco', 'Moscow', 'Nicosia', 'Nuuk', 'Oslo', 'Paris', 'Podgorica', 'Prague', 'Reykjavik', 'Riga', 'Rome', 'Sarajevo', 'Skopje', 'Sofia', 'Stockholm', 'Tallinn', 'Tirana', 'Valletta', 'Vienna', 'Vilnius', 'Warsaw', 'Zagreb']


### 2.2 Now rerunning function with updated list

In [9]:
# get_city_info(update_cities)

### 2.3 No errors come up any more; now rewrite the function so that it returns only the data set

In [10]:
def get_city(city_list) -> pd.DataFrame:
    """Scrape the Numbeo quality-of-life table for each city into one long frame.

    For every name in ``city_list`` the page
    ``https://www.numbeo.com/quality-of-life/in/<city>`` is parsed with
    ``pd.read_html`` and the table at position 3 is kept. Its columns 0, 1
    and 2 are renamed to ``category``, ``numeral`` and ``level`` and a
    ``city`` column is added.

    Note: this version keeps the table exactly as scraped, so the empty
    separator row at position 8 is NOT removed. There is no error handling
    either; any failure while fetching or parsing propagates to the caller.

    Parameters
    ----------
    city_list : iterable of str
        City names exactly as they appear in the Numbeo URL.

    Returns
    -------
    pandas.DataFrame
        All city tables stacked on top of each other with a fresh 0..n-1
        index; an empty frame when ``city_list`` is empty.
    """
    tables = []

    for city in city_list:
        table = pd.read_html(f'https://www.numbeo.com/quality-of-life/in/{city}')[3]
        tables.append(
            table.assign(city=city)
                 .rename(columns={0: 'category', 1: 'numeral', 2: 'level'})
        )

    # pd.concat raises on an empty list, so handle "no cities" explicitly.
    if not tables:
        return pd.DataFrame()

    # A single concat replaces the removed DataFrame.append + reset_index.
    return pd.concat(tables, ignore_index=True)

In [11]:
# Trial run on the two-city sample; prefix with `all_cities = ` to keep the result
get_city(test_list)

Unnamed: 0,category,numeral,level,city
0,Purchasing Power Index,87.18,Moderate,Vienna
1,Safety Index,75.85,High,Vienna
2,Health Care Index,78.83,High,Vienna
3,Climate Index,81.77,Very High,Vienna
4,Cost of Living Index,69.54,Moderate,Vienna
5,Property Price to Income Ratio,13.72,High,Vienna
6,Traffic Commute Time Index,26.78,Low,Vienna
7,Pollution Index,18.15,Very Low,Vienna
8,,,,Vienna
9,ƒ Quality of Life Index:,185.43,Very High,Vienna


In [12]:
# all_cities

### 2.4 Checking values of Index and Unnamed: 1 columns
One city's values seem to have ended up in the extra `Index` and `Unnamed: 1` columns, as if its table had been shifted by one position.

In [13]:
# all_cities['Index'].value_counts()

In [14]:
# all_cities['Unnamed: 1'].value_counts()

In [15]:
# all_cities.loc[all_cities['Unnamed: 1'] == 209.07]

The data for the city of Bern seems to have been scraped incorrectly. Let's check what data Numbeo actually has available. 

In [16]:
# Table at position 3 of the Bern page, i.e. the one the scraper reads
pd.read_html('https://www.numbeo.com/quality-of-life/in/Bern')[3]

Unnamed: 0,Index,Unnamed: 1
0,Quality of Life Index:,211.54


Perhaps we are reading the wrong table from Numbeo. Let's try the others.

In [17]:
# Next table on the Bern page (position 4)
pd.read_html('https://www.numbeo.com/quality-of-life/in/Bern')[4]

Unnamed: 0,0,1,2
0,Purchasing Power Index,130.37,Very High
1,Safety Index,80.61,Very High
2,Health Care Index,74.76,High
3,Climate Index,75.97,High
4,Cost of Living Index,117.95,Very High
5,Property Price to Income Ratio,7.05,Low
6,Traffic Commute Time Index,19.77,Very Low
7,Pollution Index,11.11,Very Low
8,,,
9,ƒ Quality of Life Index:,211.54,Very High


It seems that Numbeo does not have enough user contributions for Bern, so the HTML contains an extra notice table
that pushes the data table to index position 4. We will remove Bern from the list and add it manually later.

In [18]:
# Rebuild the list instead of calling .remove(): re-running this cell is then
# harmless, whereas a second .remove('Bern') would raise ValueError.
update_cities = [city for city in update_cities if city != 'Bern']

In [19]:
# Confirm that Bern is no longer in the list
print(update_cities)

['Amsterdam', 'Andorra-La-Vella', 'Athens', 'Belgrade', 'Berlin', 'Bratislava', 'Brussels', 'Bucharest', 'Budapest', 'Chisinau', 'Copenhagen', 'Dublin', 'Helsinki', 'Kiev', 'Lisbon', 'Ljubljana', 'London', 'Luxembourg', 'Madrid', 'Minsk', 'Monaco', 'Moscow', 'Nicosia', 'Nuuk', 'Oslo', 'Paris', 'Podgorica', 'Prague', 'Reykjavik', 'Riga', 'Rome', 'Sarajevo', 'Skopje', 'Sofia', 'Stockholm', 'Tallinn', 'Tirana', 'Valletta', 'Vienna', 'Vilnius', 'Warsaw', 'Zagreb']


In [20]:
#final_cities = get_city(update_cities)

In [21]:
def get_city(city_list, table_index=3) -> pd.DataFrame:
    """Scrape the Numbeo quality-of-life table for each city into one long frame.

    For every name in ``city_list`` the page
    ``https://www.numbeo.com/quality-of-life/in/<city>`` is parsed with
    ``pd.read_html`` and the table at position ``table_index`` is kept. Its
    columns 0, 1 and 2 are renamed to ``category``, ``numeral`` and
    ``level``, a ``city`` column is added, and the row at position 8 (an
    empty separator row) is dropped.

    Parameters
    ----------
    city_list : iterable of str
        City names exactly as they appear in the Numbeo URL.
    table_index : int, default 3
        Position of the data table in the list returned by ``pd.read_html``.
        Pages with an extra table in front of it (Bern in this notebook)
        need 4.

    Returns
    -------
    pandas.DataFrame
        All city tables stacked on top of each other with a fresh 0..n-1
        index; an empty frame when ``city_list`` is empty.

    Raises
    ------
    IndexError
        If a page has no table at ``table_index`` or that table has fewer
        than nine rows, which indicates that the wrong table was selected.
    """
    tables = []

    for city in city_list:
        table = pd.read_html(f'https://www.numbeo.com/quality-of-life/in/{city}')[table_index]

        # Same exception type as the bare ``table.index[8]`` lookup would
        # raise, but with a message that says which city is affected.
        if len(table) <= 8:
            raise IndexError(
                f'Table {table_index} for {city!r} has only {len(table)} row(s); '
                'the data table is probably at a different position.'
            )

        table = (table.assign(city=city)
                      .rename(columns={0: 'category', 1: 'numeral', 2: 'level'})
                      .drop([table.index[8]]))
        tables.append(table)

    # pd.concat raises on an empty list, so handle "no cities" explicitly.
    if not tables:
        return pd.DataFrame()

    # A single concat replaces the removed DataFrame.append + reset_index.
    return pd.concat(tables, ignore_index=True)

In [22]:
# Full scrape: get_city issues one read_html request per remaining city
final_cities = get_city(update_cities)

In [23]:
# Preview the long-format result (several rows per city)
final_cities.head(20)

Unnamed: 0,category,numeral,level,city
0,Purchasing Power Index,83.85,Moderate,Amsterdam
1,Safety Index,67.13,High,Amsterdam
2,Health Care Index,69.45,High,Amsterdam
3,Climate Index,87.45,Very High,Amsterdam
4,Cost of Living Index,85.61,Moderate,Amsterdam
5,Property Price to Income Ratio,11.03,Moderate,Amsterdam
6,Traffic Commute Time Index,29.88,Low,Amsterdam
7,Pollution Index,30.79,Low,Amsterdam
8,ƒ Quality of Life Index:,168.98,Very High,Amsterdam
9,Purchasing Power Index,86.69,Moderate,Andorra-La-Vella


**Add Bern manually**

In [24]:
# Bern's data table sits at position 4 instead of 3, so it is scraped separately
bern = pd.read_html('https://www.numbeo.com/quality-of-life/in/Bern')[4]
bern = (
    bern
    .drop([bern.index[8]])  # row 8 is the empty separator row
    .rename(columns={0: 'category', 1: 'numeral', 2: 'level'})
    .assign(city='Bern')
)

bern

Unnamed: 0,category,numeral,level,city
0,Purchasing Power Index,130.37,Very High,Bern
1,Safety Index,80.61,Very High,Bern
2,Health Care Index,74.76,High,Bern
3,Climate Index,75.97,High,Bern
4,Cost of Living Index,117.95,Very High,Bern
5,Property Price to Income Ratio,7.05,Low,Bern
6,Traffic Commute Time Index,19.77,Very Low,Bern
7,Pollution Index,11.11,Very Low,Bern
9,ƒ Quality of Life Index:,211.54,Very High,Bern


In [25]:
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
# Any Bern rows already present are dropped first, so re-running this cell
# cannot duplicate the city (duplicates would make the later pivot fail).
final_cities = pd.concat(
    [final_cities[final_cities['city'] != 'Bern'], bern],
    ignore_index=True,
)

final_cities[final_cities['city'] == 'Bern']

Unnamed: 0,category,numeral,level,city
378,Purchasing Power Index,130.37,Very High,Bern
379,Safety Index,80.61,Very High,Bern
380,Health Care Index,74.76,High,Bern
381,Climate Index,75.97,High,Bern
382,Cost of Living Index,117.95,Very High,Bern
383,Property Price to Income Ratio,7.05,Low,Bern
384,Traffic Commute Time Index,19.77,Very Low,Bern
385,Pollution Index,11.11,Very Low,Bern
386,ƒ Quality of Life Index:,211.54,Very High,Bern


### 2.5 Now to pivot the table so each city only has one row

In [26]:
def pivot_cities(df):
    """Reshape the long city table into one row per city.

    Each scraped category becomes two columns: the numeric value under a
    short snake_case name and its textual rating under ``<name>_level``.
    Names are assigned through an explicit label-to-name mapping rather than
    by column position, so the result does not depend on how the raw
    category labels happen to sort.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with the columns ``city``, ``category``,
        ``numeral`` and ``level`` (one row per city and category).

    Returns
    -------
    pandas.DataFrame
        ``city`` followed by the nine value columns and then the nine
        ``*_level`` columns, one row per city.

    Raises
    ------
    ValueError
        If the categories in ``df`` are not exactly the nine expected ones,
        or (raised by ``DataFrame.pivot``) if a city/category pair occurs
        more than once.
    """
    # (normalised category label, output column name), in output order
    name_pairs = (
        ('climate index', 'climate'),
        ('cost of living index', 'cost_of_living'),
        ('health care index', 'health_care'),
        ('pollution index', 'pollution'),
        ('property price to income ratio', 'property_income_ratio'),
        ('purchasing power index', 'purchasing_power'),
        ('safety index', 'safety'),
        ('traffic commute time index', 'traffic_time'),
        ('quality of life index', 'quality_of_life'),
    )
    short_names = dict(name_pairs)

    def _normalise(label):
        # Reduce a raw label to lower-case words; the summary row is scraped
        # with decoration ("ƒ Quality of Life Index:") that must not matter.
        lowered = str(label).lower()
        letters_only = ''.join(ch if 'a' <= ch <= 'z' else ' ' for ch in lowered)
        return ' '.join(letters_only.split())

    wide = df.pivot(index='city', columns='category', values=['numeral', 'level'])

    # Fail loudly when the scraped categories differ from the expected set,
    # instead of silently attaching names to the wrong columns.
    raw_categories = {category for _, category in wide.columns}
    found = sorted(_normalise(category) for category in raw_categories)
    if found != sorted(short_names):
        raise ValueError(
            f'Expected categories {sorted(short_names)}, found {found}'
        )

    suffixes = {'numeral': '', 'level': '_level'}
    wide.columns = [short_names[_normalise(category)] + suffixes[value]
                    for value, category in wide.columns]

    # Fixed column order: all numeric values first, then all levels.
    ordered = [short + suffix for suffix in ('', '_level') for _, short in name_pairs]

    return wide[ordered].reset_index()

In [27]:
# NOTE: this rebinds final_cities from the long to the wide layout, so the cell
# cannot be run twice in a row: the wide frame has no 'category' column left
# for pivot_cities to pivot on.
final_cities = pivot_cities(final_cities)

final_cities.head(20)

Unnamed: 0,city,climate,cost_of_living,health_care,pollution,property_income_ratio,purchasing_power,safety,traffic_time,quality_of_life,climate_level,cost_of_living_level,health_care_level,pollution_level,property_income_ratio_level,purchasing_power_level,safety_level,traffic_time_level,quality_of_life_level
0,Amsterdam,87.45,85.61,69.45,30.79,11.03,83.85,67.13,29.88,168.98,Very High,Moderate,High,Low,Moderate,Moderate,High,Low,Very High
1,Andorra-La-Vella,?,66.57,69.44,64.08,7.44,86.69,87.54,5.0,?,,Moderate,High,High,Low,Moderate,Very High,Very Low,
2,Athens,95.18,60.86,56.17,57.3,12.85,41.45,50.3,37.98,119.79,Very High,Low,Moderate,Moderate,High,Very Low,Moderate,Moderate,High
3,Belgrade,84.14,41.47,53.69,63.57,22.22,35.72,62.08,35.79,108.2,Very High,Very Low,Moderate,High,Very High,Very Low,High,Moderate,Moderate
4,Berlin,83.35,69.03,69.94,39.04,9.38,102.27,59.1,33.84,166.99,Very High,Moderate,High,Low,Moderate,High,Moderate,Low,Very High
5,Bern,75.97,117.95,74.76,11.11,7.05,130.37,80.61,19.77,211.54,High,Very High,High,Very Low,Low,Very High,Very High,Very Low,Very High
6,Bratislava,80.72,52.19,57.17,41.12,13.18,64.14,68.76,30.89,148.56,Very High,Low,Moderate,Moderate,High,Low,High,Low,High
7,Brussels,83.85,75.41,74.5,62.36,6.4,91.77,48.41,37.17,144.56,Very High,Moderate,High,High,Low,Moderate,Moderate,Moderate,High
8,Bucharest,75.61,42.68,54.19,75.64,10.54,51.93,72.47,42.3,117.51,High,Very Low,Moderate,High,Moderate,Low,High,Moderate,High
9,Budapest,78.41,46.17,47.86,54.13,15.96,49.74,63.87,39.13,120.88,High,Low,Moderate,Moderate,High,Low,High,Moderate,High


## 3. Data cleaning on final table

### 3.1 Replacing the '?' placeholders with NaN

In [28]:
import numpy as np

In [29]:
# Cells holding the literal string '?' (see the Andorra-La-Vella row) become NaN
final_cities = final_cities.replace('?', np.nan)
final_cities

Unnamed: 0,city,climate,cost_of_living,health_care,pollution,property_income_ratio,purchasing_power,safety,traffic_time,quality_of_life,climate_level,cost_of_living_level,health_care_level,pollution_level,property_income_ratio_level,purchasing_power_level,safety_level,traffic_time_level,quality_of_life_level
0,Amsterdam,87.45,85.61,69.45,30.79,11.03,83.85,67.13,29.88,168.98,Very High,Moderate,High,Low,Moderate,Moderate,High,Low,Very High
1,Andorra-La-Vella,,66.57,69.44,64.08,7.44,86.69,87.54,5.0,,,Moderate,High,High,Low,Moderate,Very High,Very Low,
2,Athens,95.18,60.86,56.17,57.3,12.85,41.45,50.3,37.98,119.79,Very High,Low,Moderate,Moderate,High,Very Low,Moderate,Moderate,High
3,Belgrade,84.14,41.47,53.69,63.57,22.22,35.72,62.08,35.79,108.2,Very High,Very Low,Moderate,High,Very High,Very Low,High,Moderate,Moderate
4,Berlin,83.35,69.03,69.94,39.04,9.38,102.27,59.1,33.84,166.99,Very High,Moderate,High,Low,Moderate,High,Moderate,Low,Very High
5,Bern,75.97,117.95,74.76,11.11,7.05,130.37,80.61,19.77,211.54,High,Very High,High,Very Low,Low,Very High,Very High,Very Low,Very High
6,Bratislava,80.72,52.19,57.17,41.12,13.18,64.14,68.76,30.89,148.56,Very High,Low,Moderate,Moderate,High,Low,High,Low,High
7,Brussels,83.85,75.41,74.5,62.36,6.4,91.77,48.41,37.17,144.56,Very High,Moderate,High,High,Low,Moderate,Moderate,Moderate,High
8,Bucharest,75.61,42.68,54.19,75.64,10.54,51.93,72.47,42.3,117.51,High,Very Low,Moderate,High,Moderate,Low,High,Moderate,High
9,Budapest,78.41,46.17,47.86,54.13,15.96,49.74,63.87,39.13,120.88,High,Low,Moderate,Moderate,High,Low,High,Moderate,High


### 3.2 Fixing the data types

In [30]:
# Check which columns still need converting to a numeric dtype
final_cities.dtypes

city                            object
climate                         object
cost_of_living                  object
health_care                     object
pollution                       object
property_income_ratio           object
purchasing_power                object
safety                          object
traffic_time                    object
quality_of_life                float64
climate_level                   object
cost_of_living_level            object
health_care_level               object
pollution_level                 object
property_income_ratio_level     object
purchasing_power_level          object
safety_level                    object
traffic_time_level              object
quality_of_life_level           object
dtype: object

In [31]:
# Every numeric column, including quality_of_life. Whether the earlier
# .replace('?', np.nan) already turned that column into float64 depends on the
# pandas version (silent downcasting in replace is deprecated since 2.2), and
# pd.to_numeric on a column that is already float is a harmless no-op.
cols_to_float = ['climate', 'cost_of_living', 'health_care', 'pollution',
                 'property_income_ratio', 'purchasing_power', 'safety',
                 'traffic_time', 'quality_of_life']

In [32]:
# Convert each listed column with pd.to_numeric; column order is unchanged
final_cities = final_cities.assign(
    **{name: pd.to_numeric(final_cities[name]) for name in cols_to_float}
)

In [33]:
# Confirm that the converted columns now have a numeric dtype
final_cities.dtypes

city                            object
climate                        float64
cost_of_living                 float64
health_care                    float64
pollution                      float64
property_income_ratio          float64
purchasing_power               float64
safety                         float64
traffic_time                   float64
quality_of_life                float64
climate_level                   object
cost_of_living_level            object
health_care_level               object
pollution_level                 object
property_income_ratio_level     object
purchasing_power_level          object
safety_level                    object
traffic_time_level              object
quality_of_life_level           object
dtype: object

## 4. Exporting to json format

In [34]:
# Relative path: resolved against the kernel's working directory, and the
# ../data folder must already exist
final_cities.to_json('../data/Socio_economic_data.json')