In [None]:
# imports
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
import json
import folium

# data paths
# CSV export of the Eurostat `lfsa_urgan` table; read with `pd.read_csv` in the
# Task 1 data-importing cell. The path is relative, so it is resolved against
# the directory the notebook kernel runs in.
TASK_1_DATA_PATH = './data/lfsa_urgan_1_Data.csv'

# Task 1: European unemployment rate (Eurostat)

### Getting the data from [Eurostat](http://ec.europa.eu/eurostat/data/database)

Database by themes/Population and social conditions/Labour market (Labour)/Employment and Unemployment(Labour Force survey)(employ)/LFS series - detailed annual survey results (LFSA)/ Total unemployment - LFS series(lfsa_unemp)/  Unemployment rates by sex, age and nationality (%) (lfsa_urgan)



Age from 15 to 74

How we got the data:
- we used the data explorer, then 'Download' button, 'Change Selection' to keep only what we're interested in, then download data in CSV format

### Data importing and cleaning
First, we import the unemployment rates from the Eurostat website into a DataFrame. Since we are looking for recent statistics, we only keep the data from last year (2016). We also discard all columns except the ones we need (country and unemployment rate). A few countries also need to be renamed, so that the country names are consistent between the Eurostat DataFrame and the topojson file. We also drop some unwanted rows, such as the average rate for countries from the European Union.

In [None]:
# Load the raw Eurostat export and derive the cleaned frame in one non-mutating
# chain. This avoids `inplace=True` on a filtered slice (which can raise
# SettingWithCopyWarning) and keeps the raw data available for inspection.
raw_df = pd.read_csv(TASK_1_DATA_PATH)

# Eurostat labels that differ from the country names used in the topojson file.
country_renames = {
    'Former Yugoslav Republic of Macedonia, the': 'The former Yugoslav Republic of Macedonia',
    'Germany (until 1990 former territory of the FRG)': 'Germany',
}

df = (
    raw_df
    # only keep data from last year, and only the country / rate columns
    .loc[raw_df['TIME'] == 2016, ['GEO', 'Value']]
    .rename(columns={'GEO': 'Country', 'Value': 'Unemployment rate'})
    .reset_index(drop=True)  # positions 0..n-1, so the positional drop below is well-defined
    .assign(Country=lambda frame: frame['Country'].replace(country_renames))
    # NOTE(review): drops the first six rows by position -- presumably the
    # aggregate entries (e.g. European Union averages) mentioned in the text
    # above; confirm against the CSV if the export is ever regenerated.
    .drop(index=[0, 1, 2, 3, 4, 5])
    .reset_index(drop=True)  # re-index the dataframe after dropping rows
)
df.head()

Then, we import the topojson data:

In [None]:
# Path to the topojson file describing the European country geometries.
state_geo_path = r'topojson/europe.topojson.json'

# Read the topojson inside a context manager so the file handle is closed
# deterministically (a bare `open()` passed to `json.load` is never closed
# explicitly). JSON is read as UTF-8 so country names decode the same way
# regardless of the platform's default encoding.
with open(state_geo_path, encoding='utf-8') as geo_file:
    geo_json_data = json.load(geo_file)

Now, let's see if we are missing some data. To do this, we extract the list of countries from the topojson file and check if some of these countries are not in our DataFrame:

In [None]:
# First, extract the list of country names from the topojson geometries
countries = [
    geometry['properties']['NAME']
    for geometry in geo_json_data['objects']['europe']['geometries']
]
# Countries present in the topojson for which the DataFrame has no row
missing_countries = set(countries) - set(df['Country'])
print("The data is missing for %d out of %d countries:\n" %(len((missing_countries)), len(countries)))
# Sets have no defined iteration order; sort so the listing is identical on every run
print('\n'.join(sorted(missing_countries)))

Let's ensure that each country from the DataFrame has been matched to a country in the topojson:

In [None]:
# Every country in the DataFrame must have a geometry with the same name in the
# topojson. On failure, report the offending names instead of a bare AssertionError.
unmatched_countries = set(df['Country']) - set(countries)
assert not unmatched_countries, (
    "Countries in the DataFrame without a match in the topojson: %s"
    % sorted(unmatched_countries, key=str)
)

TODO: either find a way to fill missing countries, or explain why we did not do it

### Build the choropleth map

In [None]:
# Base map centred on [53, 15] at zoom level 4.
m = folium.Map([53, 15],  tiles='cartodbpositron', zoom_start=4)

# Earlier experiment for styling countries without data, kept for reference:
#ignore_missing_countries = lambda x: {'fillOpacity':1,'fillColor':'YlOrRd' if x['properties']['NAME'] in df.Country.values else 'black'}
#folium.TopoJson(open(state_geo_path), 'objects.europe', style_function =ignore_missing_countries).add_to(m)

# Choropleth settings: rows of `df` are joined to the topojson geometries by
# matching the 'Country' column against each feature's NAME property.
choropleth_options = dict(
    geo_data=geo_json_data,
    name='choropleth',
    data=df,
    columns=['Country', 'Unemployment rate'],
    key_on='feature.properties.NAME',
    topojson='objects.europe',
    fill_color='YlOrRd',
    fill_opacity=0.6,
    line_opacity=1,
    legend_name='Unemployment Rate (%)'
)

# `Map.choropleth` is deprecated in newer folium releases in favour of the
# `folium.Choropleth` class, which takes the same arguments. Use the class when
# the installed folium provides it, and fall back to the legacy method otherwise,
# so the cell runs on both old and new versions.
if hasattr(folium, 'Choropleth'):
    folium.Choropleth(**choropleth_options).add_to(m)
else:
    m.choropleth(**choropleth_options)
m