In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import statsmodels
import folium
import math

# Custom imports
from ipywidgets import IntProgress
from IPython.display import display
import time
from multiprocessing import Pool, Lock
import os
import json
import seaborn as sns
import time

# **Question 3 - Analysis**

Is it possible to rank some patterns of land usage with social and environmental factors such as employment, life satisfaction (if data are present), income inequalities (if data are present), emissions, and finally soil quality and sustainability?

As we saw in the data cleaning notebook, the data is messy and it is difficult to do an analysis for more than 5 countries. Therefore, we reduce our subset to the following five countries: China, India, USA, UK, Switzerland.

We chose these countries because we were interested in them and thought that we might find interesting results as they are somehow special countries:

- China has had a massive growth and is now world leading.
- India is a big country with a very traditional soul and also huge growth in the recent years.
- USA has always been a world power; however, it has not been growing as fast as the above countries in recent years.
- UK has been a leader in Europe and has had several interesting events in the recent years such as Brexit.
- Switzerland has always been rather stable and it's the country we live in :)

In [None]:
# Countries kept for the analysis. The spellings are the ones matched against
# the country columns of the datasets when they are filtered with .isin().
countries = [
    'China',
    'India',
    'United States of America',
    'United Kingdom',
    'Switzerland',
]

In [None]:
# Load the cleaned datasets stored as pickles under data/.
# NOTE: unpickling can execute arbitrary code, so only load files you trust.
dataLands = pd.read_pickle('data/dataLand.pkl')
dataSoil = pd.read_pickle('data/dataSoil.pkl')
employment = pd.read_pickle('data/employment.pkl')
lifeExp = pd.read_pickle('data/lifeExp.pkl')
# The variable name below is misspelled ("Fatatl"); it is kept unchanged
# because the country-filtering cell refers to it under this exact name.
nonFatatlInjuries = pd.read_pickle('data/nonFatalInjuries.pkl')
addedValue = pd.read_pickle('data/valueAdded.pkl')

In [None]:
def _only_selected(frame, country_col):
    """Return the rows of `frame` whose `country_col` value is in `countries`.

    The result gets a fresh 0..n-1 index (the old index is dropped).
    """
    keep = frame[country_col].isin(countries)
    return frame.loc[keep].reset_index(drop=True)


# Restrict every dataset to the selected countries.
# The soil data names its country column 'Country'; all others use 'country_or_area'.
dataLands = _only_selected(dataLands, 'country_or_area')
dataSoil = _only_selected(dataSoil, 'Country')
employment = _only_selected(employment, 'country_or_area')
lifeExp = _only_selected(lifeExp, 'country_or_area')
injuries = _only_selected(nonFatatlInjuries, 'country_or_area')
addedValue = _only_selected(addedValue, 'country_or_area')

## Population Data

We also acquired some new data on world population via the link below. We selected our list of countries directly on the website as it's simpler.

https://databank.worldbank.org/reports.aspx?source=2&series=SP.POP.TOTL&country=#

In [None]:
# Read the population export and discard every row containing a missing value.
population = (
    pd.read_csv('data/population.csv')
    .dropna()
)

In [None]:
# Keep only the country name and the per-year columns.
# The frame is re-assigned instead of mutated with inplace=True, and
# errors='ignore' skips columns that are already gone, so re-running this
# cell no longer raises a KeyError.
population = population.drop(
    columns=['Country Code', 'Series Code', 'Series Name'],
    errors='ignore',
)
population

In [None]:
# The per-year column labels are unwieldy, so build a lookup from each label
# (every column after the first one) to an integer year. The pairing is purely
# positional: the columns are assumed to run 1960..2018 in order.
col_years = dict(zip(population.columns[1:], np.arange(1960, 2019)))

In [None]:
# Reshape to long format (one row per country and year), swap the raw column
# labels for integer years, and rename the USA so it matches the spelling used
# in `countries`.
population = (
    population
    .melt(id_vars=['Country Name'], var_name='year', value_name='value')
    .replace(col_years)
    .replace({'United States': 'United States of America'})
)
population.head(6)

In [None]:
# Group once by country; `grPopulation` is also used by the per-capita
# normalisation cells further down.
grPopulation = population.groupby('Country Name')

plt.figure(figsize=(10,10))
for country in countries:
    # fetch the country's rows once instead of once per axis
    country_pop = grPopulation.get_group(country)
    sns.lineplot(x=country_pop.year, y=country_pop.value, label=country)

plt.ylabel('Population')
# title typo fixed ('popularion' -> 'population')
plt.title('Relative evolution of population');

We can see that the populations of India and China grow really fast compared to the USA. The populations of the UK and Switzerland are pretty stable.

In [None]:
# group by country
grdataLands = dataLands.groupby('country_or_area')

plt.figure(figsize=(10,10))
for country in countries:
    land = grdataLands.get_group(country)
    people = grPopulation.get_group(country)

    # year -> population lookup, built once per country instead of
    # re-filtering both frames for every single year
    pop_by_year = dict(zip(people['year'].to_numpy(), people['value'].to_numpy()))

    # [year, land / population] for each land row whose year has a population
    # figure; years missing from the population data are skipped rather than
    # crashing on an empty division result
    norm_land = [
        [year, area / pop_by_year[year]]
        for year, area in zip(land['year'].to_numpy(), land['value'].to_numpy())
        if year in pop_by_year
    ]

    sns.lineplot(x=[row[0] for row in norm_land], y=[row[1] for row in norm_land], label=country, estimator=None)

plt.ylabel('land area / population')
plt.title('Relative evolution of agricultural land areas');

We see some strange kinks for Switzerland; let's investigate.

In [None]:
# Switzerland only: population (left panel) next to agricultural land (right panel)
swiss_pop = grPopulation.get_group('Switzerland')
swiss_land = grdataLands.get_group('Switzerland')

fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10,5))
panels = [(swiss_pop, 'Population'), (swiss_land, 'Agricultural Lands')]
for axis, (frame, heading) in zip(ax, panels):
    # each panel plots the frame's `value` column against its `year` column
    sns.lineplot(x=frame.year, y=frame.value, label='Switzerland', ax=axis)
    axis.set_title(heading)

The problem comes from the Land data; According to Encyclopedia.com :"Swiss agricultural policy is highly regulated, with fixed prices and quota restrictions maintained on several products. Domestic production is encouraged by the imposition of protective customs and duties on imported goods, and by restrictions on imports. The Federal Council has the authority to fix prices of bread grains, flour, milk, and other foodstuffs. Production costs in Switzerland, as well as international exchange rates favorable to the Swiss franc, make competition with foreign products difficult. This highly protectionist system has led to excess production and mounting costs associated with the management of surpluses. The Uruguay Round and subsequent Swiss implementation of its provisions in July 1995 (along with rising costs in the agricultural sector) has forced the government to begin reforming its agricultural support system."

This could be a potential explanation for this behavior, however the kinks in the graph don't really matter for the overall analysis.

We will now investigate agricultural employment for each country, comparing male and female employment. As China has only reported total employment (no male/female separation), it does not appear in the first set of subplots.

In [None]:
# group by country and sex label
grEmployment = employment.groupby(['country_or_area','sex.label'])

# One panel per country. plt.subplots creates its own figure, so the extra
# plt.figure(...) call that used to precede it (and produced an empty figure
# in the output) has been removed.
fig, ax = plt.subplots(nrows=1, ncols=4, figsize=(20,5))

# China is not in this list: the notebook text notes that it only reports totals.
for i,country in enumerate(['India', 'United States of America', 'United Kingdom', 'Switzerland']):

    # fetch each group once rather than once per axis argument
    males = grEmployment.get_group((country, 'male'))
    females = grEmployment.get_group((country, 'female'))

    sns.lineplot(x=males.year, y=males.value, label='Male', ax=ax[i])
    sns.lineplot(x=females.year, y=females.value, label='Female', ax=ax[i])
    ax[i].set_title('Agricultural employment in '+country)

We observe several interesting facts:
- The gap between male and female employment in agriculture decreases over time for USA, UK and Switzerland but stays rather stable for India.
- Comparing male and female employment, we notice that female agricultural employment stays rather stable over time while male employment decreases. An explanation for this observation could be the relationship of farmers of different sexes with "technology" and automation. In fact, the major reason for the decrease in agricultural employment could be industrialization and the automation of agriculture using technology and machines:

    - Male labor in agriculture has gradually been replaced by machines over time.
    - Agricultural industrialization had a lower impact on female labor, as women were probably not the ones harvesting the plants.
    
    
- We also notice a strange downward kink in the USA's chart right before the year 2000. This could be due to a big technological leap in agriculture, but it is most likely due to the dot-com bubble, as many people at that time became 'entrepreneurs', creating random websites and easily obtaining funding and revenue.

In [None]:
# total agricultural employment, normalised by population
plt.figure(figsize=(10,10))
for country in countries:
    total_jobs = grEmployment.get_group((country, 'total'))
    people = grPopulation.get_group(country)

    # year -> population lookup, built once per country instead of
    # re-filtering both frames for every single year
    pop_by_year = dict(zip(people['year'].to_numpy(), people['value'].to_numpy()))

    # [year, employment / population] for each employment row whose year has
    # a population figure. The membership test replaces the old
    # `year >= 1960` guard and also skips years past the end of the
    # population data, which previously crashed on an empty division result.
    norm_employment = [
        [year, jobs / pop_by_year[year]]
        for year, jobs in zip(total_jobs['year'].to_numpy(), total_jobs['value'].to_numpy())
        if year in pop_by_year
    ]

    sns.lineplot(x=[row[0] for row in norm_employment], y=[row[1] for row in norm_employment], label=country, estimator=None)

plt.ylabel('employment / population')
plt.title('Relative evolution of agricultural employment');

We observe that China has the highest ratio of relative agricultural employment followed by India, Switzerland, USA and UK.

### Life Expectancy

In [None]:
# One line per selected country: life expectancy (`value`) against `year`.
grLifeExp = lifeExp.groupby('country_or_area')

plt.figure(figsize=(10,10))
for country in countries:
    country_life = grLifeExp.get_group(country)
    sns.lineplot(data=country_life, x='year', y='value', label=country)

plt.ylabel('Life expectancy')
plt.title('Evolution of life expectancy');

## Economic Added Value

In [None]:
# group the added-value data by country
grAddedValue = addedValue.groupby('country_or_area')

plt.figure(figsize=(10,10))
for country in countries:
    added = grAddedValue.get_group(country)
    people = grPopulation.get_group(country)

    # year -> population lookup, built once per country instead of
    # re-filtering both frames for every single year
    pop_by_year = dict(zip(people['year'].to_numpy(), people['value'].to_numpy()))

    # [year, added value / population] for each added-value row whose year
    # has a population figure. The membership test replaces the old
    # `year >= 1960` guard and also skips years past the end of the
    # population data, which previously crashed on an empty division result.
    norm_addedValue = [
        [year, amount / pop_by_year[year]]
        for year, amount in zip(added['year'].to_numpy(), added['value'].to_numpy())
        if year in pop_by_year
    ]

    sns.lineplot(x=[row[0] for row in norm_addedValue], y=[row[1] for row in norm_addedValue], label=country, estimator=None)

# NOTE(review): the plotted quantity is added value divided by population,
# while the label below reads as an absolute value - confirm the intended wording.
plt.ylabel('economic added value')
plt.title('Agriculture economic added value');

It is interesting to notice that India has the largest economic added value via agriculture; however, it has the smallest agricultural land to population ratio.

In [None]:
# Render the soil data (already restricted to the selected countries) as a rich table.
display(dataSoil)