In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import re
from matplotlib.ticker import MaxNLocator
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
import statsmodels
import folium
import math

# Custom imports
from ipywidgets import IntProgress
from IPython.display import display
import time
from multiprocessing import Pool, Lock
import os
import json
import seaborn as sns
import time

# **Question 3**

Is it possible to rank some patterns of land usage with social and environmental factors such as employment, life satisfaction (if data are present), income inequalities (if data are present), emissions and finally soil quality and sustainability?

To answer this question we will need to acquire additional data, but before doing that, we must define the **scope** of the question. It doesn't really make sense to aggregate countries into continents since the social factors are really specific to each country. Therefore, we reduce our scope to a list of 11 countries that we are interested in and feel are important.

In [None]:
# Countries in scope for question 3 (names spelled as in the FAO land dataset).
countries = [
    'France',
    'Canada',
    'Germany',
    'India',
    'Japan',
    'Russian Federation',
    'Switzerland',
    'United States of America',
    'United Kingdom',
    'China',
    'Israel',
]

## Data acquisition and cleaning

In [None]:
# Read the FAO land-use table from disk and preview its first rows.
land_data_path = "./data/fao_data_land_data.csv"
dataLands = pd.read_csv(land_data_path)
dataLands.head()

In [None]:
# investigate NaN values: select the rows whose 'value' entry is NaN and show three of them
value_is_nan = np.isnan(dataLands['value'].values)
dataLands[value_is_nan].head(3)

We observe that the dataset is pretty clean. The only NaNs in 'value' column are actually the footnotes so we can drop them.

In [None]:
# Discard every row that has no 'value' entry, then look at the end of the table.
has_value = dataLands["value"].notna()
dataLands = dataLands[has_value]
dataLands.tail(3)

In [None]:
# list every land category present in the dataset
land_categories = dataLands['category'].unique().tolist()
print(land_categories)

# we only keep agricultural area
is_agricultural = dataLands['category'] == 'agricultural_area'
dataLands = dataLands[is_agricultural]

In [None]:
# let's observe the temporal evolution of agricultural lands for our selected countries

def check_country(countries, DF):
    """Check whether every entry of ``countries`` occurs in the pandas Series ``DF``.

    A line ``Country <name> is missing!`` is printed for each absent country.

    Parameters
    ----------
    countries : iterable of str
        Country names to look for.
    DF : pandas.Series
        Column holding the country names of a dataset.

    Returns
    -------
    list
        The countries that were not found, in the order given (empty when
        nothing is missing).
    """
    # Compute the distinct values once instead of once per country.
    present = set(DF.unique())
    missing = [country for country in countries if country not in present]
    for country in missing:
        print('Country', country, 'is missing!')
    return missing
        
check_country(countries, dataLands['country_or_area'])

# restrict the land data to the selected countries
in_selection = dataLands['country_or_area'].isin(countries)
rdataLands = dataLands.loc[in_selection]

# group by country
grdataLands = rdataLands.groupby('country_or_area')

plt.figure(figsize=(15,15))
for country in countries:
    country_lands = grdataLands.get_group(country)
    # normalize each country's land area by dividing by its own mean
    relative_area = country_lands.value/np.mean(country_lands.value)
    sns.lineplot(x=country_lands.year, y=relative_area, label=country)

plt.ylabel('land area / mean land area')
plt.title('Relative evolution of agricultural land areas');

We observe an overall downward trend in the agricultural land areas except for a few countries (India and China). However, we also notice an anomaly with Russia that must be investigated: no data is present before 1991.

After some research we found out that the dissolution of the Soviet Union (USSR) took place on 26 December 1991, creating the country Russia. We must correct our data:

In [None]:
# All USSR rows of the (agricultural) land data
ussr_lands = dataLands[dataLands['country_or_area'] == 'USSR']
display(ussr_lands.head(3))

# we don't really care about USSR, therefore, we rename it to Russia and subtract
# (USSR_land_1991 - Russia_land_1992) from its values so both series join up
if not ussr_lands.empty:
    ussr_1991 = ussr_lands.loc[ussr_lands['year'] == 1991, 'value'].values
    russia_1992 = dataLands.loc[
        (dataLands['country_or_area'] == 'Russian Federation') & (dataLands['year'] == 1992),
        'value',
    ].values
    to_subtract = ussr_1991 - russia_1992

    # Build all back-filled rows at once: DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    russia_backfill = pd.DataFrame({
        'country_or_area': 'Russian Federation',
        'year': ussr_lands['year'].values,
        'value': ussr_lands['value'].values - to_subtract[0],
    })

    # Drop rows added by a previous run of this cell so that re-running it does not
    # duplicate the back-filled years.
    # NOTE(review): assumes the USSR and Russian Federation series cover disjoint years.
    already_backfilled = (
        (rdataLands['country_or_area'] == 'Russian Federation')
        & rdataLands['year'].isin(russia_backfill['year'])
    )
    rdataLands = pd.concat(
        [rdataLands[~already_backfilled], russia_backfill],
        ignore_index=True,
        sort=False,
    )

In [None]:
# check the graph again, now that the Russian series has been extended
# group by country
grdataLands = rdataLands.groupby('country_or_area')

plt.figure(figsize=(15,15))
for country in countries:
    country_lands = grdataLands.get_group(country)
    # we normalize the land area of each country by dividing by its mean
    sns.lineplot(x=country_lands.year, y=country_lands.value/np.mean(country_lands.value), label=country)

plt.ylabel('land area / mean land area')
# trailing ';' keeps the Text repr out of the cell output, like the other plot cells
plt.title('Relative evolution of agricultural land area');

In [None]:
# One bar per selected country: mean agricultural land area over all available years.
plt.figure(figsize=(20,5))
for country in countries:
    mean_area = np.mean(grdataLands.get_group(country).value)
    plt.bar(x=country, height=mean_area, label=country)
plt.legend()
plt.title('Mean agriculture land area');

In [None]:
# Soil erosion data
dataSoil = pd.read_csv('data/current_FAO/raw_files/Environment_Soil_E_All_Data.csv', encoding = "ISO-8859-1")

# check for our countries; check_country prints the missing ones itself, so wrapping
# it in print() only added a stray extra line to the output
check_country(countries, dataSoil['Country'])

# reduce to selected countries
rdataSoil = dataSoil.loc[dataSoil['Country'].isin(countries)]
rdataSoil.head()

Except for the carbon content, all the other data (erosion and land degradation) have only one measurement, in 1991, for each country.

## **New Data**
In order to answer the question at the beginning we need to search for and acquire some new data:

### Life expectancy at birth for both sexes combined (years):
http://data.un.org/Data.aspx?d=PopDiv&f=variableID%3a68

In [None]:
# Load the UN life-expectancy table and preview three rows.
life_expectancy_path = 'data/life_expectancy.csv'
lifeExp_df = pd.read_csv(life_expectancy_path)
lifeExp_df.head(3)

In [None]:
# rename columns for consistency (re-assigned rather than mutated in place)
lifeExp_df = lifeExp_df.rename(columns={'Country or Area': 'country_or_area', 'Year(s)': 'years', 'Variant': 'variant', 'Value': 'value'})

# Check our list; check_country prints the missing countries itself, so no print() wrapper
check_country(countries, lifeExp_df['country_or_area'])

# reduce to our selected countries; .copy() yields an independent frame so that the
# column changes made in the following cell do not trigger SettingWithCopyWarning
rlifeExp_df = lifeExp_df.loc[lifeExp_df['country_or_area'].isin(countries)].copy()

# do we have the same problem as before with russia ?
rlifeExp_df[rlifeExp_df['country_or_area']=='Russian Federation'].tail()

Fortunately we don't have the above problem with Russia in this dataset. We observe that 'Years' contains 5-year-long periods, thus we transform it by keeping only the first year of each period and erasing the second one (`-19**`).

In [None]:
# clean column year: keep only the part of 'years' before the '-' as an integer 'year'.
# Working through assign()/drop() on a new frame avoids writing into a slice of
# lifeExp_df, and the guard makes the cell safe to re-run once 'years' is gone.
if 'years' in rlifeExp_df.columns:
    period_start = rlifeExp_df['years'].str.split('-', expand=True).iloc[:,0].astype('int')
    rlifeExp_df = rlifeExp_df.assign(year=period_start).drop(columns='years')
rlifeExp_df = rlifeExp_df[rlifeExp_df['year'] < 2019] # we are not interested in predictions

In [None]:
# simple observations
# group the life-expectancy rows by country
grlifeExp_df = rlifeExp_df.groupby('country_or_area')

plt.figure(figsize=(15,15))
for country in countries:
    country_life = grlifeExp_df.get_group(country)
    sns.lineplot(x=country_life.year, y=country_life.value, label=country)

plt.ylabel('Life expectancy')
plt.title('Evolution of life expectancy');

We see that India and China had a big evolution from the 1960s. However, China is the only country that also had a similar evolution in its agricultural lands.

### Value added by industries at current prices (ISIC Rev. 3)

We were unable to download all the data from the below website, therefore, we reduced the industries to agriculture.

http://data.un.org/Data.aspx?d=SNA&f=group_code%3a201

In [None]:
# Load the value-added table and keep only the rows that actually carry a value.
valueAdded_df = (
    pd.read_csv('data/value_added.csv', low_memory=False)
    .dropna(subset=['Value'])
)
valueAdded_df.head(3)

In [None]:
# rename columns for consistency (re-assigned rather than mutated in place)
valueAdded_df = valueAdded_df.rename(columns={'Country or Area': 'country_or_area', 'Year': 'year', 'Value': 'value'})

# Check our list; check_country prints the missing countries itself, so no print() wrapper
check_country(countries, valueAdded_df['country_or_area'])

In [None]:
# print every distinct country/area name found in the value-added data
value_added_countries = valueAdded_df['country_or_area'].unique().tolist()
print(value_added_countries)

In [None]:
# Use the same spelling for the United States as in our country list
valueAdded_df = valueAdded_df.replace(to_replace='United States', value='United States of America')

# keep only the selected countries
in_selection = valueAdded_df['country_or_area'].isin(countries)
rvalueAdded_df = valueAdded_df.loc[in_selection]

# do we have the same problem as before with russia ?
russia_value_added = rvalueAdded_df[rvalueAdded_df['country_or_area']=='Russian Federation']
russia_value_added.tail()

In [None]:
# simple observations
# group by country -- use the frame reduced to the selected countries (rvalueAdded_df),
# as every other section does, instead of grouping the full dataset
gvalueAdded_df = rvalueAdded_df.groupby('country_or_area')

plt.figure(figsize=(15,15))
for country in countries:
    country_added = gvalueAdded_df.get_group(country)
    # plot and normalize by mean
    sns.lineplot(x=country_added.year, y=country_added.value/np.mean(country_added.value), label=country)

plt.ylabel('added-value')
plt.title('Economic added value by agriculture');

We see a strange peak with Russia that will have to be investigated for the final milestone. Russia's data starts from 1990 so we might have the same problem as before, but USSR is absent from the data...

### Employment by sex and economic activity

https://www.ilo.org/shinyapps/bulkexplorer5/?lang=en&segment=indicator&id=EMP_TEMP_SEX_ECO_NB_A

In [None]:
# Load the ILO employment table, preview it, then remove columns we do not use.
employment_df = pd.read_csv('data/Employment.csv')
display(employment_df.head(3))
unused_columns = ['obs_status.label', 'note_classif.label', 'note_indicator.label']
employment_df = employment_df.drop(columns=unused_columns)

In [None]:
# rename columns for consistency (re-assigned rather than mutated in place)
employment_df = employment_df.rename(columns={'ref_area.label': 'country_or_area', 'time': 'year', 'obs_value': 'value', 'classif1.label': 'activity'})

# Correct for united states *before* checking the list, otherwise the check reports
# the United States as missing although it is only spelled differently
employment_df = employment_df.replace(to_replace='United States', value='United States of America')

# Check our list; check_country prints the missing countries itself, so no print() wrapper
check_country(countries, employment_df['country_or_area'])

# reduce to our selected countries
remployment_df = employment_df.loc[employment_df['country_or_area'].isin(countries)]

# USSR isn't present: no problem
np.sort(employment_df['country_or_area'].unique())[-20:]

In [None]:
# do a sub selection of only agricultural related economic activities.
# na=False treats rows without an activity label as non-matching instead of failing,
# and .copy() makes the result an independent frame that later cells can modify
# without SettingWithCopyWarning.
is_agriculture = remployment_df['activity'].str.contains('Agriculture', na=False)
remployment_df = remployment_df[is_agriculture].copy()

In [None]:
# extra cleaning: drop useless columns and shorten the sex labels.
# Done on a new frame (no inplace=True) so we never write into a slice of employment_df;
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
remployment_df = (
    remployment_df
    .drop(columns=['indicator.label', 'source.label', 'note_source.label'], errors='ignore')
    .replace({'Sex: Male': 'male', 'Sex: Female': 'female', 'Sex: Total': 'total'})
)

In [None]:
# preview the cleaned agricultural employment data (first five rows)
remployment_df.head(n=5)

In [None]:
# simple observations
# keep the totals over both sexes, then group by country
total_employment = remployment_df[remployment_df['sex.label'] == 'total']
gremployment_df = total_employment.groupby('country_or_area')

plt.figure(figsize=(15,15))
for country in countries:
    country_employment = gremployment_df.get_group(country)
    # each series is divided by its own mean before plotting
    relative_employment = country_employment.value/np.mean(country_employment.value)
    sns.lineplot(x=country_employment.year, y=relative_employment, label=country)

plt.ylabel('employment')
plt.title('Employment in agriculture');

We see an overall downtrend in agricultural employment. Let's compare the mean employment of the countries:

In [None]:
# One bar per selected country: mean agricultural employment over all available years.
plt.figure(figsize=(20,5))
for country in countries:
    mean_employment = np.mean(gremployment_df.get_group(country).value)
    plt.bar(x=country, height=mean_employment, label = country)
plt.legend()
# trailing ';' keeps the Text repr out of the cell output, like the other plot cells
plt.title('Mean Employment in agriculture');

In order to have a meaningful comparison the above values should be divided by the total population, but in an absolute comparison we can see that employment in agricultural sectors is rather big in China and India.

Compared to the bar chart of agricultural lands above, we notice for example that the United States has a pretty vast agricultural land area but comparatively very little employment in agriculture.

### Non fatal occupational injuries per 100'000 workers by economic activity

We found it hard to find reliable data for the 'quality of life', which could be subjective and not necessarily related to agriculture. Thus, we found the below dataset which describes the amount of non fatal injuries per economic activity, which we found interesting as a way to assess the social quality and safety of agriculture.

https://www.ilo.org/shinyapps/bulkexplorer32/?lang=en&segment=indicator&id=INJ_NFTL_ECO_RT_A

In [None]:
nonFatalInjuries_df = pd.read_csv('data/non_fatal_injuries.csv')

# look at the content of two annotation columns before dropping them
obs_status_values = nonFatalInjuries_df.obs_status.unique()
print(obs_status_values)
note_classif_values = nonFatalInjuries_df.note_classif.unique().tolist()
print(note_classif_values)

# drop useless columns
columns_to_drop = [
    'obs_status', 'obs_status.label',
    'note_classif', 'note_classif.label',
    'note_source', 'note_source.label',
    'indicator.label', 'note_indicator',
    'source', 'source.label',
]
nonFatalInjuries_df = nonFatalInjuries_df.drop(columns=columns_to_drop)
nonFatalInjuries_df.head(3)

In [None]:
# rename columns for consistency (re-assigned rather than mutated in place)
nonFatalInjuries_df = nonFatalInjuries_df.rename(columns={'ref_area.label': 'country_or_area', 'time': 'year', 'obs_value': 'value', 'classif1.label': 'activity'})

# Check our list; check_country prints the missing countries itself, so no print() wrapper
check_country(countries, nonFatalInjuries_df['country_or_area'])

In [None]:
# alphabetically sorted list of every country/area present in the injuries data
injury_countries = np.sort(nonFatalInjuries_df['country_or_area'].unique())
print(injury_countries.tolist())

In [None]:
# Problem: Canada is REALLY missing...

# Correct for united states
nonFatalInjuries_df = nonFatalInjuries_df.replace(to_replace='United States', value='United States of America')

# Only keep activities related to agriculture (rows without an activity label do not match)
is_agriculture = nonFatalInjuries_df['activity'].str.contains('Agriculture', na=False)
rnonFatalInjuries_df = nonFatalInjuries_df[is_agriculture]

# There are several regions (cities) of China present in the data set but no China as a whole.
# As our value of interest is a rate, we can take the mean of the latter rates for China as a whole
# The mask is built from the frame it is applied to (the agriculture-only one), so the
# boolean indexer is aligned; drop=True avoids carrying the old index along as an
# extra 'index' column.
is_china = rnonFatalInjuries_df['country_or_area'].str.contains('China', na=False)
china_injuries = rnonFatalInjuries_df[is_china].reset_index(drop=True)
print(china_injuries['country_or_area'].unique())    # different chinese cities present

In [None]:
# preview the Chinese regions found in the agricultural injuries data (first five rows)
china_injuries.head(n=5)

There is only one entry for Macau and its value is 0. So we drop it and represent China by Taiwan.

In [None]:
# Drop the Macau entry (row label 0) and let Taiwan stand in for China.
# errors='ignore' keeps the cell re-runnable once the row is gone, and also removes the
# stray 'index' column that reset_index() may have left behind.
china_injuries = (
    china_injuries
    .drop(index=0, errors='ignore')
    .drop(columns='index', errors='ignore')
    .replace('Taiwan, China', 'China')
)

# append with China data. DataFrame.append was removed in pandas 2.0, so use pd.concat.
# Rows already labelled 'China' (added by an earlier run of this cell) are removed first
# so that re-running does not duplicate them.
without_china = rnonFatalInjuries_df[rnonFatalInjuries_df['country_or_area'] != 'China']
rnonFatalInjuries_df = pd.concat([without_china, china_injuries], sort=False, ignore_index=True)

# reduce to our selected countries
rnonFatalInjuries_df = rnonFatalInjuries_df.loc[rnonFatalInjuries_df['country_or_area'].isin(countries)]

# recheck countries
check_country(countries, rnonFatalInjuries_df['country_or_area'])

We now see that Japan is also missing, but that is because Japan has no 'Agriculture' entry in its activity column.

In [None]:
# simple observations
# group the injury rates by country
grnonFatalInjuries_df = rnonFatalInjuries_df.groupby('country_or_area')

plt.figure(figsize=(15,15))
# only loop over the countries that are actually present in the reduced data
for country in rnonFatalInjuries_df['country_or_area'].unique():
    country_injuries = grnonFatalInjuries_df.get_group(country)
    # the rate is plotted as is (it is not normalized by its mean)
    sns.lineplot(x=country_injuries.year, y=country_injuries.value, label=country)

plt.ylabel('injuries rate')
plt.title('Non fatal injuries in agriculture per 100000 workers');

We notice that the data for some countries (China, Russia, India, US, Germany) doesn't cover the entire year range. We also observe unusually high values for Switzerland, which makes us question the sanity of the data. This data, in its current form, might not be usable for further analysis...

# Analysis plan

- Reduce the number of selected countries
- Derive relations (such as correlations) between land usage and the above factors for the selected topics.
- In order to rank the importance of the above topics, we could try to develop a model to predict land usage (via panel data regression for example). 
- Make the link with question 1: greenhouse gas emissions