In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pylab as plt
from scipy import stats

In [None]:
%matplotlib inline

Is the temporal evolution of agricultural land always correlated with the temporal evolution of the quantity of fertilizers used (and waste emission), or is there some point in time where this trend slows or reverses for some countries?

#### DATA PREPROCESSING

In [None]:
# Build `mergeFert`: agricultural use of each nutrient, indexed by (Area, Year),
# with one column per nutrient (NitrogenUse, PhosphateUse, PotashUse).

def _nutrient_use(melted, item_label, value_column):
    """Select the rows of `melted` whose Item equals `item_label`.

    Returns a frame indexed by (Area, Year) whose single column holds the
    original `Value` column renamed to `value_column`.
    """
    rows = melted.loc[melted.Item == item_label, ['Area', 'Year', 'Value']]
    return rows.rename(columns={'Value': value_column}).set_index(['Area', 'Year'])


Fertdf = pd.read_csv('data/Inputs_FertilizersNutrient_E_All_Data.csv', sep=',', engine='python')

# Identifier columns followed by the wide year columns Y2002 ... Y2017.
colnames = ['Area', 'Item', 'Element'] + ['Y{}'.format(year) for year in range(2002, 2018)]

# Keep only the 'Agricultural Use' rows, then strip the 'Y' so that the
# year columns can be cast to integers after melting.
Newdf = Fertdf.loc[Fertdf.Element == 'Agricultural Use', colnames]
Newdf = Newdf.rename(columns=lambda name: name.replace('Y', ''))

# Wide -> long: one row per (Area, Item, Element, Year).
Meltdf = pd.melt(Newdf, id_vars=['Area', 'Item', 'Element'], var_name='Year', value_name='Value')
Meltdf['Year'] = Meltdf['Year'].astype('int64')

colnitrogen = _nutrient_use(Meltdf, 'Nutrient nitrogen N (total)', 'NitrogenUse')
colpho = _nutrient_use(Meltdf, 'Nutrient phosphate P2O5 (total)', 'PhosphateUse')
colpot = _nutrient_use(Meltdf, 'Nutrient potash K2O (total)', 'PotashUse')

# Inner joins on the shared (Area, Year) index.
tmp = colnitrogen.merge(colpho, left_index=True, right_index=True)
mergeFert = tmp.merge(colpot, left_index=True, right_index=True)

In [None]:
# Load the long-format crop table and split it into a production frame and a
# harvested-area frame, each restricted to the years after 2001.
df = pd.read_csv('data/Production_Crops_E_All_Data_(Normalized).csv', sep=',', engine='python')

Proddf = (
    df.loc[df.Element == 'Production', ['Area', 'Item', 'Year', 'Value']]
    .rename(columns={'Value': 'Production'})
)
AreaHarvdf = (
    df.loc[df.Element == 'Area harvested', ['Area', 'Item', 'Year', 'Value']]
    .rename(columns={'Value': 'Harvested_Area'})
)

RecentProd = Proddf[Proddf.Year > 2001]
RecentAreaHarv = AreaHarvdf[AreaHarvdf.Year > 2001]

In [None]:
# Build `ratiodf`: production and harvested area per (Area, Item, Year), plus
# their ratio in the `Ratio_Prod_field` column.
RecentProdHier = RecentProd.set_index(['Area', 'Item', 'Year'])
RecentAreaHarvHier = RecentAreaHarv.set_index(['Area', 'Item', 'Year'])

ratiodf = (
    pd.merge(RecentProdHier, RecentAreaHarvHier, left_index=True, right_index=True)
    .reset_index()
    .dropna()
)

# A zero harvested area would yield an infinite ratio (or NaN for 0/0) that
# survives the dropna() above and distorts the later regressions, so those
# rows are removed before dividing.
ratiodf = ratiodf[ratiodf.Harvested_Area != 0].copy()
ratiodf['Ratio_Prod_field'] = ratiodf.Production / ratiodf.Harvested_Area

#### PLOTTING FUNCTIONS

In [None]:
def plotFertandProd(area,item,Fertilisant):
    """Scatter one item's production against the use of one nutrient for one area.

    One red point is drawn per year and labelled with that year. Data comes
    from the module-level frames `RecentProd` and `mergeFert`.

    Parameters
    ----------
    area : str
        Value matched against the `Area` column.
    item : str
        Value matched against `RecentProd.Item`.
    Fertilisant : str
        Nutrient code: 'N' (nitrogen), 'P' (phosphate) or 'K' (potash).

    Raises
    ------
    ValueError
        If `Fertilisant` is not one of 'N', 'P', 'K'.
    """
    # Nutrient code -> (column of mergeFert, x-axis label).
    nutrient_columns = {
        'N': ('NitrogenUse', 'Nitrogen use'),
        'P': ('PhosphateUse', 'Phosphate use'),
        'K': ('PotashUse', 'Potash use'),
    }
    # Validate up front: an unknown code used to surface much later as an
    # UnboundLocalError on `fer`.
    if Fertilisant not in nutrient_columns:
        raise ValueError(
            "Fertilisant must be one of 'N', 'P' or 'K', got %r" % (Fertilisant,)
        )
    fert_column, x_label = nutrient_columns[Fertilisant]
    title_ = Fertilisant + ' used in tonnes in ' + area

    # Merge production and nutrient use on (Area, Year).
    production = RecentProd[RecentProd.Item == item].set_index(['Area', 'Year'])
    merged = production.merge(mergeFert, left_index=True, right_index=True, how='outer')
    merged = merged.reset_index()

    # The outer merge leaves NaN where one side has no data for a year; such
    # rows cannot be drawn, so drop them instead of passing NaN coordinates on.
    plotdata = merged[merged.Area == area].dropna(subset=[fert_column, 'Production'])

    # Columns are addressed by name rather than by position so the plot does
    # not depend on the column order produced by the merge.
    plt.scatter(plotdata[fert_column], plotdata['Production'], color='red')
    for year, x, y in zip(plotdata['Year'], plotdata[fert_column], plotdata['Production']):
        plt.text(x + 1, y + 1, year, fontsize=7)
    plt.ylabel(item)
    plt.xlabel(x_label)
    plt.title(title_)
    plt.show()
    return

In [None]:
def densityplotFertvsProd(area,item):
    """Draw KDE joint plots of production against each nutrient for one area.

    Production rows for `item` (from `RecentProd`) are outer-merged with
    `mergeFert` on (Area, Year), filtered to `area`, and plotted three times:
    once per nutrient column.
    """
    item_production = RecentProd.loc[RecentProd.Item == item].set_index(['Area', 'Year'])
    combined = item_production.merge(
        mergeFert, how='outer', left_index=True, right_index=True
    ).reset_index()
    area_rows = combined.loc[combined.Area == area]

    for nutrient_column in ("NitrogenUse", "PhosphateUse", "PotashUse"):
        sns.jointplot(x=nutrient_column, y="Production", data=area_rows, kind="kde")

In [None]:
def densityplotFertvsProdAll(item):
    """Draw KDE joint plots of production against each nutrient over all areas.

    Same merge as the per-area variant but without an area filter. The shape
    of the merged frame is printed, and each plot uses fixed axis limits.
    """
    item_production = RecentProd.loc[RecentProd.Item == item].set_index(['Area', 'Year'])
    combined = item_production.merge(
        mergeFert, how='outer', left_index=True, right_index=True
    ).reset_index()
    print(combined.shape)

    # (nutrient column, x-axis limits); the y-axis limits are shared.
    panels = (
        ("NitrogenUse", [0, 2000000]),
        ("PhosphateUse", [0, 750000]),
        ("PotashUse", [0, 750000]),
    )
    for nutrient_column, x_limits in panels:
        sns.jointplot(
            x=nutrient_column,
            y="Production",
            data=combined,
            kind="kde",
            xlim=x_limits,
            ylim=[0, 15000000],
        )

In [None]:
def ScatterplotFertvsRatio(area,item):
    """Scatter the production/harvested-area ratio against each nutrient for one area.

    Rows of `ratiodf` for `item` are outer-merged with `mergeFert` on
    (Area, Year) and filtered to `area`; one figure is shown per nutrient.
    """
    item_ratio = ratiodf.loc[ratiodf.Item == item].set_index(['Area', 'Year'])
    combined = item_ratio.merge(
        mergeFert, how='outer', left_index=True, right_index=True
    ).reset_index()
    area_rows = combined.loc[combined.Area == area]

    for nutrient_column in ("NitrogenUse", "PhosphateUse", "PotashUse"):
        sns.scatterplot(x=nutrient_column, y="Ratio_Prod_field", data=area_rows)
        plt.show()

In [None]:
    def R2(x, y):
        """Return the squared Pearson correlation coefficient of `x` and `y`."""
        pearson_r, _p_value = stats.pearsonr(x, y)
        return pearson_r ** 2

In [None]:
def RegScatterplotFertvsRatio(area,item):
    """Regression joint plots of the production/harvested-area ratio vs each nutrient.

    Rows of `ratiodf` for `item` are outer-merged with `mergeFert` on
    (Area, Year) and filtered to `area`. One regression joint plot is shown
    per nutrient column, with `R2` passed as the statistic function.

    Parameters
    ----------
    area : str
        Value matched against the `Area` column.
    item : str
        Value matched against `ratiodf.Item`.
    """
    item_ratio = ratiodf[ratiodf.Item == item].set_index(['Area', 'Year'])
    merged = item_ratio.merge(mergeFert, left_index=True, right_index=True, how='outer')
    merged = merged.reset_index()

    # dropna() returns a new frame; the result used to be discarded, so the
    # NaN rows created by the outer merge were never actually removed here.
    plotdata = merged[merged.Area == area].dropna()

    # NOTE(review): `stat_func` was deprecated in seaborn 0.9 and is believed
    # to be rejected by newer releases -- confirm against the installed version.
    for nutrient_column in ("NitrogenUse", "PhosphateUse", "PotashUse"):
        sns.jointplot(x=nutrient_column, y="Ratio_Prod_field", data=plotdata,
                      kind='reg', stat_func=R2)
        plt.show()
    return

#### DATA EXPLORATION

Let's first look at how fertilizer use has evolved over time. Globally, we see a slow, continuous increase, as we could have expected.

In [None]:
# Nutrient use per year (one hue per nutrient Item) for three aggregate areas.
# The third title used to read "rich countries" although the data plotted is
# 'Land Locked Developing Countries'; it now names the area actually shown.
fert_overview_panels = [
    ('World', 'Fertilisant used in the worlds'),
    ('Least Developed Countries', 'Fertilisant used in the least developed countries'),
    ('Land Locked Developing Countries', 'Fertilisant used in the land locked developing countries'),
]
for panel_area, panel_title in fert_overview_panels:
    sns.scatterplot(x='Year', y='Value', hue='Item', data=Meltdf[Meltdf.Area == panel_area])
    plt.title(panel_title)
    plt.ylabel('Tonnes of Nutrient')
    plt.show()

To get an idea of the added value of fertilizers, let's start by looking at the global density of fertilizer use and wheat production in tonnes. We cannot see any kind of linear correlation between the quantity of fertilizer used and production. Indeed, the data is just concentrated in a rectangular region:

In [None]:
# Wheat production density against each nutrient, pooled over every area.
item = 'Wheat'
densityplotFertvsProdAll(item=item)

Let's have a closer look at the USA's wheat production; here we still see no evident correlation.

In [None]:
# USA wheat production against each of the three nutrients in turn.
area = "United States of America"
item = 'Wheat'
for Fertilisant in ('N', 'P', 'K'):
    plotFertandProd(area, item, Fertilisant)

In [None]:
# USA seed cotton production against each of the three nutrients in turn.
area = "United States of America"
item = 'Seed cotton'
for Fertilisant in ('N', 'P', 'K'):
    plotFertandProd(area, item, Fertilisant)

In contrast, the USA's seed cotton production shows more correlation:

In [None]:
# World seed cotton production against each of the three nutrients.
# Previously only the nitrogen plot used 'World'; the phosphate and potash
# plots switched back to the USA and merely repeated the figures of the
# preceding cell, which looks like a copy-paste slip.
area = 'World'
item = 'Seed cotton'
for Fertilisant in ('N', 'P', 'K'):
    plotFertandProd(area, item, Fertilisant)

One thing we have to be careful about is that the more crop area is harvested, the higher the production will be. So let's divide the production by the harvested area (in hectares) in order to reduce that bias.
Here, we plot the world production in tonnes divided by the harvested area in hectares against each of the 3 fertilizers used.
In this case, we see a more direct correlation between fertilizer use and production ($R^2 \approx 0.75$), which makes it difficult
to ask farmers to use less fertilizer:

In [None]:
# World seed cotton: production per harvested area regressed on each nutrient.
area, item = 'World', 'Seed cotton'
RegScatterplotFertvsRatio(area=area, item=item)

But if we look at wheat production in the USA, there is absolutely no correlation. One explanation is that once the field has sufficient resources, adding fertilizer won't increase production any more.
This means that the absence of correlation indirectly implies that too much fertilizer is being used:

In [None]:
# USA wheat: production per harvested area regressed on each nutrient.
area, item = 'United States of America', 'Wheat'
RegScatterplotFertvsRatio(area=area, item=item)

Looking at rich and other countries, we see that developing countries show a linear correlation between fertilizer use and production.
This may be because they cannot afford to saturate their fields with artificial nutrients:

In [None]:
# Least Developed Countries, wheat: ratio regressed on each nutrient.
area, item = 'Least Developed Countries', 'Wheat'
RegScatterplotFertvsRatio(area=area, item=item)

In [None]:
# Land Locked Developing Countries, wheat: ratio regressed on each nutrient.
area, item = 'Land Locked Developing Countries', 'Wheat'
RegScatterplotFertvsRatio(area=area, item=item)

#### Q5 Conclusion

There is no direct relation between time and the ratio of production to fertilizer use.
Fertilizer use increases slowly with time, consistent with the growing needs of a continuously rising population.

We can see that, worldwide, production is linearly correlated with fertilizer use, but this is not the case in richer countries. Indeed, rich countries seem to over-saturate their fields with artificial nutrients, while the other countries are more cautious with fertilization.

We can conclude that richer countries should reduce their fertilizer use in order to avoid pollution and waste. Conversely, some other countries should increase their fertilizer use in order to increase production.

#### To do next
For the final report, it would be nice to have more geographic, dynamic and aesthetic plots.