In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pycountry_convert as pc
from sklearn import preprocessing
# Notebook-wide default canvas: every figure is 18 wide by 3 tall unless a cell overrides it.
sns.set(rc={"figure.figsize": (18, 3)})

In [3]:
# Render floats in fixed-point notation with five decimals instead of scientific notation.
# To undo this later, run: pd.reset_option('^display.', silent=True)
pd.set_option('display.float_format', '{:.5f}'.format)

# Read cleaned datasets
The datasets loaded here were cleaned beforehand in the DataProcessing notebook.

In [4]:
# Load the five cleaned CSV files (paths are relative to the notebook's working directory).
# The files are read in the order listed and bound, in that same order, to the names on the left.
raw, prices, caffeinePrices, rainfall, temperatures = (
    pd.read_csv(csv_path)
    for csv_path in (
        'datasets_cleaned/crops_cleaned.csv',            # -> raw
        'datasets_cleaned/producer_prices_cleaned.csv',  # -> prices
        'datasets_cleaned/caffeine_prices_clean.csv',    # -> caffeinePrices
        'datasets_cleaned/rainfall_clean.csv',           # -> rainfall
        'datasets_cleaned/temperatures_clean.csv',       # -> temperatures
    )
)

# Melting and merging datasets

In [9]:
# Long-format yield table: one row per (Year, Area) holding the mean 'Yield' over all
# rows of `raw` whose Element is 'Yield' for that area.
# Columns from position 3 onward are melted as the year columns (assumes the first three
# columns are identifiers -- TODO confirm against the cleaned CSV).
rawMeltedYield = (
    raw.loc[raw['Element'] == 'Yield']
    .melt(id_vars=['Area'], value_vars=raw.columns[3:], var_name='Year', value_name='Yield')
    .groupby(['Year', 'Area'])['Yield']
    .mean()
    .reset_index()
    # Normalise the key columns to plain, whitespace-free strings.
    .assign(
        Year=lambda frame: frame['Year'].astype(str),
        Area=lambda frame: frame['Area'].astype(str).str.strip(),
    )
)
rawMeltedYield

Unnamed: 0,Year,Area,Yield
0,1961,Afghanistan,40287.91667
1,1961,Africa,45553.50993
2,1961,Albania,15242.56944
3,1961,Algeria,31470.05556
4,1961,American Samoa,19846.12000
...,...,...,...
14843,2018,World,93636.97674
14844,2018,Yemen,60869.95455
14845,2018,Yugoslav SFR,0.00000
14846,2018,Zambia,82033.07692


In [10]:
# Long-format production table: one row per (Year, Area) holding the summed 'Production'
# over all rows of `raw` whose Element is 'Production' for that area.
# Columns from position 3 onward are melted as the year columns (assumes the first three
# columns are identifiers -- TODO confirm against the cleaned CSV).
rawMeltedProduction = (
    raw.loc[raw['Element'] == 'Production']
    .melt(id_vars=['Area'], value_vars=raw.columns[3:], var_name='Year', value_name='Production')
    .groupby(['Year', 'Area'])['Production']
    .sum()
    .reset_index()
    # Normalise the key columns to plain, whitespace-free strings.
    .assign(
        Year=lambda frame: frame['Year'].astype(str),
        Area=lambda frame: frame['Area'].astype(str).str.strip(),
    )
)
rawMeltedProduction

Unnamed: 0,Year,Area,Production
0,1961,Afghanistan,10113549.00000
1,1961,Africa,391312625.00000
2,1961,Albania,1457890.00000
3,1961,Algeria,8995374.00000
4,1961,American Samoa,25852.00000
...,...,...,...
15017,2018,World,19413566384.00000
15018,2018,Yemen,4986815.00000
15019,2018,Yugoslav SFR,0.00000
15020,2018,Zambia,19756945.00000


In [11]:
# Long-format area-harvested table: one row per (Year, Area) holding the summed
# 'Area Harvested' over all rows of `raw` whose Element is 'Area harvested' for that area.
# Columns from position 3 onward are melted as the year columns (assumes the first three
# columns are identifiers -- TODO confirm against the cleaned CSV).
areaHarvestedMelted = (
    raw.loc[raw['Element'] == 'Area harvested']  # boolean filter already yields a new frame; no full copy needed
    .melt(id_vars=['Area'], value_vars=raw.columns[3:], var_name='Year', value_name='Area Harvested')
    .groupby(['Year', 'Area'])['Area Harvested']
    .sum()
    .reset_index()
)
# Normalise the key columns exactly as the yield and production tables do (string dtype,
# no surrounding whitespace) so that the three tables agree on their ['Year', 'Area'] keys.
areaHarvestedMelted['Year'] = areaHarvestedMelted['Year'].astype(str)
areaHarvestedMelted['Area'] = areaHarvestedMelted['Area'].astype(str).str.strip()
areaHarvestedMelted

Unnamed: 0,Year,Area,Area Harvested
0,1961,Afghanistan,7959162.00000
1,1961,Africa,238329548.00000
2,1961,Albania,998260.00000
3,1961,Algeria,6841230.00000
4,1961,American Samoa,18425.00000
...,...,...,...
14901,2018,World,3456659059.00000
14902,2018,Yemen,1878576.00000
14903,2018,Yugoslav SFR,0.00000
14904,2018,Zambia,6094188.00000


In [5]:
# Reshape producer prices from wide to long: 'Item' is kept as the identifier and every
# column from position 9 onward becomes one row with its value in 'Price'
# (presumably those are the per-year price columns -- TODO confirm).
# No var_name is given, so the column holding the former column labels keeps pandas' default name.
prices_melted = prices.melt(id_vars=['Item'], value_vars=prices.columns[9:], value_name='Price')

In [13]:
# Merge the climate data (rainfall and temperature) into one table keyed by Year and Area.
#
# NOTE: `rainfall` and `temperatures` are re-bound here to their aggregated form
# (indexed by Year and Country), exactly as before, so later cells that use these
# names keep seeing the aggregated tables.
#
# The aggregations are named by string ('sum' / 'mean') rather than passed as
# np.sum / np.mean: recent pandas versions emit a FutureWarning for the NumPy
# callables, and the string form is the documented way to keep the same behaviour.
rainfall = pd.pivot_table(
    rainfall, values='Rainfall - (MM)', index=['Year', 'Country'], aggfunc='sum'
)  # total rainfall per (Year, Country)
temperatures = pd.pivot_table(
    temperatures, values='Temperature - (Celsius)', index=['Year', 'Country'], aggfunc='mean'
)  # average temperature per (Year, Country)

# Default (inner) merge: only (Year, Country) pairs present in both tables are kept.
# 'Country' is renamed to 'Area' to match the column name used by the crop tables.
rainfallAndTemperatures = (
    rainfall.merge(temperatures, on=['Year', 'Country'])
    .reset_index()
    .rename(columns={"Country": "Area"})
)

# Normalise the key columns to plain, whitespace-free strings.
rainfallAndTemperatures['Area'] = rainfallAndTemperatures['Area'].str.strip().astype(str)
rainfallAndTemperatures['Year'] = rainfallAndTemperatures['Year'].astype(str)

rainfallAndTemperatures.head()

Unnamed: 0,Year,Area,Rainfall - (MM),Temperature - (Celsius)
0,1991,Afghanistan,435.4499,12.89449
1,1991,Albania,917.8688,11.03765
2,1991,Algeria,88.13276,22.51346
3,1991,Andorra,757.1163,11.18815
4,1991,Angola,1017.9866,21.71688


# Dictionary for regions and areas

In [14]:
# Single source of truth for the aggregate (non-country) areas: each continent mapped to
# its sub-continental regions. Insertion order matters -- the lists below are derived from it.
subregions_by_continent = {
    'Africa': ['Eastern Africa', 'Middle Africa', 'Northern Africa', 'Southern Africa', 'Western Africa'],
    'Americas': ['Northern America', 'Central America', 'Caribbean', 'South America'],
    'Asia': ['Central Asia', 'Eastern Asia', 'Southern Asia', 'South-eastern Asia', 'Western Asia'],
    'Europe': ['Eastern Europe', 'Northern Europe', 'Southern Europe', 'Western Europe'],
    'Oceania': ['Australia and New Zealand', 'Melanesia', 'Micronesia', 'Polynesia'],
}

# The five continents, in mapping order.
continents = list(subregions_by_continent)

# Every sub-continental region, grouped by continent in mapping order.
subcon_regions = [
    subregion
    for subregions in subregions_by_continent.values()
    for subregion in subregions
]

# 'World' first, then each continent immediately followed by its own sub-regions.
regions = ['World'] + [
    name
    for continent, subregions in subregions_by_continent.items()
    for name in (continent, *subregions)
]

# For every aggregate area, the rows of `raw` whose 'Area' equals that name exactly
# (an empty frame if the name does not occur).
regionalDict = {}
for region in regions:
    regionalDict[region] = raw.loc[raw['Area'] == region]

In [None]:
# Exploratory Data Analysis