In [8]:
# *************************************************************************************
# *                          © 2019 Roshini Saravanakumar                             *
# *************************************************************************************

import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

# Columns of the raw export that this analysis uses, and the clearer names
# they are given below.
kept_columns = ['Area', 'Item', 'Year', 'Production', 'Import Quantity', 'Export Quantity']
renamed_columns = {'Area': 'Country', 'Item':'Produce', 'Production':'Production Quantity'}

# Load the raw data, discard the row-number column, narrow to the columns of
# interest and rename them -- all as one chain, without in-place mutation.
df = (
    pd.read_csv('../data/raw_data.csv')
    .drop(columns='Row Number')[kept_columns]
    .rename(columns=renamed_columns)
)

# A row whose production and import quantities are both 0 carries no meaningful
# information, so keep only rows where their sum is strictly positive.
# (Rows where the sum is NaN fail the comparison and are dropped as well.)
total_supply = df['Production Quantity'] + df['Import Quantity']
df = df[total_supply > 0]
df.head()

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity
0,Armenia,Potatoes,2000,290260.0,390.0,300.0
1,Armenia,Potatoes,2001,363834.0,2290.0,0.0
2,Armenia,Potatoes,2002,374263.0,1918.0,0.0
3,Armenia,Potatoes,2003,507518.0,2853.0,7.0
4,Armenia,Potatoes,2004,576427.0,1553.0,0.0


In [9]:
def calculateConsumption(entry):
    """Return the consumed quantity: production plus imports, minus exports.

    `entry` is anything that can be indexed by the labels
    'Production Quantity', 'Import Quantity' and 'Export Quantity'
    (for example a single DataFrame row).
    """
    produced = entry['Production Quantity']
    imported = entry['Import Quantity']
    exported = entry['Export Quantity']
    return produced + imported - exported

# Consumption for each entry = (production + import) - export.
# calculateConsumption only indexes by column label and does arithmetic, so it
# can be given the whole frame: the columns are combined element-wise in one
# vectorized step instead of one Python-level call per row via apply(axis=1).
df['Consumption Quantity'] = calculateConsumption(df)

In [10]:
def calculatePercentConsumed(entry):
    """Return consumption as a percentage of total yield (production + import).

    `entry` must provide 'Consumption Quantity', 'Production Quantity' and
    'Import Quantity'; the consumption figure is read, not recomputed here.
    """
    total_yield = entry['Production Quantity'] + entry['Import Quantity']
    consumed_fraction = entry['Consumption Quantity'] / total_yield
    return consumed_fraction * 100

# Percentage of the total yield (production + import) of a crop that was consumed.
# calculatePercentConsumed only indexes by column label and does arithmetic, so
# passing the whole frame computes every row at once (vectorized) rather than
# invoking the function once per row through apply(axis=1).
df['Percent Consumed'] = calculatePercentConsumed(df)

In [11]:
def calculatePercentExported(entry):
    """Return exports as a percentage of total yield (production + import).

    `entry` must provide 'Export Quantity', 'Production Quantity' and
    'Import Quantity'.
    """
    total_yield = entry['Production Quantity'] + entry['Import Quantity']
    exported_fraction = entry['Export Quantity'] / total_yield
    return exported_fraction * 100

# Percentage of the total yield (production + import) of a crop that was exported.
# calculatePercentExported only indexes by column label and does arithmetic, so
# passing the whole frame computes every row at once (vectorized) rather than
# invoking the function once per row through apply(axis=1).
df['Percent Exported'] = calculatePercentExported(df)

In [12]:
# Filter out entries that do not have complete info: a negative percentage means
# more was exported than what was produced + imported. Any NaN percentage also
# fails the >= 0 comparison and is dropped.
df = df[(df['Percent Consumed'] >= 0) & (df['Percent Exported'] >= 0)]
df.to_csv('../data/percent_consumption.csv')

# Preview a random sample; the fixed seed keeps the displayed rows the same
# on every Restart & Run All.
df.sample(n=20, random_state=42)

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity,Consumption Quantity,Percent Consumed,Percent Exported
132870,European Union,Pistachios,2008,11009.0,148666.0,61558.0,98117.0,61.447941,38.552059
125160,Southern Europe,Bananas,2003,435335.0,1064916.0,196291.0,1303960.0,86.916123,13.083877
30842,New Zealand,Asparagus,2009,2200.0,13.0,367.0,1846.0,83.416177,16.583823
26805,Republic of Moldova,"Beans, dry",2012,5531.0,327.0,923.0,4935.0,84.243769,15.756231
112737,Central Asia,Barley,2003,2617723.0,8069.0,574641.0,2051151.0,78.115517,21.884483
16796,Latvia,Oats,2013,134200.0,6533.0,18800.0,121933.0,86.641371,13.358629
1079,Australia,Grapefruit (inc. pomelos),2013,8840.0,1792.0,230.0,10402.0,97.836719,2.163281
111709,Asia,Hops,2005,12557.0,8236.0,507.0,20286.0,97.561679,2.438321
123451,Northern Europe,Soybeans,2002,0.0,1718742.0,14321.0,1704421.0,99.166774,0.833226
57855,Syrian Arab Republic,"Vegetables, fresh nes",2000,76845.0,0.0,29174.0,47671.0,62.035266,37.964734


In [13]:
# Spot-check one country/crop pair: all rows for Vetches in Austria.
df.loc[df['Country'].eq('Austria') & df['Produce'].eq('Vetches')]

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity,Consumption Quantity,Percent Consumed,Percent Exported
9677,Austria,Vetches,2003,1181.0,0.0,0.0,1181.0,100.0,0.0
9678,Austria,Vetches,2004,1535.0,0.0,0.0,1535.0,100.0,0.0
9679,Austria,Vetches,2005,2142.0,0.0,0.0,2142.0,100.0,0.0
9680,Austria,Vetches,2006,1917.0,0.0,0.0,1917.0,100.0,0.0
9681,Austria,Vetches,2007,2210.0,0.0,0.0,2210.0,100.0,0.0
9682,Austria,Vetches,2008,2198.0,0.0,0.0,2198.0,100.0,0.0
9683,Austria,Vetches,2009,3185.0,0.0,0.0,3185.0,100.0,0.0
9684,Austria,Vetches,2010,4214.0,0.0,0.0,4214.0,100.0,0.0
9685,Austria,Vetches,2011,2903.0,0.0,0.0,2903.0,100.0,0.0
9686,Austria,Vetches,2012,2460.0,0.0,0.0,2460.0,100.0,0.0
