In [1]:
# *************************************************************************************
# *                          © 2019 Roshini Saravanakumar                             *
# *************************************************************************************

import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

# Load the raw data, keep only the columns used in this notebook, and rename three of
# them ('Area', 'Item', 'Production') to the names the later cells refer to.
# NOTE(review): dropping 'Row Number' looks redundant given the column selection that
# follows; it is kept so a file without that column still fails the same way.
df = (
    pd.read_csv('../data/raw_data.csv')
    .drop(columns='Row Number')
    .loc[:, ['Area', 'Item', 'Year', 'Production', 'Import Quantity', 'Export Quantity']]
    .rename(columns={'Area': 'Country', 'Item': 'Produce', 'Production': 'Production Quantity'})
)

df.head()

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity
0,Armenia,Potatoes,2000,290260.0,390.0,300.0
1,Armenia,Potatoes,2001,363834.0,2290.0,0.0
2,Armenia,Potatoes,2002,374263.0,1918.0,0.0
3,Armenia,Potatoes,2003,507518.0,2853.0,7.0
4,Armenia,Potatoes,2004,576427.0,1553.0,0.0


In [2]:
def calculateConsumption(entry):
    """Return production plus imports minus exports for one entry.

    `entry` is indexed by the keys 'Production Quantity', 'Import Quantity'
    and 'Export Quantity' (for example a DataFrame row).
    """
    available = entry['Production Quantity'] + entry['Import Quantity']
    return available - entry['Export Quantity']

# calculate the consumption for each entry: (production + import) - export.
# Plain column arithmetic follows the same formula as calculateConsumption but works on
# whole columns at once instead of making one Python call per row, and it also behaves
# sensibly on an empty frame.
df['Consumption Quantity'] = (
    df['Production Quantity'] + df['Import Quantity'] - df['Export Quantity']
)

In [3]:
def calculatePercentConsumed(entry):
    """Return 'Consumption Quantity' as a percentage of production plus imports.

    `entry` is indexed by the keys 'Production Quantity', 'Import Quantity'
    and 'Consumption Quantity'. When production plus imports is exactly 0 the
    result is 0 rather than a division by zero.
    """
    total_yield = entry['Production Quantity'] + entry['Import Quantity']
    if total_yield != 0:
        return (entry['Consumption Quantity'] / total_yield) * 100
    # no yield at all, so none of it could have been consumed
    return 0

# calculate the percentage of the total yield (production + import) for a crop that was consumed.
# Done on whole columns rather than one Python call per row; the zero-yield rule mirrors
# calculatePercentConsumed.
total_yield = df['Production Quantity'] + df['Import Quantity']
# .where keeps the computed percentage wherever the yield is not 0 (a missing yield stays
# NaN) and substitutes 0 where the yield is exactly 0, since nothing could have been consumed
df['Percent Consumed'] = (df['Consumption Quantity'] / total_yield * 100).where(total_yield != 0, 0)

In [4]:
def calculatePercentExported(entry):
    """Return 'Export Quantity' as a percentage of production plus imports.

    `entry` is indexed by the keys 'Production Quantity', 'Import Quantity'
    and 'Export Quantity'. When production plus imports is exactly 0 the
    result is 0 rather than a division by zero.
    """
    total_yield = entry['Production Quantity'] + entry['Import Quantity']
    if total_yield != 0:
        return (entry['Export Quantity'] / total_yield) * 100
    # no yield at all, so none of it could have been exported
    return 0

# calculate the percentage of the total yield (production + import) for a crop that was exported.
# Done on whole columns rather than one Python call per row; the zero-yield rule mirrors
# calculatePercentExported.
total_yield = df['Production Quantity'] + df['Import Quantity']
# .where keeps the computed percentage wherever the yield is not 0 (a missing yield stays
# NaN) and substitutes 0 where the yield is exactly 0, since nothing could have been exported
df['Percent Exported'] = (df['Export Quantity'] / total_yield * 100).where(total_yield != 0, 0)

In [5]:
# filter out entries that do not have complete info (more was exported than what was produced + imported).
# Both percentage columns are forced to 0 when production + import is 0, so a zero-yield entry
# that still reports exports would pass a percentage-only check even though its consumption is
# negative; testing 'Consumption Quantity' directly catches that case.
# Comparisons against NaN are False, so entries with missing quantities are dropped as well.
df = df[
    (df['Consumption Quantity'] >= 0)
    & (df['Percent Consumed'] >= 0)
    & (df['Percent Exported'] >= 0)
]
df.to_csv('../data/percent_consumption.csv')
df.sample(n=20)

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity,Consumption Quantity,Percent Consumed,Percent Exported
128574,Oceania,Currants,2015,9842.0,0.0,1.0,9841.0,99.989839,0.010161
155218,Argentina,Watermelons,2000,125528.0,9616.0,385.0,134759.0,99.715119,0.284881
98846,Northern Africa,Kiwi fruit,2013,32.0,20178.0,5.0,20205.0,99.97526,0.02474
103305,Americas,Maize,2004,399660932.0,16980828.0,65323967.0,351317793.0,84.321311,15.678689
66632,Belize,Coconuts,2007,673.0,6.0,0.0,679.0,100.0,0.0
62988,Tunisia,Asparagus,2010,227.0,0.0,114.0,113.0,49.779736,50.220264
124573,Southern Europe,"Walnuts, with shell",2008,88547.0,36835.0,2060.0,123322.0,98.357021,1.642979
71484,Viet Nam,"Onions, dry",2006,242888.0,47206.0,2560.0,287534.0,99.117527,0.882473
17026,Bahamas,Papayas,2006,0.0,71.0,0.0,71.0,100.0,0.0
46681,Bolivia (Plurinational State of),Bananas,2010,159681.0,0.0,79447.0,80234.0,50.246429,49.753571
