In [1]:
# *************************************************************************************
# *                          © 2019 Roshini Saravanakumar                             *
# *************************************************************************************

import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter

# Read the raw CSV, drop its 'Row Number' column, keep only the fields used
# below, and rename them to the names the rest of the notebook expects.
df = (
    pd.read_csv('data/raw_data.csv')
    .drop(columns='Row Number')
    .loc[:, ['Area', 'Item', 'Year', 'Production', 'Import Quantity', 'Export Quantity']]
    .rename(columns={'Area': 'Country', 'Item': 'Produce', 'Production': 'Production Quantity'})
)

# Show the first rows of the reshaped frame.
df.head()

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity
0,Armenia,Potatoes,2000,290260.0,390.0,300.0
1,Armenia,Potatoes,2001,363834.0,2290.0,0.0
2,Armenia,Potatoes,2002,374263.0,1918.0,0.0
3,Armenia,Potatoes,2003,507518.0,2853.0,7.0
4,Armenia,Potatoes,2004,576427.0,1553.0,0.0


In [6]:
def calculateConsumption(entry):
    """Return the consumed quantity for one record.

    Consumption is computed as production plus imports, minus exports.

    entry: a row-like object indexable by 'Production Quantity',
           'Import Quantity' and 'Export Quantity'.
    """
    produced = entry['Production Quantity']
    imported = entry['Import Quantity']
    exported = entry['Export Quantity']
    return produced + imported - exported

# Row by row, derive consumption as (production + import) - export.
df['Consumption Quantity'] = df.apply(calculateConsumption, axis='columns')

In [11]:
def calculatePercentConsumed(entry):
    """Return the record's consumption as a percentage of its total yield.

    Total yield is 'Production Quantity' + 'Import Quantity'. When that total
    is zero the function returns 0 instead of dividing by zero.

    entry: a row-like object indexable by 'Production Quantity',
           'Import Quantity' and 'Consumption Quantity'.
    """
    total_yield = entry['Production Quantity'] + entry['Import Quantity']

    # With nothing produced or imported, nothing could have been consumed.
    if total_yield == 0:
        return 0

    consumed_share = entry['Consumption Quantity'] / total_yield
    return consumed_share * 100

# Row by row, express consumption as a percentage of total yield (production + import).
df['Percent Consumed'] = df.apply(calculatePercentConsumed, axis='columns')

In [12]:
def calculatePercentExported(entry):
    """Return the record's exports as a percentage of its total yield.

    Total yield is 'Production Quantity' + 'Import Quantity'. When that total
    is zero the function returns 0 instead of dividing by zero.

    entry: a row-like object indexable by 'Production Quantity',
           'Import Quantity' and 'Export Quantity'.
    """
    total_yield = entry['Production Quantity'] + entry['Import Quantity']

    # With nothing produced or imported, nothing could have been exported.
    if total_yield == 0:
        return 0

    exported_share = entry['Export Quantity'] / total_yield
    return exported_share * 100

# Row by row, express exports as a percentage of total yield (production + import).
df['Percent Exported'] = df.apply(calculatePercentExported, axis='columns')

In [19]:
# Filter out entries that do not have complete info (more was exported than
# what was produced + imported, or a quantity is missing).
#
# The two percent columns alone are not enough: when production + import is 0
# both percentages are forced to 0, so a zero-yield row with a positive (or
# missing) export would pass the percentage checks. Requiring a non-negative
# 'Consumption Quantity' rejects those rows as well; comparisons against NaN
# evaluate to False, so rows with missing quantities are dropped too.
df = df[
    (df['Consumption Quantity'] >= 0)
    & (df['Percent Consumed'] >= 0)
    & (df['Percent Exported'] >= 0)
]

# Persist the cleaned table (the index is written as the first column).
df.to_csv('data/percents.csv')

# Preview a random selection of the remaining rows.
df.sample(n=20)

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity,Consumption Quantity,Percent Consumed,Percent Exported
74344,Belgium,Potatoes,2001,2564300.0,1065748.0,934834.0,2695214.0,74.247338,25.752662
114249,Eastern Asia,Grapefruit (inc. pomelos),2005,444280.0,228648.0,23793.0,649135.0,96.464258,3.535742
127381,Dominica,Cucumbers and gherkins,2008,1588.0,0.0,11.0,1577.0,99.307305,0.692695
158200,Hungary,Gooseberries,2009,1870.0,0.0,0.0,1870.0,100.0,0.0
27748,Namibia,"Vegetables, fresh nes",2011,18500.0,2700.0,90.0,21110.0,99.575472,0.424528
100987,Western Africa,Avocados,2014,43003.0,1268.0,912.0,43359.0,97.939961,2.060039
142498,Net Food Importing Developing Countries,Asparagus,2010,338032.0,276.0,125008.0,213300.0,63.049056,36.950944
154185,Guatemala,Strawberries,2008,10659.0,2.0,2942.0,7719.0,72.40409,27.59591
129043,Oceania,"Pyrethrum, dried",2008,1150.0,0.0,0.0,1150.0,100.0,0.0
112584,Central Asia,Eggplants (aubergines),2000,32500.0,213.0,1301.0,31412.0,96.022988,3.977012
