In [1]:
import pandas as pd
import numpy as np
from collections import Counter

# Load the raw produce data, discard the 'Row Number' column, keep only the
# six columns used in this notebook, and give three of them clearer names.
# Written as a single chain so re-running the cell always rebuilds `df` from the file.
df = (
    pd.read_csv('data/raw_data.csv')
    .drop(columns='Row Number')
    .loc[:, ['Area', 'Item', 'Year', 'Production', 'Import Quantity', 'Export Quantity']]
    .rename(columns={'Area': 'Country', 'Item': 'Produce', 'Production': 'Production Quantity'})
)

df.head()

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity
0,Armenia,Potatoes,2000,290260.0,390.0,300.0
1,Armenia,Potatoes,2001,363834.0,2290.0,0.0
2,Armenia,Potatoes,2002,374263.0,1918.0,0.0
3,Armenia,Potatoes,2003,507518.0,2853.0,7.0
4,Armenia,Potatoes,2004,576427.0,1553.0,0.0


In [2]:
# Read the raw countries dataset; its 'Name' column is the source of the
# country-name lookup that this cell builds further down.
countries = pd.read_csv(filepath_or_buffer='data/raw_countries.csv')

# if the country name specifies an optional prefix, add this prefix to the beginning of the country name
def add_prefix(name):
    """Move a trailing ', <prefix>' part of a country name to the front.

    A name that splits on ', ' into exactly two parts ("Base, Prefix") is
    returned as "Prefix Base". Any other name (no ', ' at all, or more than
    one) is returned as just the text before the first ', '.
    """
    parts = name.split(', ')
    if len(parts) != 2:
        return parts[0]
    # exactly two parts: swap their order and join with a single space
    return ' '.join(reversed(parts))

# Collapse the formatted names into a set: sets are hash-based, which suits the
# repeated membership checks done against this lookup later on.
countries = {add_prefix(raw_name) for raw_name in countries['Name']}

In [8]:
# special cases: name formatting so that it can be found in the country lookup
def format_country(name):
    """Rewrite a country name from the produce data so it matches the country lookup.

    Rules, applied in order (the first match wins):
      * names starting with 'Cz'  -> 'Czech Republic'   (Czechia)
      * names starting with 'Esw' -> 'Swaziland'        (Eswatini)
      * Cote d'Ivoire, in any encoding of the circumflex -> 'Ivory Coast'
      * 'Serbia and Montenegro'   -> 'Serbia'
      * 'X (former)'              -> 'X'
      * 'X (Prefix)'              -> 'Prefix X'
      * 'China'                   -> 'China'
      * 'China, H...' / 'China, M...' / 'China, T...'
                                  -> 'Hong Kong' / 'Macao' / 'Province of China Taiwan'
      * anything else is returned unchanged.

    Parameters
    ----------
    name : str
        Country/area name as it appears in the produce data.

    Returns
    -------
    str
        The name in the form used by the country lookup, or `name` itself when
        no rule applies.
    """
    if name[:2] == 'Cz':  # Czechia -> Czech Republic
        return 'Czech Republic'
    if name[:3] == 'Esw':  # Eswatini -> Swaziland
        return 'Swaziland'
    # Cote d'Ivoire -> Ivory Coast. The circumflexed 'o' may arrive as the
    # U+FFFD replacement character (the case originally handled here), intact,
    # or otherwise mangled, so also accept any "C...te d'Ivoire" spelling.  (NEEDS REVIEW)
    if name[:2] == 'C\ufffd' or (name.startswith('C') and name.endswith("te d'Ivoire")):
        return 'Ivory Coast'
    if name == 'Serbia and Montenegro':  # Montenegro declared independence from Serbia in 2006
        return 'Serbia'                  # categorizing temporarily as Serbia  (NEEDS REVIEW)

    components = name.split(' (')  # names carrying a qualifier in parentheses
    if len(components) == 2:
        base, qualifier = components
        if qualifier == 'former)':
            # e.g. 'Sudan (former)' -> 'Sudan' (South Sudan became independent in 2011;
            # categorizing former Sudan temporarily as Sudan).  (NEEDS REVIEW)
            # Returning the base name, rather than a hard-coded 'Sudan', keeps
            # other '(former)' areas from being mislabelled as Sudan.
            return base
        # 'X (Prefix)' -> 'Prefix X'; the slice drops the closing ')'
        return qualifier[:-1] + ' ' + base

    if name[:5] == 'China':
        if len(name) == 5:
            return 'China'
        # First letter after 'China, '. Slicing (not indexing) yields '' for a
        # too-short name instead of raising IndexError.
        region_initial = name[7:8]
        if region_initial == 'H':
            return 'Hong Kong'
        if region_initial == 'M':
            return 'Macao'
        if region_initial == 'T':
            return 'Province of China Taiwan'
        # Any other 'China...' variant (e.g. 'China, mainland') is left as-is so
        # it is not silently relabelled as Taiwan.  (NEEDS REVIEW)
    return name
    
# Normalise every country name (alternate translations/formats) so it can be
# matched against the lookup table.
df['Country'] = df['Country'].map(format_country)

# Show the distinct names that are still missing from the countries lookup.
set(df.loc[~df['Country'].isin(countries), 'Country'])

{'Africa',
 'Americas',
 'Asia',
 'Australia & New Zealand',
 'Caribbean',
 'Central America',
 'Central Asia',
 'Eastern Africa',
 'Eastern Asia',
 'Eastern Europe',
 'Europe',
 'European Union',
 'Land Locked Developing Countries',
 'Least Developed Countries',
 'Low Income Food Deficit Countries',
 'Melanesia',
 'Micronesia',
 'Middle Africa',
 'Net Food Importing Developing Countries',
 'Northern Africa',
 'Northern America',
 'Northern Europe',
 'Occupied Palestinian Territory',
 'Oceania',
 'Polynesia',
 'Small Island Developing States',
 'South America',
 'South-Eastern Asia',
 'Southern Africa',
 'Southern Asia',
 'Southern Europe',
 'Western Africa',
 'Western Asia',
 'Western Europe',
 'World'}

In [10]:
# Keep only the rows whose country is present in the lookup; this discards the
# unmatched names listed by the previous cell.
df = df.loc[df['Country'].isin(countries)]

In [14]:
# scenario 1: avocados in 2012
# (the variable keeps its existing spelling because a later cell refers to it)
avacados_2012 = df[df['Produce'].eq('Avocados') & df['Year'].eq(2012)]

# the ten rows with the largest avocado export quantity in 2012
avacados_2012.nlargest(n=10, columns='Export Quantity')

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity
24308,Mexico,Avocados,2012,1316104.0,923.0,494481.0
85979,Chile,Avocados,2012,160000.0,698.0,91527.0
38041,Peru,Avocados,2012,268525.0,0.0,83576.0
53491,Spain,Avocados,2012,76337.0,34060.0,65494.0
52140,South Africa,Avocados,2012,91603.0,1660.0,54502.0
68912,United States of America,Avocados,2012,238495.0,502546.0,29630.0
147899,France,Avocados,2012,980.0,94500.0,18073.0
31415,New Zealand,Avocados,2012,25500.0,1.0,11077.0
134976,Ecuador,Avocados,2012,28600.0,1770.0,6048.0
55643,Brazil,Avocados,2012,159903.0,0.0,4273.0


In [24]:
# the ten rows with the largest avocado import quantity in 2012
avacados_2012.nlargest(n=10, columns='Import Quantity')

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity
68912,United States of America,Avocados,2012,238495.0,502546.0,29630.0
147899,France,Avocados,2012,980.0,94500.0,18073.0
53491,Spain,Avocados,2012,76337.0,34060.0,65494.0
145253,El Salvador,Avocados,2012,5250.0,13754.0,3.0
89668,Costa Rica,Avocados,2012,1672.0,13731.0,95.0
1306,Australia,Avocados,2012,48951.0,9627.0,2300.0
155293,Argentina,Avocados,2012,4100.0,9179.0,117.0
156952,Honduras,Avocados,2012,1258.0,9013.0,34.0
26109,Morocco,Avocados,2012,54340.0,8817.0,1933.0
88404,Colombia,Avocados,2012,255384.0,6126.0,5.0


In [25]:
# Preview the first rows of the filtered frame rather than rendering the whole
# DataFrame as cell output.
df.head(10)

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity
0,Armenia,Potatoes,2000,290260.0,390.0,300.0
1,Armenia,Potatoes,2001,363834.0,2290.0,0.0
2,Armenia,Potatoes,2002,374263.0,1918.0,0.0
3,Armenia,Potatoes,2003,507518.0,2853.0,7.0
4,Armenia,Potatoes,2004,576427.0,1553.0,0.0
5,Armenia,Potatoes,2005,564211.0,915.0,302.0
6,Armenia,Potatoes,2006,539477.0,1705.0,0.0
7,Armenia,Potatoes,2007,583934.0,5794.0,0.0
8,Armenia,Potatoes,2008,648562.0,4268.0,484.0
9,Armenia,Potatoes,2009,593551.0,801.0,614.0
