In [1]:
import pandas as pd
import numpy as np
import json
    
# Load the flat import/export/production table, keep only the columns used
# by the rest of the notebook and give them more descriptive names.
kept_columns = ['Area', 'Item', 'Year', 'Production', 'Import Quantity', 'Export Quantity']
renamed_columns = {'Area': 'Country', 'Item': 'Produce', 'Production': 'Production Quantity'}

# drop() raises a KeyError if the file has no 'Row Number' column
df = pd.read_csv('../../data/imp_exp_prod_flat_all.csv').drop(columns='Row Number')
df = df[kept_columns].rename(columns=renamed_columns)
df.head()

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity
0,Armenia,Potatoes,2000,290260.0,390.0,300.0
1,Armenia,Potatoes,2001,363834.0,2290.0,0.0
2,Armenia,Potatoes,2002,374263.0,1918.0,0.0
3,Armenia,Potatoes,2003,507518.0,2853.0,7.0
4,Armenia,Potatoes,2004,576427.0,1553.0,0.0


In [2]:
# Build the lookup table of countries and their centroid locations, indexed by
# country name, and export it for later use.
# NOTE(review): read_csv treats the literal string 'NA' as a missing value by
# default; if the 'code' column holds ISO country codes, Namibia's code would
# be read as NaN -- confirm against the input file.
lookup = pd.read_csv('../../data/country_centroids.csv').rename(columns={'country': 'code'})
lookup.index = lookup['name']
# the name is already the index, so the duplicate column is left out of the export
lookup.drop(columns=['name']).to_csv('../../data/countries_lookup.csv')


# FIXME --------------------------------------------------------------------------
# ********************************************************************************
# Assuming that China is the sum of mainland, Taiwan, Hong Kong and Macao
# 'Serbia and Montenegro' - set coordinates to the average of Serbia and Montenegro
# 'South Sudan' and 'Sudan (former)' - set coordinates to Sudan's coordinates

# show the distinct 'Country' values that have no matching name in the lookup table
not_in_lookup = ~df['Country'].isin(lookup['name'])
set(df['Country'][not_in_lookup])

{'Africa',
 'Americas',
 'Asia',
 'Australia & New Zealand',
 'Caribbean',
 'Central America',
 'Central Asia',
 'China',
 'Eastern Africa',
 'Eastern Asia',
 'Eastern Europe',
 'Europe',
 'European Union',
 'Land Locked Developing Countries',
 'Least Developed Countries',
 'Low Income Food Deficit Countries',
 'Melanesia',
 'Middle Africa',
 'Net Food Importing Developing Countries',
 'Northern Africa',
 'Northern America',
 'Northern Europe',
 'Occupied Palestinian Territory',
 'Oceania',
 'Polynesia',
 'Small Island Developing States',
 'South America',
 'South-Eastern Asia',
 'Southern Africa',
 'Southern Asia',
 'Southern Europe',
 'Western Africa',
 'Western Asia',
 'Western Europe',
 'World'}

In [3]:
# Keep only the rows whose 'Country' has an entry in the lookup table; this
# removes the aggregate regions (continents, economic groups, 'World', ...)
# that have no centroid.
df = df[df['Country'].isin(lookup.index)]

# Show up to 10 random rows as an example of what the dataframe contains.
# The fixed seed makes the displayed sample identical on every
# Restart & Run All; capping n avoids a ValueError when fewer than 10 rows remain.
df.sample(n=min(10, len(df)), random_state=0)

Unnamed: 0,Country,Produce,Year,Production Quantity,Import Quantity,Export Quantity
64897,Turkey,"Tobacco, unmanufactured",2000,200280.0,62195.0,100388.0
77632,Montenegro,Apples,2008,1797.0,6655.0,2.0
57174,Switzerland,Currants,2013,529.0,59.0,0.0
145010,El Salvador,"Pumpkins, squash and gourds",2007,7200.0,0.0,0.0
148550,Djibouti,"Beans, dry",2010,1600.0,7027.0,1025.0
65728,Uganda,"Cocoa, beans",2004,4500.0,0.0,5154.0
64225,Turkey,Garlic,2003,125000.0,3270.0,146.0
33760,Norway,"Beans, green",2002,710.0,180.0,0.0
3479,Indonesia,"Rubber, natural",2001,1607460.0,6334.0,10375.0
55213,Brazil,Sunflower seed,2012,123646.0,7768.0,6.0


In [4]:
# Calculate the top 10 producers, importers and exporters for each produce in each year.
# Resulting structure:
#   {produce: {'available_years': [year, ...],
#              'top10_per_year': {year: {'top10_producers': {'countries': [...], 'quantities': [...]},
#                                        'top10_importers': {'countries': [...], 'quantities': [...]},
#                                        'top10_exporters': {'countries': [...], 'quantities': [...]}}}}}
def _top10(rows, quantity_column):
    """Return the 10 rows of `rows` with the largest `quantity_column`.

    The ranking is computed once and both result lists are read from it.

    Returns a dict with two parallel lists: 'countries' (values of the
    'Country' column) and 'quantities' (values of `quantity_column`),
    both ordered from largest to smallest quantity.
    """
    ranked = rows.nlargest(10, quantity_column)
    return {
        'countries': list(ranked['Country']),
        'quantities': list(ranked[quantity_column])
    }


top10_statistics = {}
# sorted() fixes the produce order: iterating a bare set of strings changes between
# kernel starts, which made the key order of the dumped JSON non-reproducible.
# Missing produce names are dropped first; they cannot be sorted together with
# strings and never match any row anyway.
for produce in sorted(set(df['Produce'].dropna())):
    # filter by produce once instead of re-scanning the whole dataframe for every year
    produceRows = df[df['Produce'] == produce]
    years = sorted(set(produceRows['Year']))
    yearStatistics = {}
    yearStatistics['available_years'] = years
    individualYears = {}

    for year in years:
        selected = produceRows[produceRows['Year'] == year]
        individualYears[year] = {
            'top10_producers': _top10(selected, 'Production Quantity'),
            'top10_importers': _top10(selected, 'Import Quantity'),
            'top10_exporters': _top10(selected, 'Export Quantity')
        }

    yearStatistics['top10_per_year'] = individualYears
    top10_statistics[produce] = yearStatistics

In [5]:
# dump top 10 statistics to a json file
# the context manager flushes and closes the file as soon as the dump finishes,
# instead of leaving the handle to be closed whenever it is garbage collected
with open('../../data/top10_imp_exp_prod.json', 'w') as json_file:
    json.dump(top10_statistics, json_file)