In [1]:
import pandas as pd
import numpy as np
import json
from tqdm.auto import tqdm # progress bar

# Load the raw import-trade records.
df = pd.read_csv("../../data/Imp_trade_2016.csv")

# select relevant columns
df = df[["Index", "Reporter.Countries", "Partner.Countries", "Item", "Year", "1000 Head", "Head", "tonnes"]]

# keep only the rows that actually represent movement of goods
# (at least one of the three quantity columns is different from 0)
has_movement = (df["1000 Head"] != 0) | (df["Head"] != 0) | (df["tonnes"] != 0)

# discard rows that do not specify the from country
has_known_origin = df["Partner.Countries"] != "Unspecified Area"

# Apply both filters at once, then rename the columns for better understanding.
# `rename` without `inplace` returns a new frame, so `df` is no longer a view
# of the raw data and later column assignments do not trigger
# SettingWithCopyWarning.
df = df[has_movement & has_known_origin].rename(
    columns={"Reporter.Countries": "to_country", "Partner.Countries": "from_country"}
)

# switch some country names so they match the names used in the trade data
# NOTE: `value` is deliberately not passed. With a dict as `to_replace` the
# dict values are the replacements; an explicit `value=None` is interpreted by
# newer pandas versions as "replace with None", which would skip this mapping.
country_centroids = pd.read_csv("../../data/country_centroids.csv").replace({
    "Saint Helena": "Saint Helena, Ascension and Tristan da Cunha",
    "Palestinian Territories": "Occupied Palestinian Territory",
    "Vatican City": "Holy See",
    "Cocos [Keeling] Islands": "Cocos (Keeling) Islands",
    "Falkland Islands [Islas Malvinas]": "Falkland Islands (Malvinas)",
    "Micronesia": "Micronesia (Federated States of)",
    "French Southern Territories": "French Southern and Antarctic Territories",
    "Netherlands Antilles": "Netherlands Antilles (former)"
})

In [2]:
def getQuantityUnitsAndGeoCoordinates(row, centroids=None):
    """Build the ``[quantity, unit, coordinates]`` triple for one trade row.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys "1000 Head", "Head", "tonnes", "to_country"
        and "from_country".
    centroids : pandas.DataFrame, optional
        Lookup table with the columns "name", "longitude" and "latitude".
        When omitted, the notebook-level ``country_centroids`` frame is used.

    Returns
    -------
    list
        Always three elements, so that ``apply(..., result_type="expand")``
        yields exactly three columns:

        * quantity -- "1000 Head" * 1000 if that column is non-zero, else
          "Head" if non-zero, else "tonnes";
        * unit -- "Heads" or "Tonnes", matching the column that was used;
        * coordinates -- ``{"to_country": [lon, lat], "from_country": [lon, lat]}``
          or ``None`` when either country is absent from ``centroids``.
    """
    if centroids is None:
        # fall back to the frame loaded earlier in the notebook
        centroids = country_centroids

    # clean up the quantity and units measures
    if row["1000 Head"] != 0:
        info_list = [row["1000 Head"] * 1000, "Heads"]
    elif row["Head"] != 0:
        info_list = [row["Head"], "Heads"]
    else:
        info_list = [row["tonnes"], "Tonnes"]

    # add the longitude and latitude of each country
    coordinates = {}
    for role in ("to_country", "from_country"):
        matches = centroids[centroids["name"] == row[role]]
        if matches.empty:
            # report every missing country, not just the first one found;
            # str() keeps the message from crashing on a non-string name
            print("MISSING IN COUNTRY CENTROIDS CSV: " + str(row[role]))
            continue
        # take the first match explicitly: calling float() on a whole Series
        # is deprecated and fails when a name occurs more than once
        centroid = matches.iloc[0]
        coordinates[role] = [float(centroid["longitude"]), float(centroid["latitude"])]

    # keep the result length constant even when a lookup failed
    info_list.append(coordinates if len(coordinates) == 2 else None)

    return info_list

# register `progress_apply` on pandas objects so the row-wise pass shows a progress bar
tqdm.pandas()

# run the helper once per row; "expand" spreads each returned list over separate columns
expanded = df.progress_apply(getQuantityUnitsAndGeoCoordinates, axis=1, result_type="expand")
df[["Quantity", "Unit", "Coordinates"]] = expanded

  from pandas import Panel


HBox(children=(IntProgress(value=0, max=340432), HTML(value='')))




In [11]:
# drop the now unneeded columns of 1000 Head, Head and tonnes
# df.drop(columns=["1000 Head", "Head", "tonnes"], inplace=True)

In [13]:
# persist the processed trade table (row index included) as CSV
df.to_csv(path_or_buf="../../data/trade_imports.csv")