<a href="https://colab.research.google.com/github/stevegbrooks/commodify/blob/preprocessing/usda_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests, zipfile, io
import pandas as pd
import numpy as np
# Directory prefix for local data files: political_entity.csv is read from it
# and commodity.csv is written to it later in this notebook. It is joined with
# file names by plain string concatenation, so the trailing slash is required.
# NOTE(review): hardcoded home-relative path — adjust for the machine (or Colab
# session) the notebook runs on.
out_path = "~/CIS550/commodify/data/"

## Process commodities data from USDA

In [2]:
# Download the USDA PSD bulk archive and load the single CSV it contains.
zip_url = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip"

# A timeout keeps the cell from hanging indefinitely on a stalled connection.
r = requests.get(zip_url, timeout=120)
# Fail loudly on an HTTP error. Skipping the load silently would leave
# `usda_data` undefined (or stale from an earlier run) for every cell below.
r.raise_for_status()

# Context managers close the archive and the member file once parsing is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
  with z.open('psd_alldata.csv') as csv_file:
    # NOTE(review): pandas' default NA handling reads the literal string "NA"
    # as missing. If any Country_Code in the raw file is literally "NA", it
    # will show up as NaN here — confirm against the raw CSV.
    usda_data = pd.read_csv(csv_file)

usda_data.head(n=5)

Unnamed: 0,Commodity_Code,Commodity_Description,Country_Code,Country_Name,Market_Year,Calendar_Year,Month,Attribute_ID,Attribute_Description,Unit_ID,Unit_Description,Value
0,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,20,Beginning Stocks,21,(MT),0.0
1,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,125,Domestic Consumption,21,(MT),0.0
2,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,176,Ending Stocks,21,(MT),0.0
3,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,88,Exports,21,(MT),0.0
4,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,57,Imports,21,(MT),0.0


### Deal with null values

In [3]:
# True when at least one cell anywhere in the frame is missing.
usda_data.isna().to_numpy().any()

True

It looks like only the `Country_Code` column has `NaN`. 


In [4]:
# Show every row that has a missing value in at least one column.
rows_with_nulls = usda_data.isna().any(axis="columns")
usda_data[rows_with_nulls]

Unnamed: 0,Commodity_Code,Commodity_Description,Country_Code,Country_Name,Market_Year,Calendar_Year,Month,Attribute_ID,Attribute_Description,Unit_ID,Unit_Description,Value
716727,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,20,Beginning Stocks,8,(1000 MT),0.0
716728,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,7,Crush,8,(1000 MT),0.0
716729,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,125,Domestic Consumption,8,(1000 MT),1.0
716730,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,176,Ending Stocks,8,(1000 MT),0.0
716731,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,88,Exports,8,(1000 MT),0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1726314,612000,"Sugar, Centrifugal",,Netherlands Antilles,2021,2020,11,99,Refined Exp.(Raw Val),8,(1000 MT),0.0
1726315,612000,"Sugar, Centrifugal",,Netherlands Antilles,2021,2020,11,74,Refined Imp.(Raw Val),8,(1000 MT),0.0
1726316,612000,"Sugar, Centrifugal",,Netherlands Antilles,2021,2020,11,126,Total Disappearance,8,(1000 MT),0.0
1726317,612000,"Sugar, Centrifugal",,Netherlands Antilles,2021,2020,11,178,Total Distribution,8,(1000 MT),0.0


Let's check which values of `Country_Name` don't have a `Country_Code`.

In [5]:
# Distinct country names among the rows that contain any missing value.
has_missing = usda_data.isna().any(axis="columns")
usda_data.loc[has_missing, "Country_Name"].unique()

array(['Netherlands Antilles'], dtype=object)

Before gaining independence in 2010, these islands were part of the Netherlands; the group of islands now consists of several smaller countries.

We can just set the country code of Netherlands Antilles to the Netherlands' country code.

In [6]:
# Look up the distinct country code(s) recorded for the Netherlands.
is_netherlands = usda_data["Country_Name"].eq("Netherlands")
usda_data.loc[is_netherlands, "Country_Code"].unique()

array(['NL'], dtype=object)

In [7]:
# Assign the Netherlands' code ("NL") to every Netherlands Antilles row.
# This modifies `usda_data` in place.
antilles_rows = usda_data["Country_Name"] == "Netherlands Antilles"
usda_data.loc[antilles_rows, "Country_Code"] = "NL"

Check to make sure it worked. This should return `False`.

In [8]:
# Re-run the missing-value check after filling in the country code.
usda_data.isna().to_numpy().any()

False

## Reshape from long to wide

In [9]:
# Long -> wide: one row per (commodity, market year, month, country) and one
# column per attribute, filled from the `Value` column.
row_keys = ["Commodity_Description", "Market_Year", "Month", "Country_Name"]
usda_pivot = usda_data.pivot(
    index=row_keys,
    columns="Attribute_Description",
    values="Value",
)
usda_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Attribute_Description,Annual % Change Per Cap. Cons.,Annual % Change Prod. To Sows,Arabica Production,Area Harvested,Bean Exports,Bean Imports,Beef Cows Beg. Stocks,Beet Sugar Production,Beginning Stocks,Calf Slaughter,...,TY Imp. from U.S.,TY Imports,Total Disappearance,Total Distribution,Total Slaughter,Total Supply,Total Use,USE Dom. Consumption,Withdrawal From Market,Yield
Commodity_Description,Market_Year,Month,Country_Name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
"Almonds, Shelled Basis",1960,0,Iran,,,,,,,,,0.0,,...,,,,0.0,,4.0,,,,
"Almonds, Shelled Basis",1960,0,Italy,,,,,,,,,0.0,,...,,,,0.0,,13.0,,,,
"Almonds, Shelled Basis",1960,0,Morocco,,,,,,,,,0.0,,...,,,,0.0,,2.0,,,,
"Almonds, Shelled Basis",1960,0,Portugal,,,,,,,,,0.0,,...,,,,0.0,,1.0,,,,
"Almonds, Shelled Basis",1960,0,Spain,,,,,,,,,0.0,,...,,,,0.0,,29.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wheat,2020,3,Venezuela,,,,0.0,,,,,142.0,,...,0.0,1000.0,,1142.0,,1142.0,,,,0.00
Wheat,2020,3,Vietnam,,,,0.0,,,,,849.0,,...,0.0,3400.0,,4249.0,,4249.0,,,,0.00
Wheat,2020,3,Yemen,,,,95.0,,,,,564.0,,...,0.0,3800.0,,4502.0,,4502.0,,,,1.45
Wheat,2020,3,Zambia,,,,30.0,,,,,18.0,,...,0.0,50.0,,258.0,,258.0,,,,6.33


In [10]:
# Turn the four index levels back into ordinary columns.
# reset_index keeps them as columns by default.
usda_pivot_reset = usda_pivot.reset_index()
usda_pivot_reset

Attribute_Description,Commodity_Description,Market_Year,Month,Country_Name,Annual % Change Per Cap. Cons.,Annual % Change Prod. To Sows,Arabica Production,Area Harvested,Bean Exports,Bean Imports,...,TY Imp. from U.S.,TY Imports,Total Disappearance,Total Distribution,Total Slaughter,Total Supply,Total Use,USE Dom. Consumption,Withdrawal From Market,Yield
0,"Almonds, Shelled Basis",1960,0,Iran,,,,,,,...,,,,0.0,,4.0,,,,
1,"Almonds, Shelled Basis",1960,0,Italy,,,,,,,...,,,,0.0,,13.0,,,,
2,"Almonds, Shelled Basis",1960,0,Morocco,,,,,,,...,,,,0.0,,2.0,,,,
3,"Almonds, Shelled Basis",1960,0,Portugal,,,,,,,...,,,,0.0,,1.0,,,,
4,"Almonds, Shelled Basis",1960,0,Spain,,,,,,,...,,,,0.0,,29.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146458,Wheat,2020,3,Venezuela,,,,0.0,,,...,0.0,1000.0,,1142.0,,1142.0,,,,0.00
146459,Wheat,2020,3,Vietnam,,,,0.0,,,...,0.0,3400.0,,4249.0,,4249.0,,,,0.00
146460,Wheat,2020,3,Yemen,,,,95.0,,,...,0.0,3800.0,,4502.0,,4502.0,,,,1.45
146461,Wheat,2020,3,Zambia,,,,30.0,,,...,0.0,50.0,,258.0,,258.0,,,,6.33


In [11]:
# Identifier columns followed by the attribute columns retained for the output.
key_columns = ["Commodity_Description", "Market_Year", "Month", "Country_Name"]
measure_columns = [
    "Beginning Stocks", "Ending Stocks", "Imports", "Exports",
    "Area Harvested", "Yield", "Production", "Domestic Consumption",
]
cols_to_keep = key_columns + measure_columns

output = usda_pivot_reset[cols_to_keep]

output

Attribute_Description,Commodity_Description,Market_Year,Month,Country_Name,Beginning Stocks,Ending Stocks,Imports,Exports,Area Harvested,Yield,Production,Domestic Consumption
0,"Almonds, Shelled Basis",1960,0,Iran,0.0,0.0,0.0,0.0,,,4.0,0.0
1,"Almonds, Shelled Basis",1960,0,Italy,0.0,0.0,0.0,0.0,,,13.0,0.0
2,"Almonds, Shelled Basis",1960,0,Morocco,0.0,0.0,0.0,0.0,,,2.0,0.0
3,"Almonds, Shelled Basis",1960,0,Portugal,0.0,0.0,0.0,0.0,,,1.0,0.0
4,"Almonds, Shelled Basis",1960,0,Spain,0.0,0.0,0.0,0.0,,,29.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
146458,Wheat,2020,3,Venezuela,142.0,242.0,1000.0,0.0,0.0,0.00,0.0,900.0
146459,Wheat,2020,3,Vietnam,849.0,649.0,3400.0,250.0,0.0,0.00,0.0,3350.0
146460,Wheat,2020,3,Yemen,564.0,602.0,3800.0,0.0,95.0,1.45,138.0,3900.0
146461,Wheat,2020,3,Zambia,18.0,23.0,50.0,0.0,30.0,6.33,190.0,235.0


### The last step is to swap the `Country_Name` column for the matching political entity IDs.

In [12]:
# Load the political-entity lookup, keep only the rows flagged as countries,
# and rename its `name` column so it can be joined on `Country_Name`.
pol_ent = pd.read_csv(out_path + "political_entity.csv")
pol_ent = pol_ent[pol_ent["is_country"] == 1]
pol_ent = pol_ent.rename(columns={"name" : "Country_Name"})

# The merge below is an inner join. It silently drops USDA rows whose country
# name has no match in the lookup, and repeats USDA rows when a name occurs
# more than once in the lookup. Report both so data loss or duplication is
# visible. The merge result itself is unchanged.
unmatched_names = sorted(
    set(usda_pivot_reset["Country_Name"]) - set(pol_ent["Country_Name"])
)
if unmatched_names:
  print(f"WARNING: {len(unmatched_names)} USDA country name(s) have no matching "
        f"political entity and will be dropped: {unmatched_names}")

entity_names = pol_ent["Country_Name"].dropna()
duplicated_names = sorted(entity_names[entity_names.duplicated()].unique())
if duplicated_names:
  print(f"WARNING: {len(duplicated_names)} country name(s) appear more than once "
        f"in political_entity.csv and will duplicate rows: {duplicated_names}")

output = usda_pivot_reset.merge(pol_ent[["Country_Name", "id"]], on = "Country_Name")

# Same attribute columns as before, with the entity `id` replacing the name.
cols_to_keep = ["Commodity_Description", "Market_Year", "Month", "id",
                "Beginning Stocks", "Ending Stocks", "Imports", "Exports", 
                "Area Harvested", "Yield", "Production", "Domestic Consumption"]

output = output[cols_to_keep]
output

Unnamed: 0,Commodity_Description,Market_Year,Month,id,Beginning Stocks,Ending Stocks,Imports,Exports,Area Harvested,Yield,Production,Domestic Consumption
0,"Almonds, Shelled Basis",1960,0,114,0.0,0.0,0.0,0.0,,,4.0,0.0
1,"Almonds, Shelled Basis",1961,0,114,0.0,0.0,0.0,0.0,,,8.0,0.0
2,"Almonds, Shelled Basis",1962,0,114,0.0,0.0,0.0,0.0,,,7.0,0.0
3,"Almonds, Shelled Basis",1964,0,114,0.0,1300.0,0.0,0.0,,,1300.0,0.0
4,"Almonds, Shelled Basis",1965,0,114,1300.0,1500.0,0.0,3500.0,,,7000.0,3300.0
...,...,...,...,...,...,...,...,...,...,...,...,...
146458,"Sugar, Centrifugal",2019,11,237,0.0,0.0,0.0,0.0,,,0.0,
146459,"Sugar, Centrifugal",2020,11,237,0.0,0.0,0.0,0.0,,,0.0,
146460,"Sugar, Centrifugal",2021,11,237,0.0,0.0,0.0,0.0,,,0.0,
146461,"Sugar, Centrifugal",2005,5,62,4699.0,5339.0,2549.0,6028.0,,,21648.0,


In [14]:
# Summary statistics for the numeric columns of the merged output, as a sanity
# check on counts and value ranges before writing the file.
output.describe()

Unnamed: 0,Market_Year,Month,id,Beginning Stocks,Ending Stocks,Imports,Exports,Area Harvested,Yield,Production,Domestic Consumption
count,146463.0,146463.0,146463.0,132707.0,132707.0,146462.0,146462.0,57430.0,60744.0,146462.0,127539.0
mean,1992.579846,5.250425,131.844616,1270.585983,1288.692285,2696.456,3357.661,944.137803,37.546946,27935.65,26058.26
std,16.565273,3.413278,79.509353,13430.282856,13572.563606,27347.42,38604.36,3631.432589,157.843891,500072.1,507943.9
min,1960.0,0.0,0.0,-1135.0,-1135.0,0.0,0.0,0.0,0.0,0.0,-210.0
25%,1979.0,3.0,57.0,0.0,0.0,0.0,0.0,2.0,0.23,2.0,22.0
50%,1993.0,6.0,123.0,0.0,0.0,8.0,0.0,54.0,1.14,55.0,106.0
75%,2007.0,7.0,206.0,42.0,44.0,99.0,24.0,335.0,2.5,500.0,636.0
max,2021.0,12.0,267.0,584342.0,584342.0,1382800.0,1833000.0,70205.0,2443.0,42425000.0,41487100.0


In [19]:
# Spot-check a single row, selected by integer position.
# NOTE(review): 5386 is a positional index into the merged frame, so the row it
# picks depends on the merge's row order. When last run it showed a Cotton
# record with Yield 996.0 — confirm this is still the intended row.
output.iloc[[5386]]

Unnamed: 0,Commodity_Description,Market_Year,Month,id,Beginning Stocks,Ending Stocks,Imports,Exports,Area Harvested,Yield,Production,Domestic Consumption
5386,Cotton,1983,6,221,82.0,122.0,368.0,13.0,40.0,996.0,183.0,498.0


## Write the data set to CSV

In [None]:
# Write the final table without the DataFrame index column.
commodity_csv = out_path + "commodity.csv"
output.to_csv(commodity_csv, index=False)