<a href="https://colab.research.google.com/github/stevegbrooks/commodify/blob/preprocessing/usda_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
import requests, zipfile, io
import pandas as pd
import numpy as np
out_path = "~/CIS550/commodify/data/"

## Process commodities data from USDA

In [2]:
zip_url = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip"

r = requests.get(zip_url)
if r.ok:
  z = zipfile.ZipFile(io.BytesIO(r.content))
  usda_data = pd.read_csv(z.open('psd_alldata.csv'))

usda_data.head(n=5)

Unnamed: 0,Commodity_Code,Commodity_Description,Country_Code,Country_Name,Market_Year,Calendar_Year,Month,Attribute_ID,Attribute_Description,Unit_ID,Unit_Description,Value
0,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,20,Beginning Stocks,21,(MT),0.0
1,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,125,Domestic Consumption,21,(MT),0.0
2,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,176,Ending Stocks,21,(MT),0.0
3,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,88,Exports,21,(MT),0.0
4,577400,"Almonds, Shelled Basis",AF,Afghanistan,2010,2018,10,57,Imports,21,(MT),0.0


### Deal with null values

In [3]:
usda_data.isnull().values.any()

True

It looks like only the `Country_Code` column has `NaN`. 


In [4]:
usda_data[usda_data.isnull().any(axis=1)]

Unnamed: 0,Commodity_Code,Commodity_Description,Country_Code,Country_Name,Market_Year,Calendar_Year,Month,Attribute_ID,Attribute_Description,Unit_ID,Unit_Description,Value
716727,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,20,Beginning Stocks,8,(1000 MT),0.0
716728,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,7,Crush,8,(1000 MT),0.0
716729,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,125,Domestic Consumption,8,(1000 MT),1.0
716730,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,176,Ending Stocks,8,(1000 MT),0.0
716731,813100,"Meal, Soybean",,Netherlands Antilles,1976,2006,6,88,Exports,8,(1000 MT),0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
1726314,612000,"Sugar, Centrifugal",,Netherlands Antilles,2021,2020,11,99,Refined Exp.(Raw Val),8,(1000 MT),0.0
1726315,612000,"Sugar, Centrifugal",,Netherlands Antilles,2021,2020,11,74,Refined Imp.(Raw Val),8,(1000 MT),0.0
1726316,612000,"Sugar, Centrifugal",,Netherlands Antilles,2021,2020,11,126,Total Disappearance,8,(1000 MT),0.0
1726317,612000,"Sugar, Centrifugal",,Netherlands Antilles,2021,2020,11,178,Total Distribution,8,(1000 MT),0.0


Lets check which values for `Country_Name` don't have a `Country_Code`

In [5]:
usda_data[usda_data.isnull().any(axis=1)]["Country_Name"].unique()

array(['Netherlands Antilles'], dtype=object)

Before gaining indepedence in 2010, these islands were part of the Netherlands, but now the group of islands consists of smaller countries. 

We can just set the country code of Netherlands Antilles to the Netherlands' country code.

In [6]:
usda_data[usda_data["Country_Name"] == "Netherlands"]["Country_Code"].unique()

array(['NL'], dtype=object)

In [9]:
usda_data.loc[usda_data.Country_Name == "Netherlands Antilles", 'Country_Code'] = "NL"

Check to make sure it worked.

In [10]:
usda_data.isnull().values.any()

False

## Reshape from long to wide

In [15]:
usda_pivot = usda_data.pivot(index = ["Commodity_Description", "Market_Year", "Month", "Country_Name"], columns = "Attribute_Description", values = "Value")
usda_pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Attribute_Description,Annual % Change Per Cap. Cons.,Annual % Change Prod. To Sows,Arabica Production,Area Harvested,Bean Exports,Bean Imports,Beef Cows Beg. Stocks,Beet Sugar Production,Beginning Stocks,Calf Slaughter,...,TY Imp. from U.S.,TY Imports,Total Disappearance,Total Distribution,Total Slaughter,Total Supply,Total Use,USE Dom. Consumption,Withdrawal From Market,Yield
Commodity_Description,Market_Year,Month,Country_Name,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
"Almonds, Shelled Basis",1960,0,Iran,,,,,,,,,0.0,,...,,,,0.0,,4.0,,,,
"Almonds, Shelled Basis",1960,0,Italy,,,,,,,,,0.0,,...,,,,0.0,,13.0,,,,
"Almonds, Shelled Basis",1960,0,Morocco,,,,,,,,,0.0,,...,,,,0.0,,2.0,,,,
"Almonds, Shelled Basis",1960,0,Portugal,,,,,,,,,0.0,,...,,,,0.0,,1.0,,,,
"Almonds, Shelled Basis",1960,0,Spain,,,,,,,,,0.0,,...,,,,0.0,,29.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Wheat,2020,3,Venezuela,,,,0.0,,,,,142.0,,...,0.0,1000.0,,1142.0,,1142.0,,,,0.00
Wheat,2020,3,Vietnam,,,,0.0,,,,,849.0,,...,0.0,3400.0,,4249.0,,4249.0,,,,0.00
Wheat,2020,3,Yemen,,,,95.0,,,,,564.0,,...,0.0,3800.0,,4502.0,,4502.0,,,,1.45
Wheat,2020,3,Zambia,,,,30.0,,,,,18.0,,...,0.0,50.0,,258.0,,258.0,,,,6.33


In [18]:
usda_pivot_reset = usda_pivot.reset_index(drop=False)
usda_pivot_reset

Attribute_Description,Commodity_Description,Market_Year,Month,Country_Name,Annual % Change Per Cap. Cons.,Annual % Change Prod. To Sows,Arabica Production,Area Harvested,Bean Exports,Bean Imports,...,TY Imp. from U.S.,TY Imports,Total Disappearance,Total Distribution,Total Slaughter,Total Supply,Total Use,USE Dom. Consumption,Withdrawal From Market,Yield
0,"Almonds, Shelled Basis",1960,0,Iran,,,,,,,...,,,,0.0,,4.0,,,,
1,"Almonds, Shelled Basis",1960,0,Italy,,,,,,,...,,,,0.0,,13.0,,,,
2,"Almonds, Shelled Basis",1960,0,Morocco,,,,,,,...,,,,0.0,,2.0,,,,
3,"Almonds, Shelled Basis",1960,0,Portugal,,,,,,,...,,,,0.0,,1.0,,,,
4,"Almonds, Shelled Basis",1960,0,Spain,,,,,,,...,,,,0.0,,29.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
146458,Wheat,2020,3,Venezuela,,,,0.0,,,...,0.0,1000.0,,1142.0,,1142.0,,,,0.00
146459,Wheat,2020,3,Vietnam,,,,0.0,,,...,0.0,3400.0,,4249.0,,4249.0,,,,0.00
146460,Wheat,2020,3,Yemen,,,,95.0,,,...,0.0,3800.0,,4502.0,,4502.0,,,,1.45
146461,Wheat,2020,3,Zambia,,,,30.0,,,...,0.0,50.0,,258.0,,258.0,,,,6.33


In [24]:
cols_to_keep = ["Commodity_Description", "Market_Year", "Month", "Country_Name",
           "Beginning Stocks", "Ending Stocks", "Imports", "Exports", 
           "Area Harvested", "Yield", "Production", "Domestic Consumption"]

output = usda_pivot_reset[cols_to_keep]

output.to_csv(out_path + "usda_data.csv", index = False)

output

Attribute_Description,Commodity_Description,Market_Year,Month,Country_Name,Beginning Stocks,Ending Stocks,Imports,Exports,Area Harvested,Yield,Production,Domestic Consumption
0,"Almonds, Shelled Basis",1960,0,Iran,0.0,0.0,0.0,0.0,,,4.0,0.0
1,"Almonds, Shelled Basis",1960,0,Italy,0.0,0.0,0.0,0.0,,,13.0,0.0
2,"Almonds, Shelled Basis",1960,0,Morocco,0.0,0.0,0.0,0.0,,,2.0,0.0
3,"Almonds, Shelled Basis",1960,0,Portugal,0.0,0.0,0.0,0.0,,,1.0,0.0
4,"Almonds, Shelled Basis",1960,0,Spain,0.0,0.0,0.0,0.0,,,29.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
146458,Wheat,2020,3,Venezuela,142.0,242.0,1000.0,0.0,0.0,0.00,0.0,900.0
146459,Wheat,2020,3,Vietnam,849.0,649.0,3400.0,250.0,0.0,0.00,0.0,3350.0
146460,Wheat,2020,3,Yemen,564.0,602.0,3800.0,0.0,95.0,1.45,138.0,3900.0
146461,Wheat,2020,3,Zambia,18.0,23.0,50.0,0.0,30.0,6.33,190.0,235.0


## Create "Political Entity" data by grabbing unique country names

In [21]:
output["Country_Name"].unique()

array(['Iran', 'Italy', 'Morocco', 'Portugal', 'Spain', 'United States',
       'Turkey', 'Greece', 'Tunisia', 'Thailand', 'Algeria', 'Argentina',
       'Australia', 'Brazil', 'Canada', 'Chile', 'China', 'Colombia',
       'European Union', 'Hong Kong', 'India', 'Indonesia', 'Israel',
       'Japan', 'Jordan', 'Kazakhstan', 'Korea, South', 'Malaysia',
       'Mexico', 'New Zealand', 'Norway', 'Pakistan', 'Russia',
       'Saudi Arabia', 'South Africa', 'Switzerland', 'Taiwan',
       'United Arab Emirates', 'Vietnam', 'Afghanistan', 'Austria',
       'Belgium-Luxembourg', 'Bulgaria', 'Costa Rica', 'Denmark',
       'Dominican Republic', 'Ecuador', 'El Salvador', 'Finland',
       'Former Czechoslovakia', 'Former Yugoslavia', 'France',
       'German Democratic Republic', 'Germany, Federal Republic of',
       'Guatemala', 'Honduras', 'Hungary', 'Ireland', 'Netherlands',
       'Nicaragua', 'Panama', 'Peru', 'Philippines', 'Poland', 'Romania',
       'Sweden', 'Union of Soviet Socialis

In [26]:
pol_ent = pd.DataFrame(output["Country_Name"].unique(), columns = ["name"])
pol_ent["is_country"] = 1
pol_ent

Unnamed: 0,name,is_country
0,Iran,1
1,Italy,1
2,Morocco,1
3,Portugal,1
4,Spain,1
...,...,...
208,Seychelles,1
209,St. Lucia,1
210,French Polynesia,1
211,Tonga,1


### Add in the US political entity names

In [39]:
us_names = np.array(["Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona", "California", "Colorado", "Connecticut", "District of Columbia", "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho", "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana", "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota", "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota", "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada", "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas", "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington", "Wisconsin", "West Virginia", "Wyoming"])

us = pd.DataFrame(us_names, columns = ["name"])

us["is_country"] = 0

us

Unnamed: 0,name,is_country
0,Alaska,0
1,Alabama,0
2,Arkansas,0
3,American Samoa,0
4,Arizona,0
5,California,0
6,Colorado,0
7,Connecticut,0
8,District of Columbia,0
9,Delaware,0


In [40]:
pol_ent.append(us)

Unnamed: 0,name,is_country
0,Iran,1
1,Italy,1
2,Morocco,1
3,Portugal,1
4,Spain,1
...,...,...
50,Vermont,0
51,Washington,0
52,Wisconsin,0
53,West Virginia,0
