In [1]:
import requests, zipfile, io
import pandas as pd
import numpy as np
# Directory the generated CSV is written to. The trailing slash is required:
# the output file name is appended to this string by plain concatenation.
data_path = "~/CIS550/commodify/data/"

### Create the Political Entity table 

We will use the unique country names in the USDA agricultural data, plus a list of US state and territory names borrowed from gist.github.com.

In [2]:
zip_url = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip"

# Download the USDA bulk archive. Fail loudly on an HTTP error rather than
# silently skipping the load, which would leave `usda_data` undefined and
# surface further down as a confusing NameError. The timeout keeps the cell
# from hanging forever if the server stops responding.
r = requests.get(zip_url, timeout=60)
r.raise_for_status()

# The archive is unpacked entirely in memory; the context managers close the
# zip handle and the CSV member as soon as the frame has been read.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    with z.open('psd_alldata.csv') as csv_file:
        usda_data = pd.read_csv(csv_file)

# Distinct country names present in the data set.
usda_data["Country_Name"].unique()

array(['Afghanistan', 'Algeria', 'Argentina', 'Australia', 'Brazil',
       'Canada', 'Chile', 'China', 'Colombia', 'European Union', 'Greece',
       'Hong Kong', 'India', 'Indonesia', 'Iran', 'Israel', 'Italy',
       'Japan', 'Jordan', 'Kazakhstan', 'Korea, South', 'Malaysia',
       'Mexico', 'Morocco', 'New Zealand', 'Norway', 'Pakistan',
       'Portugal', 'Russia', 'Saudi Arabia', 'South Africa', 'Spain',
       'Switzerland', 'Taiwan', 'Thailand', 'Tunisia', 'Turkey',
       'United Arab Emirates', 'United States', 'Vietnam', 'Armenia',
       'Austria', 'Azerbaijan', 'Belarus', 'Belgium-Luxembourg', 'Belize',
       'Bulgaria', 'Costa Rica', 'Czech Republic', 'Denmark',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia',
       'EU-15', 'Finland', 'Former Czechoslovakia', 'Former Yugoslavia',
       'France', 'Georgia', 'German Democratic Republic', 'Germany',
       'Germany, Federal Republic of', 'Guatemala', 'Haiti', 'Honduras',
       'Hungary', 'Ire

In [8]:
# One row per distinct country name found in the USDA data; every row that
# comes from this source is flagged as a country (is_country = 1).
country_names = usda_data["Country_Name"].unique()
pol_ent = pd.DataFrame(country_names, columns=["name"]).assign(is_country=1)
pol_ent

Unnamed: 0,name,is_country
0,Afghanistan,1
1,Algeria,1
2,Argentina,1
3,Australia,1
4,Brazil,1
...,...,...
208,Seychelles,1
209,St. Kitts and Nevis,1
210,St. Lucia,1
211,Tonga,1


### Add in the US political entity names 

List of names taken from https://gist.github.com/norcal82/e4c7e8113f377db184bb

In [9]:
# US states, the District of Columbia and territories, in the order given
# by the source list.
us_names = np.array([
    "Alaska", "Alabama", "Arkansas", "American Samoa", "Arizona",
    "California", "Colorado", "Connecticut", "District of Columbia",
    "Delaware", "Florida", "Georgia", "Guam", "Hawaii", "Iowa", "Idaho",
    "Illinois", "Indiana", "Kansas", "Kentucky", "Louisiana",
    "Massachusetts", "Maryland", "Maine", "Michigan", "Minnesota",
    "Missouri", "Mississippi", "Montana", "North Carolina", "North Dakota",
    "Nebraska", "New Hampshire", "New Jersey", "New Mexico", "Nevada",
    "New York", "Ohio", "Oklahoma", "Oregon", "Pennsylvania", "Puerto Rico",
    "Rhode Island", "South Carolina", "South Dakota", "Tennessee", "Texas",
    "Utah", "Virginia", "Virgin Islands", "Vermont", "Washington",
    "Wisconsin", "West Virginia", "Wyoming",
])

# Same two-column layout as the country table; these rows are flagged as
# non-countries (is_country = 0).
us = pd.DataFrame(us_names, columns=["name"]).assign(is_country=0)

us

Unnamed: 0,name,is_country
0,Alaska,0
1,Alabama,0
2,Arkansas,0
3,American Samoa,0
4,Arizona,0
5,California,0
6,Colorado,0
7,Connecticut,0
8,District of Columbia,0
9,Delaware,0


In [10]:
# Stack the US rows underneath the country rows. DataFrame.append was
# deprecated in pandas 1.4 and removed in 2.0; pd.concat is the supported
# equivalent. Each frame keeps its own index labels, so labels repeat in the
# combined frame.
# NOTE: this cell reassigns `pol_ent`, so re-running it on its own stacks the
# US rows a second time -- re-run the cell that builds `pol_ent` first.
pol_ent = pd.concat([pol_ent, us])
pol_ent

Unnamed: 0,name,is_country
0,Afghanistan,1
1,Algeria,1
2,Argentina,1
3,Australia,1
4,Brazil,1
...,...,...
50,Vermont,0
51,Washington,0
52,Wisconsin,0
53,West Virginia,0


### Now create a unique ID, because names can collide between countries and US states/territories (e.g. Georgia)

In [11]:
# Alphabetise by name, then number the rows 0..n-1 to obtain a surrogate key.
# A stable sort is requested explicitly: the default quicksort does not
# guarantee the relative order of rows with equal names (e.g. "Georgia" the
# country and "Georgia" the state), so their ids could differ between runs.
# With a stable sort, tied rows keep the order they had before sorting.
pol_ent = pol_ent.sort_values(by="name", kind="mergesort")
pol_ent["id"] = np.arange(len(pol_ent))
pol_ent[["id", "name", "is_country"]]

Unnamed: 0,id,name,is_country
0,0,Afghanistan,1
1,1,Alabama,0
0,2,Alaska,0
92,3,Albania,1
1,4,Algeria,1
...,...,...,...
172,263,Yemen (Aden),1
125,264,Yemen (Sanaa),1
90,265,Yugoslavia (>05/92),1
110,266,Zambia,1


In [12]:
# Write the final table to disk: only the three listed columns, in this
# order, and without the DataFrame index.
pol_ent.to_csv(
    data_path + "political_entity.csv",
    columns=["id", "name", "is_country"],
    index=False,
)