In [2]:
import requests, zipfile, io
import pandas as pd
import numpy as np
data_path = "~/CIS550/commodify/data/"

### Create the Political Entity table 

We will use the unique countries in the USDA agricultural data and a list of US state and territory names borrowed from gist.github.com

In [3]:
zip_url = "https://apps.fas.usda.gov/psdonline/downloads/psd_alldata_csv.zip"

r = requests.get(zip_url)
if r.ok:
  z = zipfile.ZipFile(io.BytesIO(r.content))
  usda_data = pd.read_csv(z.open('psd_alldata.csv'))

usda_data["Country_Name"].unique()

array(['Afghanistan', 'Algeria', 'Argentina', 'Australia', 'Brazil',
       'Canada', 'Chile', 'China', 'Colombia', 'European Union', 'Greece',
       'Hong Kong', 'India', 'Indonesia', 'Iran', 'Israel', 'Italy',
       'Japan', 'Jordan', 'Kazakhstan', 'Korea, South', 'Malaysia',
       'Mexico', 'Morocco', 'New Zealand', 'Norway', 'Pakistan',
       'Portugal', 'Russia', 'Saudi Arabia', 'South Africa', 'Spain',
       'Switzerland', 'Taiwan', 'Thailand', 'Tunisia', 'Turkey',
       'United Arab Emirates', 'United States', 'Vietnam', 'Armenia',
       'Austria', 'Azerbaijan', 'Belarus', 'Belgium-Luxembourg', 'Belize',
       'Bulgaria', 'Costa Rica', 'Czech Republic', 'Denmark',
       'Dominican Republic', 'Ecuador', 'Egypt', 'El Salvador', 'Estonia',
       'EU-15', 'Finland', 'Former Czechoslovakia', 'Former Yugoslavia',
       'France', 'Georgia', 'German Democratic Republic', 'Germany',
       'Germany, Federal Republic of', 'Guatemala', 'Haiti', 'Honduras',
       'Hungary', 'Ire

In [10]:
pol_ent = pd.DataFrame(usda_data["Country_Name"].unique(), columns = ["name"])
pol_ent["is_country"] = 1
pol_ent["abbrev"] = ''
pol_ent

Unnamed: 0,name,is_country,abbrev
0,Afghanistan,1,
1,Algeria,1,
2,Argentina,1,
3,Australia,1,
4,Brazil,1,
...,...,...,...
208,Seychelles,1,
209,St. Kitts and Nevis,1,
210,St. Lucia,1,
211,Tonga,1,


### Add in the US political entity names 

List of names taken from https://gist.github.com/rogerallen/1583593

In [13]:
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

states = np.array(list(us_state_abbrev.keys()))
abbrev = np.array(list(us_state_abbrev.values()))

us = pd.DataFrame(states, columns = ["name"])

us["abbrev"] = abbrev

us["is_country"] = 0

us

Unnamed: 0,name,abbrev,is_country
0,Alabama,AL,0
1,Alaska,AK,0
2,American Samoa,AS,0
3,Arizona,AZ,0
4,Arkansas,AR,0
5,California,CA,0
6,Colorado,CO,0
7,Connecticut,CT,0
8,Delaware,DE,0
9,District of Columbia,DC,0


In [14]:
pol_ent = pol_ent.append(us)
pol_ent

Unnamed: 0,name,is_country,abbrev
0,Afghanistan,1,
1,Algeria,1,
2,Argentina,1,
3,Australia,1,
4,Brazil,1,
...,...,...,...
51,Virginia,0,VA
52,Washington,0,WA
53,West Virginia,0,WV
54,Wisconsin,0,WI


### Now create a unique ID just in case of name collisions between country names and US states/territories

In [17]:
pol_ent = pol_ent.sort_values(by = "name")
pol_ent["id"] = np.arange(0, pol_ent.shape[0])
pol_ent[["id", "name", "is_country", "abbrev"]]

Unnamed: 0,id,name,is_country,abbrev
0,0,Afghanistan,1,
0,1,Alabama,0,AL
1,2,Alaska,0,AK
92,3,Albania,1,
1,4,Algeria,1,
...,...,...,...,...
172,264,Yemen (Aden),1,
125,265,Yemen (Sanaa),1,
90,266,Yugoslavia (>05/92),1,
110,267,Zambia,1,


In [18]:
pol_ent[["id", "name", "is_country", "abbrev"]].to_csv(data_path + "political_entity.csv", index = False)