In [1]:
import time
import requests
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup

In [2]:
pd.set_option("display.max_columns", 2000)
pd.set_option("display.max_rows", 2000)

In [3]:
df = pd.read_csv("../data/us_state_dept_hr_reports.csv")

### Gapminder Names to UN Codes

In [4]:
fp = "../data/GapMinder_Raw_CSVs/UNCTRY_CODES_GapMinderNames.csv"
df_un = pd.read_csv(fp)

In [5]:
df_un.rename(columns={'CTRY':'country', 'UNCTRY':'code'}, inplace=True)

In [6]:
country_un_mapper = {k:v for k,v in zip(df_un.country, df_un.code)}

In [7]:
def apply_mapper(country_name):
    try:
        return country_un_mapper[country_name]
    except:
        return -1

### US State Department Country Names

In [8]:
fp_state = "../data/Custom_State_Dep_Reports/state_dept_country_names.csv"
df_state = pd.read_csv(fp_state)

In [9]:
df_state = df_state[['country', 'simple']].copy()
df_state.reset_index(inplace=True, drop=True)
df_state.rename(columns={'country': 'country_full', 'simple':'country'}, 
                inplace=True)

In [10]:
df_state['code_full'] = df_state.country_full.apply(apply_mapper)

In [11]:
df_state['code_short'] = df_state.country.apply(apply_mapper)

In [12]:
df_state['code'] = df_state[['code_full','code_short']].apply(max, axis=1)

In [13]:
# check that all country names are handled (in some way)
for cu in df.country.unique():
    cu = cu.strip()
    _df_cu = df_state[df_state.country == cu]
    _df_cu_f = df_state[df_state.country_full == cu]
    if (len(_df_cu) == 0) & (len(_df_cu_f) == 0):
        print(cu)

Make a name to code mapper

In [14]:
_df_state = df_state[["country_full","code"]].copy()
_df_state.rename(columns={"country_full":"country"}, inplace=True)

In [15]:
_df_state.append(df_state[["country", "code"]], ignore_index=True)
_df_state = _df_state.drop_duplicates()

In [16]:
state_codes1 = [(k,v) for k, v in zip(df_state.country, _df_state.code)]
state_codes2 = [(k,v) for k, v in zip(df_state.country_full, _df_state.code)]

In [17]:
mapper_state = {k:v for k,v in set(state_codes1 + state_codes2)}

Apply to State Department Sraping

In [18]:
df.country = df.country.str.strip()

In [19]:
df['code'] = df.country.map(mapper_state)

In [20]:
df.head()

Unnamed: 0.1,Unnamed: 0,year,country,url,text,code
0,0,1999,Angola,https://www.state.gov/j/drl/rls/hrrpt/1999/223...,The Republic of Angola's transition from a sin...,24.0
1,1,1999,Benin,https://www.state.gov/j/drl/rls/hrrpt/1999/227...,The Republic of Benin is a constitutional demo...,204.0
2,2,1999,Botswana,https://www.state.gov/j/drl/rls/hrrpt/1999/228...,"Botswana is a longstanding, multiparty democra...",72.0
3,3,1999,Burkina Faso,https://www.state.gov/j/drl/rls/hrrpt/1999/229...,President Blaise Compaore continued to dominat...,854.0
4,4,1999,Burundi,https://www.state.gov/j/drl/rls/hrrpt/1999/230...,Burundi is ruled by an authoritarian military ...,108.0


In [21]:
df.tail()

Unnamed: 0.1,Unnamed: 0,year,country,url,text,code
3869,3869,2018,Saint Vincent and the Grenadines,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF\n\n\n \n\n\nPermalink: http://www.s...,670.0
3870,3870,2018,Suriname,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF\n\n\n \n\n\nPermalink: http://www.s...,
3871,3871,2018,Trinidad and Tobago,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF \nPermalink: http://www.state.gov/j...,780.0
3872,3872,2018,Uruguay,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF \nPermalink: http://www.state.gov/j...,858.0
3873,3873,2018,Venezuela,https://www.state.gov/j/drl/rls/hrrpt/2018/wha...,\n\nPDF \nPermalink: http://www.state.gov/j...,862.0


In [22]:
df.to_csv('../data/us_state_dept_hr_reports_un_coded_1999_2018.csv', 
          index=False)

In [23]:
df.year.min()

1999

##### Missing / Error Codes

Code == 9000  
States Added, but UN / US State Department category does not exist

In [24]:
df_state[df_state.code  == 9000]

Unnamed: 0,country_full,country,code_full,code_short,code
55,Hong Kong,Hong Kong,9000,9000,9000
60,Macau,Macau,9000,9000,9000
76,Taiwan,Taiwan,9000,9000,9000
185,Western Sahara,Western Sahara,-1,9000,9000
217,"Yugoslavia, Federal Republic of",Yugoslavia,-1,9000,9000
226,Western Sahara,Western Sahara,9000,9000,9000
248,Kosovo,Kosovo,9000,9000,9000
252,Kosovo,Kosovo,9000,9000,9000
255,Western Sahara,Morocco,9000,504,9000
266,Cabo Verde,Cabo Verde,9000,9000,9000


Code == -1  
A parsing or labeling error from the scraping steps.  In this case, some yearly editions contained prefaces and introductions.

In [25]:
df_state[df_state.code  == -1]

Unnamed: 0,country_full,country,code_full,code_short,code
240,Preface,Preface,-1,-1,-1
241,Introduction,Introduction,-1,-1,-1
