# Merge CIRI data with scraped State Dep Reports
## Merge on year and country name

In [1]:
import pandas as pd

In [45]:
# CIRI data.
ciri_df = pd.read_csv('../data/CIRI_DATA_2016.csv')

# Scraped State Dept. reports.
state_dep = pd.read_csv('../data/Custom_State_Dep_Reports/us_state_dept_reports_1999_2018.csv')
#state_dep = state_dep.drop(state_dep.columns[0],axis=1)

# Country-name reference table; its first column is discarded right after loading.
ref = pd.read_csv('../data/Custom_State_Dep_Reports/state_dept_country_names.csv')
ref = ref.drop(columns=ref.columns[0])

# UN country codes with GapMinder names (the CTRY / UNCTRY columns are read from it).
codes = pd.read_csv('../data/GapMinder_Raw_CSVs/UNCTRY_CODES_GapMinderNames.csv')

In [25]:
#!!DO NOT RUN AGAIN!!
#Create a dataframe of the unique country names so that it can be cleaned up manually.
#ctry = state_dep.Country.unique()
#ctry = pd.DataFrame({"country":ctry, "simple":ctry})
#ctry.to_csv("../data/Custom_State_Dep_Reports/state_dept_country_names.csv")

In [46]:
# Countries in the GapMinder Data are not exactly the same as in the UN codes.
# This function looks for the most parsimonious name link between ...
# ... the GapMinder and UN data and returns the UN codes.
def compare(x, codes):
    """Find the UN code of the shortest ``codes.CTRY`` name that contains ``x``.

    The match is a case-insensitive substring test. Among all names that
    contain ``x`` the shortest one wins; when several names share that
    length, the one appearing last in ``codes`` wins.

    Parameters
    ----------
    x : str
        Country name to look up. Non-string values (e.g. NaN from a CSV)
        and blank strings never match.
    codes : pandas.DataFrame
        Lookup table with a ``CTRY`` column (names) and a ``UNCTRY`` column
        (codes). Non-string ``CTRY`` entries are skipped.

    Returns
    -------
    dict
        ``{'size': <length of the matched name>, 'un': <its UNCTRY value>}``.
        When nothing matches, the defaults ``{'size': 100, 'un': 0}`` are
        returned, so a code of 0 means "no match".
    """
    # 'size' starts at 100, so names longer than 100 characters are never selected.
    ctry = {'size': 100, 'un': 0}

    # NaN / non-string input has no .lower(), and an empty string is a
    # substring of every name, so neither can produce a meaningful match.
    if not isinstance(x, str) or not x.strip():
        return ctry

    needle = x.lower()
    # Walk both columns once instead of calling .iloc[i] repeatedly per row.
    for name, un_code in zip(codes.CTRY.values, codes.UNCTRY.values):
        if not isinstance(name, str):
            continue  # missing country name in the lookup table
        # '<=' (not '<') keeps the original tie-break: the later row wins.
        if needle in name.lower() and len(name) <= ctry['size']:
            ctry['size'] = len(name)
            ctry['un'] = un_code
    return ctry

In [47]:
# Take the simplified State Department country names...
countries = ref.simple.unique()
# ...and, for each one, look up the UN number of its best-matching name in `codes`.
un = [compare(name, codes)['un'] for name in countries]
# NOTE: `codes` is rebound here from the loaded lookup table to the
# simple-name -> UN-number table, so re-running this cell without reloading
# `codes` first would match against the reduced table.
codes = pd.DataFrame({'CTRY':countries,'UNCTRY':un})

In [48]:
# Attach a UN number to every State Dept. country spelling via its simple name.
# Side effect: all spellings that share a simple name receive the same UN code.
st_dpt_un = (
    ref
    .merge(codes, left_on='simple', right_on='CTRY', how='left')
    .drop(columns='simple')
)

In [49]:
# Inspect the State Dept. columns before the UN codes are merged in.
state_dep.columns

Index(['year', 'country', 'url', 'text', 'code'], dtype='object')

In [50]:
# Join the UN numbers onto the State Dept. rows (which carry the report text),
# matching on the full State Dept. country name.
# NOTE: this rebinds `state_dep`; reload it before re-running this cell,
# otherwise the UNCTRY column is merged in a second time.
state_dep = (
    state_dep
    .merge(st_dpt_un, on='country', how='left')
    .drop(columns=['CTRY'])  # the original also dropped 'country'; it is kept here to check that the merge is working
)

In [51]:
# Inspect the columns after the merge: UNCTRY is now present and CTRY has been dropped.
state_dep.columns

Index(['year', 'country', 'url', 'text', 'code', 'UNCTRY'], dtype='object')

In [63]:
# Combine the State Dept. rows with the CIRI rows on (UN code, year).
# The inner join keeps only country-years present in both frames.
df_text_to_ciri = state_dep.merge(
    ciri_df,
    left_on=['UNCTRY', 'year'],
    right_on=['unctry', 'year'],
    how='inner',
)  #.drop('Country',axis=1)

In [64]:
# Sanity check: show the State Dept. name ('country') next to the CIRI name ('countryname')
# for each merged row so that mismatched joins can be spotted by eye.
df_text_to_ciri[['country','countryname']]

Unnamed: 0,country,countryname
0,Benin,Benin
1,Botswana,Botswana
2,Burkina Faso,Burkina Faso
3,Burundi,Burundi
4,Cameroon,Cameroon
5,Cape Verde,Cape Verde
6,Central African Republic,Central African Republic
7,Chad,Chad
8,Comoros,Comoros
9,Cote D'Ivoire,Cote d'Ivoire


In [60]:
# Write the merged State Dept. text + CIRI frame to disk.
# to_csv is called with its defaults, so the row index is written as an unnamed first column.
# NOTE(review): the output name says 1999_2015 while the State Dept. input file is named
# 1999_2018 -- presumably the inner join limits the years to those covered by CIRI; confirm.
df_text_to_ciri.to_csv('../data/Custom_State_Dep_Reports/CIRI_Text_1999_2015.csv')

In [59]:
# Display the merged frame: State Dept. columns followed by the CIRI columns.
df_text_to_ciri

Unnamed: 0,year,country,url,text,code,UNCTRY,countryname,ciri,cow,polity,...,assn,elecsd,worker,wopol,wecon,injud,physint,new_empinx,Unnamed: 24,Unnamed: 25
0,1999,Benin,https://www.state.gov/j/drl/rls/hrrpt/1999/227...,The Republic of Benin is a constitutional demo...,204.0,204.0,Benin,155,434.0,434.0,...,2.0,2.0,1.0,2.0,1.0,1.0,8.0,11.0,,
1,1999,Botswana,https://www.state.gov/j/drl/rls/hrrpt/1999/228...,"Botswana is a longstanding, multiparty democra...",72.0,72.0,Botswana,167,571.0,571.0,...,2.0,2.0,1.0,2.0,1.0,2.0,7.0,12.0,,
2,1999,Burkina Faso,https://www.state.gov/j/drl/rls/hrrpt/1999/229...,President Blaise Compaore continued to dominat...,854.0,854.0,Burkina Faso,179,439.0,439.0,...,2.0,1.0,2.0,2.0,0.0,0.0,6.0,12.0,,
3,1999,Burundi,https://www.state.gov/j/drl/rls/hrrpt/1999/230...,Burundi is ruled by an authoritarian military ...,108.0,108.0,Burundi,185,516.0,516.0,...,0.0,0.0,1.0,2.0,1.0,0.0,1.0,4.0,,
4,1999,Cameroon,https://www.state.gov/j/drl/rls/hrrpt/1999/231...,Cameroon is a republic dominated by a strong p...,120.0,120.0,Cameroon,191,471.0,471.0,...,1.0,1.0,0.0,2.0,1.0,1.0,2.0,6.0,,
5,1999,Cape Verde,https://www.state.gov/j/drl/rls/hrrpt/1999/232...,Cape Verde is a multiparty parliamentary democ...,132.0,132.0,Cape Verde,197,402.0,402.0,...,,,,,,1.0,,,,
6,1999,Central African Republic,https://www.state.gov/j/drl/rls/hrrpt/1999/233...,The Central African Republic is a constitution...,140.0,140.0,Central African Republic,200,482.0,482.0,...,1.0,1.0,0.0,2.0,1.0,1.0,5.0,7.0,,
7,1999,Chad,https://www.state.gov/j/drl/rls/hrrpt/1999/234...,Chad is a centralized republic dominated by a ...,148.0,148.0,Chad,203,483.0,483.0,...,1.0,1.0,1.0,2.0,1.0,0.0,3.0,5.0,,
8,1999,Comoros,https://www.state.gov/j/drl/rls/hrrpt/1999/235...,The Federal Islamic Republic of the Comoros is...,174.0,174.0,Comoros,215,581.0,581.0,...,,,,,,0.0,,,,
9,1999,Cote D'Ivoire,https://www.state.gov/j/drl/rls/hrrpt/1999/242...,Cote d'Ivoire is in transition following a blo...,384.0,384.0,Cote d'Ivoire,227,437.0,437.0,...,1.0,1.0,0.0,2.0,2.0,1.0,3.0,5.0,,


In [58]:
# Display the State Dept. frame, including the merged-in UNCTRY column.
state_dep

Unnamed: 0,year,country,url,text,code,UNCTRY
0,1999,Angola,https://www.state.gov/j/drl/rls/hrrpt/1999/223...,The Republic of Angola's transition from a sin...,24.0,24.0
1,1999,Benin,https://www.state.gov/j/drl/rls/hrrpt/1999/227...,The Republic of Benin is a constitutional demo...,204.0,204.0
2,1999,Botswana,https://www.state.gov/j/drl/rls/hrrpt/1999/228...,"Botswana is a longstanding, multiparty democra...",72.0,72.0
3,1999,Burkina Faso,https://www.state.gov/j/drl/rls/hrrpt/1999/229...,President Blaise Compaore continued to dominat...,854.0,854.0
4,1999,Burundi,https://www.state.gov/j/drl/rls/hrrpt/1999/230...,Burundi is ruled by an authoritarian military ...,108.0,108.0
5,1999,Cameroon,https://www.state.gov/j/drl/rls/hrrpt/1999/231...,Cameroon is a republic dominated by a strong p...,120.0,120.0
6,1999,Cape Verde,https://www.state.gov/j/drl/rls/hrrpt/1999/232...,Cape Verde is a multiparty parliamentary democ...,132.0,132.0
7,1999,Central African Republic,https://www.state.gov/j/drl/rls/hrrpt/1999/233...,The Central African Republic is a constitution...,140.0,140.0
8,1999,Chad,https://www.state.gov/j/drl/rls/hrrpt/1999/234...,Chad is a centralized republic dominated by a ...,148.0,148.0
9,1999,Comoros,https://www.state.gov/j/drl/rls/hrrpt/1999/235...,The Federal Islamic Republic of the Comoros is...,174.0,174.0
