In [1]:
import numpy as np
import pandas as pd

from sqlalchemy import create_engine

# Let pandas render up to 200 rows before it truncates a displayed frame.
pd.set_option('display.max_rows', 200)

In [2]:
# Folder of the raw DPI 2020 release; its Stata 13 export is read into `countries`.
path = '../data/raw/The-Database-of-Political-Institutions-2020-DPI2020/'
dpi_file = path + 'DPI2020_stata13.dta'
countries = pd.read_stata(dpi_file)

### Type correction: year (converted to text to build the `id_idb` join key)

In [3]:
# Build the "<year> <countryname>" text key (e.g. "1975 Turk Cyprus") in id_idb.
# The calendar year is pulled out of the datetime `year` column and stored as a string
# (original note: workaround to avoid pandas setting a time index of "year").
year_text = countries['year'].dt.year.astype('str')
countries['id_idb'] = year_text + ' ' + countries['countryname']

In [4]:
# Preview the DPI frame; the last column shown is the id_idb key built in the previous cell.
countries

Unnamed: 0,countryname,ifs,year,system,yrsoffc,finittrm,yrcurnt,termlimit,reelect,multpl,...,stabs_strict,stabs,stabns_strict,stabns,tenlong_strict,tenlong,tenshort_strict,tenshort,polariz,id_idb
0,Turk Cyprus,0,1975-01-01,-999.0,1.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,,,,,,,,,,1975 Turk Cyprus
1,Turk Cyprus,0,1976-01-01,Presidential,1.0,1.0,0.0,1.0,1.0,1.0,...,,,,,1.0,1.0,1.0,1.0,0.0,1976 Turk Cyprus
2,Turk Cyprus,0,1977-01-01,Presidential,2.0,1.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,1977 Turk Cyprus
3,Turk Cyprus,0,1978-01-01,Presidential,3.0,1.0,3.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,3.0,3.0,2.0,2.0,0.0,1978 Turk Cyprus
4,Turk Cyprus,0,1979-01-01,Presidential,4.0,1.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,4.0,4.0,3.0,3.0,0.0,1979 Turk Cyprus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8195,Zimbabwe,ZWE,2016-01-01,Presidential,29.0,1.0,2.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,29.0,29.0,3.0,3.0,0.0,2016 Zimbabwe
8196,Zimbabwe,ZWE,2017-01-01,Presidential,30.0,1.0,1.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,30.0,30.0,4.0,4.0,0.0,2017 Zimbabwe
8197,Zimbabwe,ZWE,2018-01-01,Presidential,1.0,1.0,0.0,3.0,1.0,1.0,...,0.25,0.25,0.333333,0.333333,5.0,5.0,1.0,1.0,0.0,2018 Zimbabwe
8198,Zimbabwe,ZWE,2019-01-01,Presidential,2.0,1.0,4.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,6.0,6.0,2.0,2.0,0.0,2019 Zimbabwe


## Create identifier to map countries in "Protests" to "Countries"

In [5]:
# Read the whole "protests" table from the processed SQLite database into a DataFrame.
engine = create_engine('sqlite:///../data/processed/protests.db')
protests_query = 'SELECT * FROM protests'
with engine.begin() as connection:
    protests = pd.read_sql(protests_query, connection)

In [6]:
# NOTE(review): `countries` has not been modified since it was last displayed, so this
# cell repeats the earlier preview; candidate for removal.
countries

Unnamed: 0,countryname,ifs,year,system,yrsoffc,finittrm,yrcurnt,termlimit,reelect,multpl,...,stabs_strict,stabs,stabns_strict,stabns,tenlong_strict,tenlong,tenshort_strict,tenshort,polariz,id_idb
0,Turk Cyprus,0,1975-01-01,-999.0,1.0,-999.0,-999.0,-999.0,-999.0,-999.0,...,,,,,,,,,,1975 Turk Cyprus
1,Turk Cyprus,0,1976-01-01,Presidential,1.0,1.0,0.0,1.0,1.0,1.0,...,,,,,1.0,1.0,1.0,1.0,0.0,1976 Turk Cyprus
2,Turk Cyprus,0,1977-01-01,Presidential,2.0,1.0,4.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,2.0,2.0,1.0,1.0,0.0,1977 Turk Cyprus
3,Turk Cyprus,0,1978-01-01,Presidential,3.0,1.0,3.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,3.0,3.0,2.0,2.0,0.0,1978 Turk Cyprus
4,Turk Cyprus,0,1979-01-01,Presidential,4.0,1.0,2.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,4.0,4.0,3.0,3.0,0.0,1979 Turk Cyprus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8195,Zimbabwe,ZWE,2016-01-01,Presidential,29.0,1.0,2.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,29.0,29.0,3.0,3.0,0.0,2016 Zimbabwe
8196,Zimbabwe,ZWE,2017-01-01,Presidential,30.0,1.0,1.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,30.0,30.0,4.0,4.0,0.0,2017 Zimbabwe
8197,Zimbabwe,ZWE,2018-01-01,Presidential,1.0,1.0,0.0,3.0,1.0,1.0,...,0.25,0.25,0.333333,0.333333,5.0,5.0,1.0,1.0,0.0,2018 Zimbabwe
8198,Zimbabwe,ZWE,2019-01-01,Presidential,2.0,1.0,4.0,3.0,1.0,1.0,...,0.0,0.0,0.0,0.0,6.0,6.0,2.0,2.0,0.0,2019 Zimbabwe


In [7]:
# Spot check: index labels of the Canada rows at positions 15-19 within the Canada subset.
# NOTE(review): the result is not assigned or used by the other cells shown here.
countries.loc[countries.countryname=='Canada'].iloc[15:20].index

Int64Index([1303, 1304, 1305, 1306, 1307], dtype='int64')

In [8]:
# Collect the protest country names that have no exact counterpart in the DPI
# `countryname` column. Each starts out mapped to None, to be resolved by hand afterwards.
# The DPI names are gathered into a set once so every membership test is O(1)
# instead of recomputing `unique()` for each protest country.
dpi_names = set(countries.countryname.unique())
no_match = {
    country: None
    for country in protests.country.unique()
    if country not in dpi_names
}
no_match

{'Dominican Republic': None,
 'United Kingdom': None,
 'Germany': None,
 'Germany West': None,
 'Germany East': None,
 'Czechoslovakia': None,
 'Czech Republic': None,
 'Slovak Republic': None,
 'Kosovo': None,
 'Serbia': None,
 'Bosnia': None,
 'Serbia and Montenegro': None,
 'Montenegro': None,
 'USSR': None,
 'Cape Verde': None,
 'Equatorial Guinea': None,
 'Ivory Coast': None,
 'Central African Republic': None,
 'Congo Brazzaville': None,
 'Congo Kinshasa': None,
 'South Africa': None,
 'Comoros': None,
 'United Arab Emirate': None,
 'China': None,
 'North Korea': None,
 'South Korea': None,
 'Timor Leste': None,
 'Papua New Guinea': None}

##### Given the above list, manually create a dictionary that maps each "Protests" country name to the corresponding "Countries" country name, since the mismatches are most likely differences in syntax/spelling/etc.

In [9]:
# List every distinct DPI country label so the manual mapping can use the exact spellings.
print(countries.countryname.unique())

['Turk Cyprus' 'Afghanistan' 'Angola' 'Albania' 'UAE' 'Argentina'
 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Burundi' 'Belgium' 'Benin'
 'Burkina Faso' 'Bangladesh' 'Bulgaria' 'Bahrain' 'Bahamas' 'Bosnia-Herz'
 'Belarus' 'Belize' 'Bolivia' 'Brazil' 'Barbados' 'Brunei' 'Bhutan'
 'Botswana' 'Cent. Af. Rep.' 'Canada' 'Switzerland' 'Chile' 'PRC'
 "Cote d'Ivoire" 'Cameroon' 'Congo' 'Colombia' 'Comoro Is.' 'C. Verde Is.'
 'Costa Rica' 'Czech Rep.' 'Cuba' 'Cyprus' 'GDR' 'FRG/Germany' 'Djibouti'
 'Denmark' 'Dom. Rep.' 'Algeria' 'Ecuador' 'Egypt' 'Eritrea' 'Spain'
 'Estonia' 'Ethiopia' 'Finland' 'Fiji' 'France' 'Gabon' 'UK' 'Georgia'
 'Ghana' 'Guinea' 'Gambia' 'Guinea-Bissau' 'Eq. Guinea' 'Greece' 'Grenada'
 'Guatemala' 'Guyana' 'Honduras' 'Croatia' 'Haiti' 'Hungary' 'Indonesia'
 'India' 'Ireland' 'Iran' 'Iraq' 'Iceland' 'Israel' 'Italy' 'Jamaica'
 'Jordan' 'Japan' 'Kazakhstan' 'Kenya' 'Kyrgyzstan' 'Cambodia' 'ROK'
 'Kuwait' 'Laos' 'Lebanon' 'Liberia' 'Libya' 'St. Lucia' 'Sri Lanka'
 'Lesot

In [10]:
# Hand-written mapping from "Protests" country names to the DPI `countryname` spelling.
# A value of None marks a protest country for which no DPI counterpart was identified;
# those entries are removed at the end of this cell.
# NOTE(review): several historical names are folded into one DPI label
# (e.g. both 'Czechoslovakia' and 'Czech Republic' -> 'Czech Rep.') — confirm this is intended.
no_match.update({
    'Dominican Republic': 'Dom. Rep.',
    'United Kingdom': 'UK',
    'Germany': 'FRG/Germany',
    'Germany West': 'FRG/Germany',
    'Germany East': 'GDR',
    'Czechoslovakia': 'Czech Rep.',
    'Czech Republic': 'Czech Rep.',
    'Slovak Republic': 'Slovakia',
    'Kosovo': None,                 # No corresponding country in "Countries" dataset
    'Serbia': None,                 # No corresponding country in "Countries" dataset
    'Bosnia': 'Bosnia-Herz',
    'Serbia and Montenegro': None,  # No corresponding country in "Countries" dataset
    'Montenegro': None,             # No corresponding country in "Countries" dataset
    'USSR': 'Soviet Union',
    'Cape Verde': 'C. Verde Is.',
    'Equatorial Guinea': 'Eq. Guinea',
    'Ivory Coast': "Cote d'Ivoire",
    'Central African Republic': 'Cent. Af. Rep.',
    'Congo Brazzaville': 'Congo',
    'Congo Kinshasa': 'Congo (DRC)',
    'South Africa': 'S. Africa',
    'Comoros': 'Comoro Is.',
    'United Arab Emirate': 'UAE',
    'China': 'PRC',
    'North Korea': 'PRK',
    'South Korea': 'ROK',
    'Timor Leste': 'Timor-Leste',
    'Papua New Guinea': 'P. N. Guinea',
})

# Remove every name that is still unmapped. Iterate over a snapshot of the keys
# because the dict is modified inside the loop.
for country in list(no_match):
    if no_match[country] is None:
        del no_match[country]

no_match

{'Dominican Republic': 'Dom. Rep.',
 'United Kingdom': 'UK',
 'Germany': 'FRG/Germany',
 'Germany West': 'FRG/Germany',
 'Germany East': 'GDR',
 'Czechoslovakia': 'Czech Rep.',
 'Czech Republic': 'Czech Rep.',
 'Slovak Republic': 'Slovakia',
 'Bosnia': 'Bosnia-Herz',
 'USSR': 'Soviet Union',
 'Cape Verde': 'C. Verde Is.',
 'Equatorial Guinea': 'Eq. Guinea',
 'Ivory Coast': "Cote d'Ivoire",
 'Central African Republic': 'Cent. Af. Rep.',
 'Congo Brazzaville': 'Congo',
 'Congo Kinshasa': 'Congo (DRC)',
 'South Africa': 'S. Africa',
 'Comoros': 'Comoro Is.',
 'United Arab Emirate': 'UAE',
 'China': 'PRC',
 'North Korea': 'PRK',
 'South Korea': 'ROK',
 'Timor Leste': 'Timor-Leste',
 'Papua New Guinea': 'P. N. Guinea'}

In [11]:
# Translate each protest's country into the DPI spelling:
#   * names that already exist in DPI are kept unchanged,
#   * names found in the manual `no_match` mapping are translated,
#   * everything else becomes NaN (Series.map yields NaN for keys missing from the dict).
# Vectorised so the DPI name list is computed once instead of once per protest row.
in_dpi = protests.country.isin(countries.countryname.unique())
protests['country_idb'] = protests.country.where(in_dpi, protests.country.map(no_match))

In [12]:
# How many protests could not be matched to a DPI country name?
n_unmatched = protests['country_idb'].isna().sum()
print('Rows w/missing data:', n_unmatched)

# Display the affected rows; they are dropped later because DPI has no countryname for them.
unmatched_rows = protests['country'].isna() | protests['country_idb'].isna()
protests.loc[unmatched_rows]

Rows w/missing data: 68


Unnamed: 0,id,country,location,region,protestnumber,protesterviolence,protesteridentity,startyear,startmonth,startday,...,demand_removal-of-politician,demand_social-restrictions,response_accomodation,response_arrests,response_beatings,response_crowd-dispersal,response_ignore,response_killings,response_shootings,country_idb
5360,3412008001,Kosovo,Pristina,Europe,1,0,protesters,2008,12,2,...,0,0,0,0,0,0,1,0,0,
5361,3412009001,Kosovo,Mitrovica,Europe,1,1,serbs,2009,8,25,...,0,0,0,0,0,1,0,0,0,
5362,3412010001,Kosovo,Mitrovica,Europe,1,1,albanians,2010,3,30,...,0,0,0,0,0,1,0,0,0,
5363,3412010002,Kosovo,Pristina,Europe,2,0,protesters,2010,6,18,...,0,0,0,0,0,0,1,0,0,
5364,3412011001,Kosovo,Pristina,Europe,1,0,ethnic albanians,2011,6,22,...,0,0,1,0,0,0,0,0,0,
5365,3412011002,Kosovo,Kosovo Serbia border,Europe,2,1,ethnic serbs,2011,7,25,...,0,0,0,0,0,1,0,0,0,
5366,3412011003,Kosovo,Serbia Kosovo border,Europe,3,0,serbs,2011,9,16,...,0,0,0,0,0,1,0,0,0,
5367,3412013001,Kosovo,National,Europe,1,0,serbians,2013,4,21,...,0,0,0,0,0,0,1,0,0,
5368,3412015001,Kosovo,Pristina,Europe,1,1,protesters,2015,1,27,...,1,0,0,0,0,1,0,0,0,
5369,3412015002,Kosovo,Pristina,Europe,2,1,protesters,2015,10,13,...,0,0,0,0,0,1,0,0,0,


In [13]:
# Build the "<startyear> <DPI country name>" key used to join against countries.id_idb.
protests['id_idb'] = protests.startyear.astype('str')+' '+protests.country_idb
# Drop only the protests that have no DPI match. A bare dropna() would also discard
# any row that happens to have a missing value in an unrelated column.
protests = protests.dropna(subset=['country_idb', 'id_idb'])
protests.shape

(15140, 37)

In [14]:
# Frequency of each value in the `execrlc` column.
# NOTE(review): the output lists 0.0 and -999.0 next to Left/Right/Center — presumably
# not-applicable / missing codes; confirm against the DPI codebook.
countries['execrlc'].value_counts()

0.0       3081
Left      2308
Right     1602
-999.0     649
Center     540
Name: execrlc, dtype: int64

In [15]:
# Dtypes and non-null counts, limited to the first 100 columns of the DPI frame.
countries.iloc[:, :100].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8200 entries, 0 to 8199
Data columns (total 100 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   countryname  8200 non-null   object        
 1   ifs          8200 non-null   object        
 2   year         8200 non-null   datetime64[ns]
 3   system       8192 non-null   category      
 4   yrsoffc      8190 non-null   float64       
 5   finittrm     8177 non-null   float64       
 6   yrcurnt      8145 non-null   float64       
 7   termlimit    8070 non-null   float64       
 8   reelect      8137 non-null   float64       
 9   multpl       8118 non-null   float64       
 10  military     8177 non-null   float64       
 11  defmin       8092 non-null   float64       
 12  percent1     7987 non-null   float32       
 13  percentl     8023 non-null   float32       
 14  prtyin       8183 non-null   float64       
 15  execme       8200 non-null   object        
 16  execr

In [16]:
# Turn the literal string 'NA' into a real missing value across the whole DPI frame.
# NOTE(review): the -999 codes visible in the earlier previews are not touched here — confirm that is intended.
countries = countries.replace('NA', np.nan)

In [17]:
# Left join on the shared id_idb key: every protest row is kept and receives the DPI columns.
merged = protests.merge(countries, how='left', on='id_idb')

# Remove the join helpers and the DPI-side identifier columns from the merged table.
cols_to_drop = ['id_idb', 'country_idb', 'countryname', 'ifs', 'year']
merged = merged.drop(columns=cols_to_drop)

In [18]:
# Preview the merged frame: protest columns first, followed by the joined DPI columns.
merged

Unnamed: 0,id,country,location,region,protestnumber,protesterviolence,protesteridentity,startyear,startmonth,startday,...,checks,stabs_strict,stabs,stabns_strict,stabns,tenlong_strict,tenlong,tenshort_strict,tenshort,polariz
0,201990001,Canada,national,North America,1,0,unknown,1990,1,15,...,4.0,0.0,0.0,0.0,0.0,6.0,6.0,6.0,6.0,0.0
1,201990002,Canada,"Montreal, Quebec",North America,2,0,unknown,1990,6,25,...,4.0,0.0,0.0,0.0,0.0,6.0,6.0,6.0,6.0,0.0
2,201990003,Canada,"Montreal, Quebec",North America,3,0,separatist parti quebecois,1990,7,1,...,4.0,0.0,0.0,0.0,0.0,6.0,6.0,6.0,6.0,0.0
3,201990004,Canada,"Montreal, Quebec",North America,4,1,mohawk indians,1990,7,12,...,4.0,0.0,0.0,0.0,0.0,6.0,6.0,6.0,6.0,0.0
4,201990005,Canada,"Montreal, Quebec",North America,5,1,local residents,1990,8,14,...,4.0,0.0,0.0,0.0,0.0,6.0,6.0,6.0,6.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15135,9102014001,Papua New Guinea,Manus Island,Oceania,1,1,asylum seekers,2014,2,16,...,3.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0,
15136,9102016001,Papua New Guinea,Port Moresby,Oceania,1,1,university students,2016,5,15,...,3.0,0.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,
15137,9102017001,Papua New Guinea,Bougainville,Oceania,1,0,protesters opposed to renewing the licence of ...,2017,6,15,...,3.0,0.0,0.0,0.0,0.0,5.0,5.0,5.0,5.0,
15138,9102017002,Papua New Guinea,Mount Hagen,Oceania,2,1,protesters opposed to counting irregularities ...,2017,7,15,...,3.0,0.0,0.0,0.0,0.0,5.0,5.0,5.0,5.0,


# Export data to SQL 

In [19]:
# Write the merged frame to its own SQLite database, replacing any existing `merged` table.
engine = create_engine('sqlite:///../data/processed/merged.db')
export_options = dict(name='merged', if_exists='replace', index=False)

with engine.begin() as connection:
    merged.to_sql(con=connection, **export_options)