## Imports

In [1]:
import numpy as np
import pandas as pd

from sqlalchemy import create_engine

# Let wide/long frames render with up to 200 rows and 200 columns in the notebook.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

# Sentinel values written into the cleaned data by the cells below.
NOT_APLIC_STR = "NA_SS"  # "not applicable" marker for text/categorical columns
NOT_APLIC_NUM = -999.0   # "not applicable" marker for numeric columns
OTHER = "OTHER"          # label substituted for the 0.0 code of `execrel`

In [2]:
path = '../data/raw/The-Database-of-Political-Institutions-2020-DPI2020/'
countries = pd.read_stata(path + 'DPI2020_stata13.dta')

# Re-fill missing entries with NumPy's NaN so a single missing-value marker is used.
countries.fillna(np.nan, inplace=True)

# `year` is loaded as a datetime, but only the calendar year carries information;
# keeping the datetime leads to formatting inconsistencies downstream.
countries['year'] = countries['year'].dt.year

# Columns at positions 25..75 were judged not valuable after extensive EDA.
# The individual choices are not explained here; refer to the data dictionary
# to check that dropping them is appropriate.
remove_these_features = countries.columns[25:76]
countries.drop(columns=remove_these_features, inplace=True)

## Understanding columns

- ifs - "IFS" country code
- system - Presidential (0), Assembly-elected president (1), parliamentary (2)
- yrsoffc - how many years has chief executive been in office?
- finittrm - is there a finite term? (0, 1)
- yrcurnt - years left in current term
- multpl - if there are formal restraints on an executive’s term (NA if not), can s/he serve additional term(s) following the current one?
- military - is chief executive a military officer?
- defmin - is defense minister a military officer?
- PERCENT1 - president got what % of votes in the 1st/only round?
- PERCENTL - president got what % of votes in the final round? (na if no runoff)
- PRTYIN - party of chief executive has been how long in office
- EXECME - name of party, if any **remove** (too many distinct values)
- EXECRLC - executive is Right (1), Left (3), Center (2), No information (0), No executive (NA)
- EXECNAT - Nationalist (0, 1)
- EXECRURL - "Rural" issues listed as key component of party's platform? **remove** nearly no 1s
- EXECREG - "Regional" issues listed as key component of party's platform? **remove** nearly no 1s
- EXECREL - Executive religion
- EXECAGE - time since party formation (under same name)
- ALLHOUSE - does party of executive control all relevant houses?
- NONCHIEF - party affiliation of the one not called "Chief Executive" (in systems w/both non-ceremonial PM and president) **remove**
- TOTALSEATS - total seats in legislature (includes gov1seat, gov2seat, gov3seat, opp1seat, opp2seat, opp3seat, govothst, oppthst, numul)
- GOV1ME, GOV1SEAT GOV1[ETC] - Descriptors of largest party. Too granular to bother with in this analysis since there are already aggregate features that account for the same data. **remove**
- OPPMAJH - does one opposition party have an absolute majority in House?
- OPPMAJS - does one opposition party have an absolute majority in Senate?
- DATELEG - month when parliamentary elections were held **remove**
- DATEEXEC - month when presidential elections were held **remove**
- LEGELEC - legislative election this year?
- EXELEC - executive election this year?
- LIEC, EIEC - Legislative, Executive Index of Electoral Competitiveness (see data dictionary, valuable metric)
- MDMH, MDMS - Mean District Magnitude House, Senate
- PLURALTY - government is plurality?
- PR - proportional representation?
- HOUSESYS, SENSYS - House, Senate electoral rule
- THRESH - vote threshold for representation *in proportional representation system*
- DHONDT - is the D'Hondt system used? **remove**
- CL - are closed lists used? 
- GQ, GQI - gender quota, whether it was implemented
- SELECT - method for selecting election candidates **remove** (almost no values)

--------

- AUTON - are there autonomous regions? (federalism)
- MUNI - are municipal governments locally elected?
- STATE - are there state/province governments locally elected? (important!)
- AUTHOR - do the states/provinces have authority over taxing, spending, or legislating? (important!)
- STCONST - are the constituencies of the senators the states/provinces? **remove**
- GWNO - no idea, not included in dictionary **remove**
- NUMGOV - total number of seats held by all government parties **remove**
- NUMVOTE - vote share of ruling government party  
- NUMOPP - vote share of opposition government party **remove**
- FRAUD - were vote fraud or candidate intimidation serious enough to affect the outcome of elections
- MAJ - Margin of Majority (important!)
- PARTYAGE - average age of parties
- HERFGOV, HERFOPP - Herfindahl index government (sum of squared seat shares of all parties in the government) [probably a valuable metric]
- HERFTOT - same as above, for all, but very few values. **remove**

--------

- TENLONG - longest tenure of a veto player **remove in favor of strict**
- TENLONG_STRICT - uses TENLONG, restricted to fewer certain leaders 
- TENSHORT - shortest tenure of a veto player **remove in favor of strict**
- TENSHORT_strict - uses TENSHORT, restricted to fewer leaders
- CHECKS - checks and balances
- CHECKS_LAX - **remove**, corresponds to same group as TENLONG (removed), as opposed to CHECKS, which maps to people in TENLONG_STRICT
- STABS, STABS_STRICT - "Stability". use STRICT, as explained in TENLONG.
- STABNS, STABNS_STRICT - similar to STABS, **remove**
- POLARIZ, POLARIZ_STRICT - maximum polarization between the executive party and the four principal parties of the legislature. Use STRICT, similar to above.

In [3]:
# Harmonise the "placeholder" values defined by the data dictionary so that each
# kind of placeholder has ONE representation across the dataset:
#   numeric "not applicable" -> NOT_APLIC_NUM
#   text    "not applicable" -> NOT_APLIC_STR
# (these are sentinels, not np.nan).
#
# Every per-column step assigns its result back with countries[<col>] = ...
# Calling an inplace method on `countries.<col>` is chained assignment: pandas
# warns about it, and under Copy-on-Write it leaves the frame untouched.
# Bracket access also avoids clashes between column names and DataFrame attributes.

countries.replace(-999, NOT_APLIC_NUM, inplace=True)
countries.replace(-999.0, NOT_APLIC_NUM, inplace=True)
countries.replace('-999', NOT_APLIC_NUM, inplace=True)
countries.replace('-999.0', NOT_APLIC_NUM, inplace=True)
countries.replace('NA', NOT_APLIC_STR, inplace=True)


# One-off cases based on close reading of data dictionary.
# The order of operations within each column is kept exactly as originally written.
countries['system'] = countries['system'].replace(-999, NOT_APLIC_STR).astype('object')
countries['execrlc'] = (
    countries['execrlc']
    .replace(0.0, NOT_APLIC_STR)
    .astype('object')
    .replace(-999.0, NOT_APLIC_STR)
)
countries['execrel'] = (
    countries['execrel']
    .replace(0.0, OTHER)
    .replace(-999.0, NOT_APLIC_STR)
    .astype('object')
)

# Column-specific placeholder codes that differ from the global -999
countries['percent1'] = countries['percent1'].replace(999.0, NOT_APLIC_NUM)
countries['percentl'] = countries['percentl'].replace(-99.0, NOT_APLIC_NUM)
countries['oppmajs'] = countries['oppmajs'].replace(999.0, NOT_APLIC_NUM)
countries['legelec'] = countries['legelec'].replace(12.0, NOT_APLIC_NUM)
countries['liec'] = countries['liec'].replace(0.0, NOT_APLIC_NUM)
countries['eiec'] = countries['eiec'].replace(0.0, NOT_APLIC_NUM)

# Electoral-rule columns: spell out 'PR' and fold stray codes into the text sentinel
countries['housesys'] = (
    countries['housesys']
    .replace('PR', 'Proportional')
    .replace(0.5, NOT_APLIC_STR)
    .replace(-999.0, NOT_APLIC_STR)
    .astype('object')
)
countries['sensys'] = (
    countries['sensys']
    .replace(-999.0, NOT_APLIC_STR)
    .replace(-888.0, NOT_APLIC_STR)
    .replace(0.5, NOT_APLIC_STR)
    .replace('PR', 'Proportional')
    .astype('object')
)

countries['thresh'] = countries['thresh'].replace(-9999.0, NOT_APLIC_NUM)
countries['select'] = countries['select'].replace(-999.0, NOT_APLIC_STR)

# NOTE(review): these labels are applied to the 0/1 codes of `fraud` -- confirm
# against the data dictionary that they describe those codes correctly.
countries['fraud'] = (
    countries['fraud']
    .replace(0.0, 'OppositionBanned')
    .replace(1.0, 'OppositionSuppressed')
    .replace(-999.0, NOT_APLIC_STR)
)

# Local-election columns: text labels -> numeric scale (1.0 / 0.5 / 0.0)
for column in ('muni', 'state'):
    countries[column] = (
        countries[column]
        .replace('Legislature and executive locally elected', 1.0)
        .replace('Legislature locally elected', 0.5)
        .replace('No local elections', 0.0)
        .astype('float')
    )

# Numeric columns where both the text sentinel and missing values become NOT_APLIC_NUM
for column in ('maj', 'partyage'):
    countries[column] = (
        countries[column]
        .replace(NOT_APLIC_STR, NOT_APLIC_NUM)
        .fillna(NOT_APLIC_NUM)
        .astype('float')
    )

# Numeric columns where only the text sentinel has to be swapped before the float cast.
# (`tenshort_strict` was originally processed twice; the second pass was a no-op.)
for column in ('herfgov', 'herfopp', 'tensys_strict', 'checks',
               'stabs_strict', 'tenlong_strict', 'tenshort_strict'):
    countries[column] = (
        countries[column]
        .replace(NOT_APLIC_STR, NOT_APLIC_NUM)
        .astype('float')
    )

# Fractionalization columns: cast to object first, then fill / swap sentinels.
# (`frac` was originally processed twice; the second pass was a no-op.)
for column in ('frac', 'oppfrac', 'govfrac'):
    countries[column] = (
        countries[column]
        .astype('object')
        .fillna(NOT_APLIC_NUM)
        .replace(NOT_APLIC_STR, NOT_APLIC_NUM)
    )

# `polariz` is only cleaned here; its cast to float happens in the
# percentage-normalisation cell further down.
countries['polariz'] = countries['polariz'].replace(NOT_APLIC_STR, NOT_APLIC_NUM)



In [5]:
# Show each distinct (countryname, ifs) pair once
countries[['countryname', 'ifs']].drop_duplicates()

Unnamed: 0,countryname,ifs
0,Turk Cyprus,0
46,Afghanistan,AFG
92,Angola,AGO
138,Albania,ALB
184,UAE,ARE
230,Argentina,ARG
276,Armenia,ARM
322,Australia,AUS
368,Austria,AUT
414,Azerbaijan,AZE


In [4]:
# Normalize percentages to [0, 1] instead of [0, 100]
def normalize_to_percent(zero_to_100, sentinel=-999.0):
    """Rescale a Series of percentages from [0, 100] to [0, 1].

    Parameters
    ----------
    zero_to_100 : pandas.Series
        Values castable to float.
    sentinel : float, default -999.0
        Placeholder value that is passed through unchanged instead of being
        divided by 100.

    Returns
    -------
    pandas.Series
        Float Series with the same index. Sentinel entries are untouched,
        every other entry is divided by 100 (NaN stays NaN).
    """
    values = zero_to_100.astype('float')
    # Series.where keeps entries where the condition holds (the sentinel) and
    # takes the replacement (value / 100) everywhere else.
    return values.where(values == sentinel, values / 100)



# Finish the cleaning of `polariz` by casting it to float
countries['polariz'] = countries['polariz'].astype('float')

# Rescale the vote-share columns with normalize_to_percent
for share_column in ('percent1', 'percentl', 'numvote', 'oppvote'):
    countries[share_column] = normalize_to_percent(countries[share_column])

In [5]:
# Columns marked **remove** in the "Understanding columns" notes, chosen after
# manual review of the data dictionary.
remove_these_features = [
    'nonchief', 'dateleg', 'dateexec', 'dhondt', 'select',
    'stconst', 'gwno', 'numgov', 'numopp', 'herftot',
    'tenlong', 'tenshort', 'checks_lax', 'stabns', 'stabns_strict',
    'execme', 'tensys', 'stabs', 'execrurl', 'execreg',
]
countries.drop(columns=remove_these_features, inplace=True)

## Feature Engineering

In [6]:
# Combine PERCENT1 and PERCENTL, since they cover similar content:
# PERCENTL is more meaningful in cases where it exists, but it often doesn't
# exist, so prefer it and fall back to PERCENT1.

def last_else_first(first, last):
    """Return `last` unless it equals the NOT_APLIC_NUM sentinel, else `first`."""
    return last if last != NOT_APLIC_NUM else first


# Treat missing values like the sentinel for the comparison. The filled copies
# are kept in locals: an inplace fillna on `countries['percent1']` is chained
# assignment and does not reach the frame under pandas Copy-on-Write.
first_round = countries['percent1'].fillna(NOT_APLIC_NUM)
final_round = countries['percentl'].fillna(NOT_APLIC_NUM)

countries['percent'] = list(map(last_else_first, first_round, final_round))
# Where neither round supplied a value, the sentinel is turned into NaN.
countries['percent'] = countries['percent'].replace(NOT_APLIC_NUM, np.nan)
countries.drop(['percent1', 'percentl'], axis=1, inplace=True)

# Column-by-column cleaning complete
## Now, create unique identifier for subsequent merge with other datasets

In [7]:
# Merge key of the form "<year> <countryname>", e.g. "1975 Turk Cyprus"
countries['id_idb'] = countries['year'].astype('str') + ' ' + countries['countryname']

In [8]:
# Display the cleaned frame, including the new `percent` and `id_idb` columns
countries

Unnamed: 0,countryname,ifs,year,system,yrsoffc,finittrm,yrcurnt,termlimit,reelect,multpl,military,defmin,prtyin,execrlc,execnat,execrel,execage,allhouse,totalseats,oppmajh,oppmajs,legelec,exelec,liec,eiec,mdmh,mdms,ssh,pluralty,pr,housesys,sensys,thresh,cl,gq,gqi,fraud,auton,muni,state,author,numvote,oppvote,maj,partyage,herfgov,herfopp,frac,oppfrac,govfrac,tensys_strict,checks,stabs_strict,tenlong_strict,tenshort_strict,polariz,percent,id_idb
0,Turk Cyprus,0,1975,NA_SS,1.0,-999.0,-999.0,-999.0,-999.0,-999.0,,,-999.0,NA_SS,-999.0,NA_SS,-999.0,-999.0,0,-999.0,-999.0,0.0,0.0,-999.0,-999.0,,-999.0,-999.00,-999.0,-999.0,NA_SS,NA_SS,2.0,-999.0,0.0,0.0,OppositionBanned,0.0,0.0,0.0,0.0,0.0,0.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.0,-999.0,-999.0,,-999.0,-999.0,-999.0,,1975 Turk Cyprus
1,Turk Cyprus,0,1976,Presidential,1.0,1.0,0.0,1.0,1.0,1.0,,,-999.0,NA_SS,1.0,OTHER,,1.0,0,0.0,-999.0,0.0,1.0,1.0,2.0,,-999.0,-999.00,-999.0,-999.0,NA_SS,NA_SS,2.0,-999.0,0.0,0.0,,,,,0.0,0.0,0.0,-999.000000,-999.0,-999.0,-999.000000,-999.000000,-999.000000,-999.0,1.0,1.0,-999.00,1.0,1.0,0.0,,1976 Turk Cyprus
2,Turk Cyprus,0,1977,Presidential,2.0,1.0,4.0,1.0,1.0,1.0,,,-999.0,Right,1.0,OTHER,,1.0,40,0.0,-999.0,0.0,0.0,7.0,6.5,,,,,,NA_SS,NA_SS,2.0,,0.0,0.0,OppositionBanned,0.0,0.0,0.0,0.0,0.0,0.0,0.725000,-999.0,1.0,0.371901,0.457692,0.690909,0.0,1.0,3.0,0.00,2.0,1.0,0.0,,1977 Turk Cyprus
3,Turk Cyprus,0,1978,Presidential,3.0,1.0,3.0,1.0,1.0,1.0,,,-999.0,Right,1.0,OTHER,,1.0,40,0.0,-999.0,0.0,0.0,7.0,6.5,,,,,,NA_SS,NA_SS,2.0,,0.0,0.0,OppositionBanned,0.0,0.0,0.0,0.0,0.0,0.0,0.725000,-999.0,1.0,0.371901,0.457692,0.690909,0.0,2.0,3.0,0.00,3.0,2.0,0.0,,1978 Turk Cyprus
4,Turk Cyprus,0,1979,Presidential,4.0,1.0,2.0,1.0,1.0,1.0,,,-999.0,Right,1.0,OTHER,,1.0,40,0.0,-999.0,0.0,0.0,7.0,6.5,,,,,,NA_SS,NA_SS,2.0,,0.0,0.0,OppositionBanned,0.0,0.0,0.0,0.0,0.0,0.0,0.725000,-999.0,1.0,0.371901,0.457692,0.690909,0.0,3.0,3.0,0.00,4.0,3.0,0.0,,1979 Turk Cyprus
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8195,Zimbabwe,ZWE,2016,Presidential,29.0,1.0,2.0,3.0,1.0,1.0,0.0,0.0,36.0,NA_SS,1.0,OTHER,53.0,1.0,210,0.0,0.0,0.0,0.0,6.0,7.0,1.0,1.0,0.35,1.0,0.0,Plurality,Plurality,-999.0,-999.0,1.0,0.0,OppositionBanned,0.0,,,,0.0,0.0,0.761905,35.0,1.0,1.000000,0.366781,0.000000,0.0,36.0,4.0,0.00,29.0,3.0,0.0,0.6188,2016 Zimbabwe
8196,Zimbabwe,ZWE,2017,Presidential,30.0,1.0,1.0,3.0,1.0,1.0,0.0,0.0,37.0,NA_SS,1.0,OTHER,54.0,1.0,210,0.0,0.0,0.0,0.0,6.0,7.0,1.0,1.0,0.35,1.0,0.0,Plurality,Plurality,-999.0,-999.0,1.0,0.0,OppositionBanned,0.0,,0.0,,0.0,0.0,0.761905,36.0,1.0,1.000000,0.366781,0.000000,0.0,37.0,4.0,0.00,30.0,4.0,0.0,0.6188,2017 Zimbabwe
8197,Zimbabwe,ZWE,2018,Presidential,1.0,1.0,0.0,3.0,1.0,1.0,0.0,0.0,38.0,NA_SS,1.0,OTHER,55.0,1.0,210,0.0,0.0,1.0,1.0,6.0,7.0,1.0,1.0,0.35,1.0,0.0,Plurality,Plurality,-999.0,-999.0,1.0,0.0,OppositionSuppressed,0.0,,0.0,,0.0,0.0,0.761905,37.0,1.0,1.000000,0.366781,0.000000,0.0,38.0,4.0,0.25,5.0,1.0,0.0,,2018 Zimbabwe
8198,Zimbabwe,ZWE,2019,Presidential,2.0,1.0,4.0,3.0,1.0,1.0,0.0,0.0,39.0,NA_SS,1.0,OTHER,56.0,1.0,210,0.0,0.0,0.0,0.0,7.0,7.0,1.0,1.0,0.35,1.0,0.0,Plurality,Plurality,-999.0,-999.0,1.0,0.0,OppositionSuppressed,0.0,,,,0.0,0.0,0.690476,28.5,1.0,0.969238,0.435270,0.031250,0.0,39.0,4.0,0.00,6.0,2.0,0.0,0.5143,2019 Zimbabwe


## Create identifier to map countries in "Protests" to "Countries"

In [9]:
# Load the "Protests" table from its SQLite database
engine = create_engine('sqlite:///../data/processed/protests.db')
with engine.begin() as connection:
    protests = pd.read_sql(sql='SELECT * FROM protests', con=connection)

In [10]:
# Identify protest countries whose name has no exact match in countries.countryname.
# Each such name is registered with a None placeholder, to be filled in by hand.
# The lookup set is built once, instead of recomputing `.unique()` and scanning
# it for every protest country.
known_country_names = set(countries['countryname'].unique())
no_match = {
    country: None
    for country in protests['country'].unique()
    if country not in known_country_names
}
no_match

{'Dominican Republic': None,
 'United Kingdom': None,
 'Germany': None,
 'Germany West': None,
 'Germany East': None,
 'Czechoslovakia': None,
 'Czech Republic': None,
 'Slovak Republic': None,
 'Kosovo': None,
 'Serbia': None,
 'Bosnia': None,
 'Serbia and Montenegro': None,
 'Montenegro': None,
 'USSR': None,
 'Cape Verde': None,
 'Equatorial Guinea': None,
 'Ivory Coast': None,
 'Central African Republic': None,
 'Congo Brazzaville': None,
 'Congo Kinshasa': None,
 'South Africa': None,
 'Comoros': None,
 'United Arab Emirate': None,
 'China': None,
 'North Korea': None,
 'South Korea': None,
 'Timor Leste': None,
 'Papua New Guinea': None}

##### Given the above list, manually create a dictionary to find the "Countries" country name corresponding to the "Protest" country name, since it is likely a difference in syntax/spelling/etc

In [11]:
# Print every country name used in "Countries" so the spellings can be looked up by hand
print(countries.countryname.unique())

['Turk Cyprus' 'Afghanistan' 'Angola' 'Albania' 'UAE' 'Argentina'
 'Armenia' 'Australia' 'Austria' 'Azerbaijan' 'Burundi' 'Belgium' 'Benin'
 'Burkina Faso' 'Bangladesh' 'Bulgaria' 'Bahrain' 'Bahamas' 'Bosnia-Herz'
 'Belarus' 'Belize' 'Bolivia' 'Brazil' 'Barbados' 'Brunei' 'Bhutan'
 'Botswana' 'Cent. Af. Rep.' 'Canada' 'Switzerland' 'Chile' 'PRC'
 "Cote d'Ivoire" 'Cameroon' 'Congo' 'Colombia' 'Comoro Is.' 'C. Verde Is.'
 'Costa Rica' 'Czech Rep.' 'Cuba' 'Cyprus' 'GDR' 'FRG/Germany' 'Djibouti'
 'Denmark' 'Dom. Rep.' 'Algeria' 'Ecuador' 'Egypt' 'Eritrea' 'Spain'
 'Estonia' 'Ethiopia' 'Finland' 'Fiji' 'France' 'Gabon' 'UK' 'Georgia'
 'Ghana' 'Guinea' 'Gambia' 'Guinea-Bissau' 'Eq. Guinea' 'Greece' 'Grenada'
 'Guatemala' 'Guyana' 'Honduras' 'Croatia' 'Haiti' 'Hungary' 'Indonesia'
 'India' 'Ireland' 'Iran' 'Iraq' 'Iceland' 'Israel' 'Italy' 'Jamaica'
 'Jordan' 'Japan' 'Kazakhstan' 'Kenya' 'Kyrgyzstan' 'Cambodia' 'ROK'
 'Kuwait' 'Laos' 'Lebanon' 'Liberia' 'Libya' 'St. Lucia' 'Sri Lanka'
 'Lesot

In [12]:
# Hand-written translation: "Protests" country name -> "Countries" country name.
# None marks a protest country with no corresponding country in "Countries".
no_match.update({
    'Dominican Republic': 'Dom. Rep.',
    'United Kingdom': 'UK',
    'Germany': 'FRG/Germany',
    'Germany West': 'FRG/Germany',
    'Germany East': 'GDR',
    # NOTE(review): the former federation is mapped onto the Czech entry only -- confirm intended
    'Czechoslovakia': 'Czech Rep.',
    'Czech Republic': 'Czech Rep.',
    'Slovak Republic': 'Slovakia',
    'Kosovo': None,
    'Serbia': None,
    'Bosnia': 'Bosnia-Herz',
    'Serbia and Montenegro': None,
    'Montenegro': None,
    'USSR': 'Soviet Union',
    'Cape Verde': 'C. Verde Is.',
    'Equatorial Guinea': 'Eq. Guinea',
    'Ivory Coast': "Cote d'Ivoire",
    'Central African Republic': 'Cent. Af. Rep.',
    'Congo Brazzaville': 'Congo',
    'Congo Kinshasa': 'Congo (DRC)',
    'South Africa': 'S. Africa',
    'Comoros': 'Comoro Is.',
    'United Arab Emirate': 'UAE',
    'China': 'PRC',
    'North Korea': 'PRK',
    'South Korea': 'ROK',
    'Timor Leste': 'Timor-Leste',
    'Papua New Guinea': 'P. N. Guinea',
})

# Keep only the entries that actually have a counterpart, so the dictionary
# can be used directly as a lookup table.
no_match = {
    protest_name: countries_name
    for protest_name, countries_name in no_match.items()
    if countries_name is not None
}

no_match

{'Dominican Republic': 'Dom. Rep.',
 'United Kingdom': 'UK',
 'Germany': 'FRG/Germany',
 'Germany West': 'FRG/Germany',
 'Germany East': 'GDR',
 'Czechoslovakia': 'Czech Rep.',
 'Czech Republic': 'Czech Rep.',
 'Slovak Republic': 'Slovakia',
 'Bosnia': 'Bosnia-Herz',
 'USSR': 'Soviet Union',
 'Cape Verde': 'C. Verde Is.',
 'Equatorial Guinea': 'Eq. Guinea',
 'Ivory Coast': "Cote d'Ivoire",
 'Central African Republic': 'Cent. Af. Rep.',
 'Congo Brazzaville': 'Congo',
 'Congo Kinshasa': 'Congo (DRC)',
 'South Africa': 'S. Africa',
 'Comoros': 'Comoro Is.',
 'United Arab Emirate': 'UAE',
 'China': 'PRC',
 'North Korea': 'PRK',
 'South Korea': 'ROK',
 'Timor Leste': 'Timor-Leste',
 'Papua New Guinea': 'P. N. Guinea'}

In [13]:
# Translate each protest's country into the name used in "Countries":
#   - names that already match are kept as they are,
#   - names with a manual translation in `no_match` use that translation,
#   - everything else becomes NaN.
# The set of known names is built once rather than recomputing `.unique()`
# for every protest row.
known_country_names = set(countries['countryname'].unique())
country_idb = [
    country if country in known_country_names else no_match.get(country, np.nan)
    for country in protests['country']
]

protests['country_idb'] = country_idb

In [14]:
# Count the protests whose country could not be translated (country_idb is NaN).
# These rows will be dropped since they don't have corresponding data in IDB for countryname
n_unmatched = protests['country_idb'].isna().sum()
print('Rows w/missing data:', n_unmatched)

Rows w/missing data: 68


In [15]:
# Merge key of the form "<startyear> <country_idb>"; it is NaN wherever country_idb is NaN.
protests['id_idb'] = protests['startyear'].astype('str') + ' ' + protests['country_idb']

# Drop only the protests without a matching country. A bare dropna() would also
# discard any row that has a null in some unrelated column.
protests.dropna(subset=['country_idb'], inplace=True)
protests.shape

(15140, 30)

In [16]:
# Helper/key columns that are not needed once the two tables are joined
cols_to_drop = ['id_idb', 'country_idb', 'countryname', 'ifs', 'year']

# Left join: every protest row is kept and gains the columns of the
# "Countries" row that shares its id_idb.
merged = (
    protests
    .merge(countries, how='left', on='id_idb')
    .drop(columns=cols_to_drop)
)

In [17]:
# Display the joined protests + countries frame
merged

Unnamed: 0,protest_id,country,region,protestnumber,protesterviolence,protesteridentity,startyear,startmonth,startday,duration_days,participants,participants_category_original,participants_category_manufactured,notes,demand_labor-wage-dispute,demand_land-farm-issue,demand_police-brutality,demand_political-behavior/process,demand_price-increases/tax-policy,demand_removal-of-politician,demand_social-restrictions,response_accomodation,response_arrests,response_beatings,response_crowd-dispersal,response_ignore,response_killings,response_shootings,system,yrsoffc,finittrm,yrcurnt,termlimit,reelect,multpl,military,defmin,prtyin,execrlc,execnat,execrel,execage,allhouse,totalseats,oppmajh,oppmajs,legelec,exelec,liec,eiec,mdmh,mdms,ssh,pluralty,pr,housesys,sensys,thresh,cl,gq,gqi,fraud,auton,muni,state,author,numvote,oppvote,maj,partyage,herfgov,herfopp,frac,oppfrac,govfrac,tensys_strict,checks,stabs_strict,tenlong_strict,tenshort_strict,polariz,percent
0,201990001,Canada,North America,1,0,unknown,1990,1,15,0,1000,unknown,1000-1999,canada s railway passenger system was finally ...,1,0,0,1,0,0,0,0,0,0,0,1,0,0,Parliamentary,6.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,6.0,Right,0.0,OTHER,136.0,0.0,295.0,0.0,0.0,0.0,0.0,7.0,7.0,1.0,888.0,0.28,1.0,0.0,Plurality,NA_SS,-999.0,-999.0,3.0,0.0,OppositionBanned,0.0,1.0,1.0,1.0,0.4301,0.5246,0.570946,136.00,1.000000,0.541819,0.576225,0.461817,0.000000,60.0,4.0,0.0,6.0,6.0,0.0,
1,201990002,Canada,North America,2,0,unknown,1990,6,25,0,1000,unknown,1000-1999,protestors were only identified as young peopl...,0,0,0,1,0,0,0,0,0,0,0,1,0,0,Parliamentary,6.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,6.0,Right,0.0,OTHER,136.0,0.0,295.0,0.0,0.0,0.0,0.0,7.0,7.0,1.0,888.0,0.28,1.0,0.0,Plurality,NA_SS,-999.0,-999.0,3.0,0.0,OppositionBanned,0.0,1.0,1.0,1.0,0.4301,0.5246,0.570946,136.00,1.000000,0.541819,0.576225,0.461817,0.000000,60.0,4.0,0.0,6.0,6.0,0.0,
2,201990003,Canada,North America,3,0,separatist parti quebecois,1990,7,1,0,500,unknown,100-999,"the queen, after calling on canadians to remai...",0,0,0,1,0,0,0,0,0,0,0,1,0,0,Parliamentary,6.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,6.0,Right,0.0,OTHER,136.0,0.0,295.0,0.0,0.0,0.0,0.0,7.0,7.0,1.0,888.0,0.28,1.0,0.0,Plurality,NA_SS,-999.0,-999.0,3.0,0.0,OppositionBanned,0.0,1.0,1.0,1.0,0.4301,0.5246,0.570946,136.00,1.000000,0.541819,0.576225,0.461817,0.000000,60.0,4.0,0.0,6.0,6.0,0.0,
3,201990004,Canada,North America,4,1,mohawk indians,1990,7,12,56,100,unknown,100-999,canada s federal government has agreed to acqu...,0,1,0,0,0,0,0,1,0,0,0,0,0,0,Parliamentary,6.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,6.0,Right,0.0,OTHER,136.0,0.0,295.0,0.0,0.0,0.0,0.0,7.0,7.0,1.0,888.0,0.28,1.0,0.0,Plurality,NA_SS,-999.0,-999.0,3.0,0.0,OppositionBanned,0.0,1.0,1.0,1.0,0.4301,0.5246,0.570946,136.00,1.000000,0.541819,0.576225,0.461817,0.000000,60.0,4.0,0.0,6.0,6.0,0.0,
4,201990005,Canada,North America,5,1,local residents,1990,8,14,1,950,unknown,100-999,protests were directed against the state due t...,0,0,0,1,0,0,0,1,1,0,1,0,0,0,Parliamentary,6.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,6.0,Right,0.0,OTHER,136.0,0.0,295.0,0.0,0.0,0.0,0.0,7.0,7.0,1.0,888.0,0.28,1.0,0.0,Plurality,NA_SS,-999.0,-999.0,3.0,0.0,OppositionBanned,0.0,1.0,1.0,1.0,0.4301,0.5246,0.570946,136.00,1.000000,0.541819,0.576225,0.461817,0.000000,60.0,4.0,0.0,6.0,6.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15135,9102014001,Papua New Guinea,Oceania,1,1,asylum seekers,2014,2,16,2,100,100-999,100-999,? a government inquiry will be launched as ser...,0,0,0,1,0,0,0,0,0,0,1,0,0,1,Parliamentary,2.0,1.0,3.0,1.0,1.0,1.0,0.0,0.0,2.0,NA_SS,0.0,OTHER,21.0,0.0,108.0,0.0,-999.0,0.0,0.0,7.0,7.0,1.0,-999.0,-999.00,1.0,0.0,Plurality,NA_SS,-999.0,-999.0,0.0,0.0,OppositionBanned,0.0,1.0,1.0,,0.0000,0.0000,0.706422,10.25,0.179722,0.095703,0.915302,0.964583,0.831071,40.0,3.0,0.0,2.0,2.0,,
15136,9102016001,Papua New Guinea,Oceania,1,1,university students,2016,5,15,25,1000,1000-1999,1000-1999,police in papua new guinea fired gunshots wedn...,0,0,0,0,0,1,0,0,0,0,1,0,1,1,Parliamentary,4.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,4.0,NA_SS,0.0,OTHER,23.0,0.0,108.0,0.0,-999.0,0.0,0.0,7.0,7.0,1.0,-999.0,-999.00,1.0,0.0,Plurality,NA_SS,-999.0,-999.0,0.0,0.0,OppositionBanned,0.0,1.0,1.0,,0.0000,0.0000,0.709091,12.25,0.184183,0.095703,0.912338,0.964583,0.826412,42.0,3.0,0.0,4.0,4.0,,
15137,9102017001,Papua New Guinea,Oceania,1,0,protesters opposed to renewing the licence of ...,2017,6,15,0,50,50-99,50-99,the bougainville government has enacted an ind...,0,1,0,1,0,0,0,1,0,0,0,0,0,0,Parliamentary,5.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,5.0,NA_SS,0.0,OTHER,24.0,0.0,108.0,0.0,-999.0,1.0,0.0,7.0,7.0,1.0,-999.0,-999.00,1.0,0.0,Plurality,NA_SS,-999.0,-999.0,0.0,0.0,OppositionBanned,0.0,1.0,1.0,,0.0000,0.0000,0.711712,13.25,0.188683,0.095703,0.909331,0.964583,0.821718,43.0,3.0,0.0,5.0,5.0,,
15138,9102017002,Papua New Guinea,Oceania,2,1,protesters opposed to counting irregularities ...,2017,7,15,0,50,50-99,50-99,peter o neill has been reappointed as prime mi...,0,0,0,1,0,0,0,0,0,0,1,0,0,0,Parliamentary,5.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,5.0,NA_SS,0.0,OTHER,24.0,0.0,108.0,0.0,-999.0,1.0,0.0,7.0,7.0,1.0,-999.0,-999.00,1.0,0.0,Plurality,NA_SS,-999.0,-999.0,0.0,0.0,OppositionBanned,0.0,1.0,1.0,,0.0000,0.0000,0.711712,13.25,0.188683,0.095703,0.909331,0.964583,0.821718,43.0,3.0,0.0,5.0,5.0,,


# Export data to SQL 

In [18]:
# Write the merged frame to its own SQLite database, replacing any existing
# `merged` table and leaving the DataFrame index out.
engine = create_engine('sqlite:///../data/processed/merged.db')

with engine.begin() as connection:
    merged.to_sql('merged', connection, if_exists='replace', index=False)

In [19]:
# Summarise columns, non-null counts and dtypes of the exported frame
merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15140 entries, 0 to 15139
Data columns (total 82 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   protest_id                          15140 non-null  int64  
 1   country                             15140 non-null  object 
 2   region                              15140 non-null  object 
 3   protestnumber                       15140 non-null  int64  
 4   protesterviolence                   15140 non-null  int64  
 5   protesteridentity                   15140 non-null  object 
 6   startyear                           15140 non-null  int64  
 7   startmonth                          15140 non-null  int64  
 8   startday                            15140 non-null  int64  
 9   duration_days                       15140 non-null  int64  
 10  participants                        15140 non-null  int64  
 11  participants_category_original      15140