In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine



from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

NOT_APLIC_STR = "NA_SS"
NOT_APLIC_NUM = -999.0

RANDOM_STATE = 2021
DAYS_PER_YEAR = 365.2425

In [2]:
# IMPORT PROTESTS DATASET
engine = create_engine('sqlite:///../data/processed/protests.db')
with engine.begin() as connection:
    df_protests = pd.read_sql('SELECT * FROM protests', con=connection)
    
df_protests.startdate = pd.to_datetime(df_protests.startdate)
df_protests.enddate = pd.to_datetime(df_protests.enddate)

In [3]:
# IMPORT GOVERNMENTS DATASET
engine = create_engine('sqlite:///../data/processed/governments.db')
with engine.begin() as connection:
    df_govts = pd.read_sql('SELECT * FROM governments', con=connection)

df_govts.index = df_govts.year_scode
df_govts.drop('year_scode', axis=1, inplace=True)

In [4]:
# IMPORT REGIME CHANGE DATASET
engine = create_engine('sqlite:///../data/processed/regime_changes.db')
with engine.begin() as connection:
    df_regimes = pd.read_sql('SELECT * FROM regime_changes', con=connection)
    
# Type conversions
df_regimes.startdate = pd.to_datetime(df_regimes.startdate)
df_regimes.enddate = pd.to_datetime(df_regimes.enddate)
df_regimes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1692 entries, 0 to 1691
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   country       1692 non-null   object        
 1   scode         1692 non-null   object        
 2   startdate     1692 non-null   datetime64[ns]
 3   enddate       1692 non-null   datetime64[ns]
 4   duration_yrs  1692 non-null   float64       
dtypes: datetime64[ns](2), float64(1), object(2)
memory usage: 66.2+ KB


In [5]:
# Join both dataframes
df = df_protests.join(df_govts, how='left', on='year_scode')

# Remove entries that don't have corresponding 'government' data
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15064 entries, 0 to 15207
Data columns (total 85 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   protest_id                          15064 non-null  int64         
 1   country                             15064 non-null  object        
 2   scode                               15064 non-null  object        
 3   region                              15064 non-null  object        
 4   protestnumber                       15064 non-null  int64         
 5   protesterviolence                   15064 non-null  int64         
 6   protesteridentity                   15064 non-null  object        
 7   startdate                           15064 non-null  datetime64[ns]
 8   enddate                             15064 non-null  datetime64[ns]
 9   duration_days                       15064 non-null  int64         
 10  participants          

In [6]:
# BASIC QC THAT NAMES & IDs MATCH
missing_countries = df.loc[(df.country != df.country_govt)][['scode', 'scode_govt', 'country', 'country_govt']]
missing_countries = missing_countries.drop_duplicates()
display(missing_countries.sort_values(by='scode'))

# These countries do not contain government data. Remove. 
scodes_to_remove = missing_countries.scode.unique()
scodes_to_remove_ind = [x in scodes_to_remove for x in df.scode]
df.drop(df.loc[scodes_to_remove_ind].index, axis=0, inplace=True)

Unnamed: 0,scode,scode_govt,country,country_govt


In [7]:
all_countries = df.scode.unique()
regime_countries = df_regimes.scode.unique()

missing = []
for country in all_countries:
    if country not in regime_countries:
        missing.append(country)

print('Countries missing from "Regimes" dataset:', missing)

# Remove these countries from dataset
scodes_to_remove_ind = [x in missing for x in df.scode]
df.drop(df.loc[scodes_to_remove_ind].index, axis=0, inplace=True)

Countries missing from "Regimes" dataset: ['LUX']


## Engineer new target

Game plan:
- Create column for "next regime change date"
- Create column for "days until next regime change"
- Create target column for [above column] < 365 (try for other targets too)

In [8]:
# USE THIS STRUCTURE TO LOOP OVER EACH COUNTRY, SIMILAR TO HOW ENDDATE WAS QC'D
# Loop over all country names
working_df = df[['scode', 'startdate']].copy()
working_df['next_regime_chg_date'] = None
working_df['days_until_next_regime_chg'] = None
working_df['target (<365)'] = None

for protest_index, protest_scode, protest_start in df.reset_index()[['index', 'scode', 'startdate']].values:
    
    # look only at country in question
    regime_country_df = df_regimes.loc[df_regimes.scode==protest_scode]
       
    
    
#     #TESTING ONLY
#     if protest_index == 7289:
#             print('protest start:', protest_start)
    
    # Loop over all regime indices
    for regime_index in regime_country_df.index:
        # isolate startdate and enddate for selected regime
        regime_start = regime_country_df.loc[regime_index, 'startdate']
        regime_end   = regime_country_df.loc[regime_index, 'enddate']
        
        
        # if protest occurs before statehood,set the 'regime end' to the date it became a state 
        # this would correspond to protests about creating a state. Note that this is very rare in this dataset
        if (regime_index == regime_country_df.index[0]) and (protest_start < regime_start):
            working_df.loc[protest_index, 'next_regime_chg_date'] = regime_start
        

        
#         #TESTING ONLY
#         elif protest_index == 7289:
#             print('regime start / end:', regime_start, '|', regime_end)
        
        # test if the protest is within selected regime row
        elif (protest_start >= regime_start) and (protest_start <= regime_end):
            working_df.loc[protest_index, 'next_regime_chg_date'] = regime_end
    if working_df.loc[protest_index, 'next_regime_chg_date'] == None:
        print('broke')
        break

    
# # Convert from 'object' to 'datetime' format
working_df['next_regime_chg_date'] = pd.to_datetime(working_df['next_regime_chg_date'])


# # Incorporate new column for "duration"
working_df['days_until_next_regime_chg'] = (working_df['next_regime_chg_date'] - working_df['startdate']).dt.days

working_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15061 entries, 0 to 15207
Data columns (total 5 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   scode                       15061 non-null  object        
 1   startdate                   15061 non-null  datetime64[ns]
 2   next_regime_chg_date        15061 non-null  datetime64[ns]
 3   days_until_next_regime_chg  15061 non-null  int64         
 4   target (<365)               0 non-null      object        
dtypes: datetime64[ns](2), int64(1), object(2)
memory usage: 1.2+ MB


#### Combine this data with main df

In [9]:
df['days_until_next_regime_chg'] = working_df['days_until_next_regime_chg']

## Basic cleaning

In [10]:
# Relace 'placeholder' NaN values, as defined by data dictionary (see raw data directory)
df.replace(NOT_APLIC_STR, np.nan, inplace=True)
df.replace(NOT_APLIC_NUM, np.nan, inplace=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15061 entries, 0 to 15207
Data columns (total 86 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   protest_id                          15061 non-null  int64         
 1   country                             15061 non-null  object        
 2   scode                               15061 non-null  object        
 3   region                              15061 non-null  object        
 4   protestnumber                       15061 non-null  int64         
 5   protesterviolence                   15061 non-null  int64         
 6   protesteridentity                   15061 non-null  object        
 7   startdate                           15061 non-null  datetime64[ns]
 8   enddate                             15061 non-null  datetime64[ns]
 9   duration_days                       15061 non-null  int64         
 10  participants          

In [11]:
# Set semi-arbitrary threshold for the maximum number of missing values to justify keeping
MAX_MISSING_VALUES = 250

# Copy main df to retain a full dataset
df_cut = df.copy()

# Determine the number of missing values in each column
na_counts_by_col = {}
for col in df_cut.columns:
    na_ct = df_cut[col].isna().sum()
    na_counts_by_col[col] = na_ct # Keep record in dictionary in case you want to investigate
    
    if na_ct > MAX_MISSING_VALUES:
        df_cut.drop(col, axis=1, inplace=True)
        
df_cut.dropna(inplace=True)
df_cut.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14558 entries, 0 to 15207
Data columns (total 46 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   protest_id                          14558 non-null  int64         
 1   country                             14558 non-null  object        
 2   scode                               14558 non-null  object        
 3   region                              14558 non-null  object        
 4   protestnumber                       14558 non-null  int64         
 5   protesterviolence                   14558 non-null  int64         
 6   protesteridentity                   14558 non-null  object        
 7   startdate                           14558 non-null  datetime64[ns]
 8   enddate                             14558 non-null  datetime64[ns]
 9   duration_days                       14558 non-null  int64         
 10  participants          

# Run it through a model!

In [12]:
response_cols = ['response_accomodation', 'response_arrests', 'response_beatings', 
                 'response_crowd-dispersal', 'response_ignore', 'response_killings', 'response_shootings', 'days_until_next_regime_chg', 'scode']


drop_cols = ['protest_id', 'protesteridentity', 'participants_category_original', 'notes', 'year_scode', 'scode_govt', 'country_govt']
model_inputs = df_cut.drop(response_cols+drop_cols, axis=1)
model_inputs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14558 entries, 0 to 15207
Data columns (total 30 columns):
 #   Column                              Non-Null Count  Dtype         
---  ------                              --------------  -----         
 0   country                             14558 non-null  object        
 1   region                              14558 non-null  object        
 2   protestnumber                       14558 non-null  int64         
 3   protesterviolence                   14558 non-null  int64         
 4   startdate                           14558 non-null  datetime64[ns]
 5   enddate                             14558 non-null  datetime64[ns]
 6   duration_days                       14558 non-null  int64         
 7   participants                        14558 non-null  int64         
 8   participants_category_manufactured  14558 non-null  object        
 9   demand_labor-wage-dispute           14558 non-null  int64         
 10  demand_land-farm-issue

In [24]:
df_responses = df_cut[response_cols]

violent_cols = ['response_beatings', 'response_killings', 'response_shootings']
df_violence = df_cut[violent_cols].any(axis=1).astype('int')
df_accommodation = df_cut['response_accomodation']
df_accommodation_only = pd.Series((df_responses.response_accomodation == 1) & 
                                  (df_responses.drop('response_accomodation', axis=1).sum(axis=1)==0)).astype('int')
df_ignore = df_cut['response_ignore']

DAYS_UNTIL_CHG = DAYS_PER_YEAR
change_in_power_soon = df_cut['days_until_next_regime_chg'] < DAYS_UNTIL_CHG


# ***IMPORTANT LINE - CHOOSE WHICH TARGET TO USE***
y = change_in_power_soon



x_traintest, x_holdout, y_traintest, y_holdout = train_test_split(model_inputs, y, random_state=RANDOM_STATE)
x_train, x_test, y_train, y_test = train_test_split(x_traintest, y_traintest, random_state=RANDOM_STATE)

### Define models and parameter grids

In [25]:
# Set parameter grid to search across
param_grid_logreg = {'model__C': np.logspace(-1, 5, 20)}


param_grid_dt = {
    'model__max_depth': [3, 5, 7], 
    'model__criterion': ['gini', 'entropy'],
    'model__min_samples_split': [5, 10],
    'model__min_samples_leaf': [5, 10]} 

param_grid_rf = {
    'model__n_estimators': [25, 75],#, 150],
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [3, 7],# [3, 6, 10],
    'model__min_samples_split': [5, 10],
    'model__min_samples_leaf': [3, 6]}


param_grid_knn = {
    'model__leaf_size': [25, 50, 75],
    'model__n_neighbors': [3, 5, 7, 9],
    'model__weights': ['uniform', 'distance']}#,
    #'model__metric': ['euclindean', 'manhattan', 'minkowski']} 
        
        
param_grid_xgb = {
    'model__learning_rate': [0.1, 0.2],
    'model__max_depth': [3, 7],
    #'model__min_child_weight': [1, 2],
    #'model__subsample': [0.5, 0.7],
    'model__n_estimators': [100, 150],
    'model__tree_method': ['exact', 'approx', 'hist']}


np.random.seed(RANDOM_STATE)
model_logreg = LogisticRegression(max_iter=5000)
model_dt = DecisionTreeClassifier()
model_rf = RandomForestClassifier()
model_knn = KNeighborsClassifier()
model_xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

grids = [param_grid_logreg, param_grid_dt, param_grid_rf, param_grid_knn, param_grid_xgb]
models = [model_logreg, model_dt, model_rf, model_knn, model_xgb]

In [26]:
def create_pipeline_and_run(model, param_grid, metric='accuracy'):
    np.random.seed(RANDOM_STATE)
    ohe = OneHotEncoder(handle_unknown='ignore')
    scaler = StandardScaler()
    smote = SMOTE()

    selector_object = make_column_selector(dtype_include='object')
    selector_numeric = make_column_selector(dtype_include='number')
    transformer = make_column_transformer((ohe, selector_object),
                                         (scaler, selector_numeric))


    pipe = Pipeline([('transformer', transformer),
                     ('smote', smote), 
                     ('model', model)])

    # Instantiate and fit grid search object
    grid = GridSearchCV(pipe, param_grid, return_train_score=True, scoring='f1', cv=3)
    grid.fit(x_train, y_train)
    pred = grid.best_estimator_.predict(x_test)
    print(f'{model}:')
    print(f'   - f1: {f1_score(y_test, pred)}')
    print(f'   - accuracy: {accuracy_score(y_test, pred)}')
    print(f'   - precision: {precision_score(y_test, pred)}')
    print(f'   - recall: {recall_score(y_test, pred)}')
    
    return grid.best_estimator_

### Dummy classifier results

In [27]:
from sklearn.dummy import DummyClassifier

for strategy in ["stratified", "uniform", "most_frequent"]:
    dummy_clf = DummyClassifier(strategy=strategy, random_state=RANDOM_STATE)
    dummy_clf.fit(x_train, y_train)
    
    pred_dummy = dummy_clf.predict(x_test)

    print(f'DUMMY SCORE ({strategy}):')
    print(f'   - f1 score: {f1_score(y_test, pred_dummy)}')
    print(f'   - accuracy: {accuracy_score(y_test, pred_dummy)}')
    print(f'   - precision: {precision_score(y_test, pred_dummy)}')
    print(f'   - recall: {recall_score(y_test, pred_dummy)}')

DUMMY SCORE (stratified):
   - f1 score: 0.11186903137789904
   - accuracy: 0.7615384615384615
   - precision: 0.11452513966480447
   - recall: 0.10933333333333334
DUMMY SCORE (uniform):
   - f1 score: 0.23300970873786409
   - accuracy: 0.508058608058608
   - precision: 0.14825581395348839
   - recall: 0.544
DUMMY SCORE (most_frequent):
   - f1 score: 0.0
   - accuracy: 0.8626373626373627
   - precision: 0.0
   - recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


### Run *only one* model

In [23]:
create_pipeline_and_run(model_logreg, param_grid_logreg);

LogisticRegression(max_iter=5000):
   - f1: 0.11405295315682282
   - accuracy: 0.8406593406593407
   - precision: 0.06292134831460675
   - recall: 0.6086956521739131


### Run *all models* defined above

In [28]:
pipes = []
for grid, model in zip(grids, models):
    pipe = create_pipeline_and_run(model, grid)
    pipes.append(pipe)

LogisticRegression(max_iter=5000):
   - f1: 0.5062034739454093
   - accuracy: 0.7813186813186813
   - precision: 0.3669064748201439
   - recall: 0.816
DecisionTreeClassifier():
   - f1: 0.5205183585313176
   - accuracy: 0.8373626373626374
   - precision: 0.43738656987295826
   - recall: 0.6426666666666667
RandomForestClassifier():
   - f1: 0.5005128205128205
   - accuracy: 0.8216117216117216
   - precision: 0.4066666666666667
   - recall: 0.6506666666666666
KNeighborsClassifier():
   - f1: 0.5785123966942148
   - accuracy: 0.8505494505494505
   - precision: 0.47217537942664417
   - recall: 0.7466666666666667
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              eval_metric='logloss', gamma=None, gpu_id=None,
              importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, mono