In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine


from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


# Show up to 200 rows/columns so wide or long frames are not truncated on display.
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

# Sentinel values that stand in for "not applicable" in the stored data;
# later cells convert them to NaN.
NOT_APLIC_STR = "NA_SS"
NOT_APLIC_NUM = -999.0

# Seed shared by the train/test split, the models and the baselines.
RANDOM_STATE = 2021

In [2]:
# SQLAlchemy URL prefix of the processed-data directory (relative to this notebook).
path = 'sqlite:///../data/processed/'

# Import protest and countries data (already merged) into a single DataFrame.
engine = create_engine(path + 'merged.db')
with engine.begin() as connection:
    df = pd.read_sql(sql='SELECT * FROM merged', con=connection)

In [4]:
# Replace 'placeholder' NaN values, as defined by data dictionary (see raw data directory).
# The result is rebound rather than mutated with inplace=True, so re-running this cell
# is harmless and the replacement reads as a single expression.
df = df.replace(NOT_APLIC_STR, np.nan).replace(NOT_APLIC_NUM, np.nan)

# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() would append a stray "None" line to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15140 entries, 0 to 15139
Data columns (total 82 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   protest_id                          15140 non-null  int64  
 1   country                             15140 non-null  object 
 2   region                              15140 non-null  object 
 3   protestnumber                       15140 non-null  int64  
 4   protesterviolence                   15140 non-null  int64  
 5   protesteridentity                   15140 non-null  object 
 6   startyear                           15140 non-null  int64  
 7   startmonth                          15140 non-null  int64  
 8   startday                            15140 non-null  int64  
 9   duration_days                       15140 non-null  int64  
 10  participants                        15140 non-null  int64  
 11  participants_category_original      15140

In [5]:
# Set semi-arbitrary threshold for the maximum number of missing values to justify keeping
MAX_MISSING_VALUES = 1000

# Number of missing values in each column of the full dataset.
# Kept as a dictionary in case you want to investigate individual columns.
na_counts_by_col = df.isna().sum().to_dict()

# Columns with more missing values than the threshold are dropped in one step
# (instead of mutating the frame column-by-column while looping over its columns).
sparse_cols = [col for col, na_ct in na_counts_by_col.items() if na_ct > MAX_MISSING_VALUES]

# `drop` and `dropna` both return new frames, so the main df is left intact as the
# full dataset. Rows that still contain any missing value are then removed.
df_cut = df.drop(columns=sparse_cols).dropna()
df_cut.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11720 entries, 0 to 15139
Data columns (total 56 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   protest_id                          11720 non-null  int64  
 1   country                             11720 non-null  object 
 2   region                              11720 non-null  object 
 3   protestnumber                       11720 non-null  int64  
 4   protesterviolence                   11720 non-null  int64  
 5   protesteridentity                   11720 non-null  object 
 6   startyear                           11720 non-null  int64  
 7   startmonth                          11720 non-null  int64  
 8   startday                            11720 non-null  int64  
 9   duration_days                       11720 non-null  int64  
 10  participants                        11720 non-null  int64  
 11  participants_category_original      11720

In [18]:
# Print the free-text notes of every protest that was met with accommodation.
# The mask is built from df_cut itself: a mask taken from the unfiltered df has a
# different index and only selects the right rows through implicit index alignment.
for note in df_cut.loc[df_cut['response_accomodation'] == 1, 'notes']:
    print(note, '\n\n')

canada s federal government has agreed to acquire a tract of land that has been the subject of armed dispute between mohawk indians and officials of a montreal suburb, quebec s minister of indian affairs announced today. at a press conference in montreal, the province s indian affairs minister, john ciaccia, said federal officials had approved the purchase or expropriation if necessary of 55 acres of woodland that mohawks of the kahnesatake community claim as ancestral land. the suburban town of oka, 18 miles west of montreal, planned to use it for an addition to a golf course. ciaccia did not say the land would be recognized as mohawk ancestral territory, but it was clear from his description that the federal action would be intended to block the golf course extension that the indians have been fighting. during the last week and a half of the four month golf course dispute, about 200 mohawks, armed with automatic assault rifles and other weapons, have faced off against hundreds of que

# Run it through a model!

In [7]:
# State-response indicator columns; these are candidate targets, so none of them
# may remain among the model inputs.
response_cols = [
    'response_accomodation',
    'response_arrests',
    'response_beatings',
    'response_crowd-dispersal',
    'response_ignore',
    'response_killings',
    'response_shootings',
]

# Other columns excluded from the model inputs.
drop_cols = [
    'protest_id',
    'country',
    'protesteridentity',
    'participants_category_original',
    'notes',
]

model_inputs = df_cut.drop(columns=response_cols + drop_cols)
model_inputs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 11720 entries, 0 to 15139
Data columns (total 44 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   region                              11720 non-null  object 
 1   protestnumber                       11720 non-null  int64  
 2   protesterviolence                   11720 non-null  int64  
 3   startyear                           11720 non-null  int64  
 4   startmonth                          11720 non-null  int64  
 5   startday                            11720 non-null  int64  
 6   duration_days                       11720 non-null  int64  
 7   participants                        11720 non-null  int64  
 8   participants_category_manufactured  11720 non-null  object 
 9   demand_labor-wage-dispute           11720 non-null  int64  
 10  demand_land-farm-issue              11720 non-null  int64  
 11  demand_police-brutality             11720

In [8]:
# Candidate target 1: 1 when any of the violent responses was recorded, else 0.
violent_cols = ['response_beatings', 'response_killings', 'response_shootings']
violent_response = df_cut.loc[:, violent_cols].any(axis=1).astype('int')

# Candidate target 2: the accommodation indicator as stored.
accommodation = df_cut.loc[:, 'response_accomodation']

###### ***IMPORTANT LINE - CHOOSE WHICH TARGET TO USE*** #####
y = accommodation

x_train, x_test, y_train, y_test = train_test_split(
    model_inputs,
    y,
    random_state=RANDOM_STATE,
)

### Define models and parameter grids

In [9]:
# Parameter grids to search across. Every key carries the 'model__' prefix because
# the estimator is the pipeline step named 'model'.
param_grid_logreg = {
    'model__C': np.logspace(-1, 5, 20),
}

param_grid_dt = {
    'model__max_depth': [3, 5, 7],
    'model__criterion': ['gini', 'entropy'],
    'model__min_samples_split': [5, 10],
    'model__min_samples_leaf': [5, 10],
}

# Reduced grid; wider options tried earlier: n_estimators 150, max_depth [3, 6, 10].
param_grid_rf = {
    'model__n_estimators': [25, 75],
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [3, 7],
    'model__min_samples_split': [5, 10],
    'model__min_samples_leaf': [3, 6],
}

# Currently disabled options:
#   'model__weights': ['uniform', 'distance']
#   'model__metric': ['euclidean', 'manhattan', 'minkowski']
param_grid_knn = {
    'model__leaf_size': [25, 50, 75],
    'model__n_neighbors': [3, 5, 7, 9],
}

# Currently disabled options:
#   'model__min_child_weight': [1, 2]
#   'model__subsample': [0.5, 0.7]
#   'model__tree_method': ['exact', 'approx', 'hist']
param_grid_xgb = {
    'model__learning_rate': [0.1, 0.2],
    'model__max_depth': [3, 7],
    'model__n_estimators': [100, 150],
}

# Seed NumPy's global generator before the estimators are created.
np.random.seed(RANDOM_STATE)

model_logreg = LogisticRegression(max_iter=5000)
model_dt = DecisionTreeClassifier()
model_rf = RandomForestClassifier()
model_knn = KNeighborsClassifier()
model_xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

# Parallel lists: grids[i] is the search space for models[i].
grids = [
    param_grid_logreg,
    param_grid_dt,
    param_grid_rf,
    param_grid_knn,
    param_grid_xgb,
]
models = [
    model_logreg,
    model_dt,
    model_rf,
    model_knn,
    model_xgb,
]

In [10]:
def create_pipeline_and_run(model, param_grid, metric='accuracy'):
    """Grid-search `model` inside a preprocessing + SMOTE pipeline and report its test score.

    The pipeline one-hot encodes object columns, standard-scales numeric columns,
    applies SMOTE and then fits `model`. It is tuned with a 3-fold GridSearchCV on
    the notebook-level `x_train` / `y_train` and evaluated on `x_test` / `y_test`.

    Parameters
    ----------
    model : estimator
        Classifier used as the final 'model' step of the pipeline.
    param_grid : dict
        Grid passed to GridSearchCV; keys must address pipeline steps
        (e.g. 'model__max_depth').
    metric : str, default 'accuracy'
        Scoring name used both for model selection and for the reported test score.

    Returns
    -------
    The best pipeline found by the grid search (`grid.best_estimator_`).
    """
    # Re-seed NumPy's global generator on every call.
    np.random.seed(RANDOM_STATE)
    ohe = OneHotEncoder(handle_unknown='error')
    scaler = StandardScaler()
    smote = SMOTE()

    # Route columns to the matching preprocessing step by dtype.
    selector_object = make_column_selector(dtype_include='object')
    selector_numeric = make_column_selector(dtype_include='number')
    transformer = make_column_transformer((ohe, selector_object),
                                          (scaler, selector_numeric))

    pipe = Pipeline([('transformer', transformer),
                     ('smote', smote),
                     ('model', model)])

    # Instantiate and fit grid search object
    grid = GridSearchCV(pipe, param_grid, return_train_score=True, scoring=metric, cv=3)
    grid.fit(x_train, y_train)

    # Score with the search's own scorer so the printed number really is `metric`.
    # `best_estimator_.score` would report plain accuracy whatever `metric` says.
    test_score = grid.score(x_test, y_test)
    print(f'{model} {metric}: {test_score}')
    return grid.best_estimator_

### Pick one model and its parameter grid from the lists above to run a single model

In [11]:
# Tune and score one model; the trailing ';' hides the returned pipeline's repr.
create_pipeline_and_run(model=model_logreg, param_grid=param_grid_logreg);

LogisticRegression(max_iter=5000) accuracy: 0.6819112627986348


### Run the cell below to iterate over *all* of the models defined above

In [12]:
# Tune every (grid, model) pair in turn and keep each best pipeline, in list order.
pipes = [
    create_pipeline_and_run(model, grid)
    for grid, model in zip(grids, models)
]

LogisticRegression(max_iter=5000) accuracy: 0.6819112627986348
DecisionTreeClassifier() accuracy: 0.7911262798634813
RandomForestClassifier() accuracy: 0.8668941979522184
KNeighborsClassifier() accuracy: 0.7648464163822526
XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None,
              eval_metric='logloss', gamma=None, gpu_id=None,
              importance_type='gain', interaction_constraints=None,
              learning_rate=None, max_delta_step=None, max_depth=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              random_state=None, reg_alpha=None, reg_lambda=None,
              scale_pos_weight=None, subsample=None, tree_method=None,
              use_label_encoder=False, validate_parameters=None,
              verbosity=None) accuracy: 0.8986348122866894


### Dummy classifier results

In [13]:
from sklearn.dummy import DummyClassifier

# Baseline scores on the same train/test split, one line per dummy strategy.
for strategy in ("stratified", "most_frequent", "uniform"):
    dummy_clf = DummyClassifier(strategy=strategy, random_state=RANDOM_STATE)
    baseline_score = dummy_clf.fit(x_train, y_train).score(x_test, y_test)
    print(strategy, '-', baseline_score)

stratified - 0.8196037539103233
most_frequent - 0.899895724713243
uniform - 0.49739311783107404
