In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sqlalchemy import create_engine



from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


pd.options.display.max_rows = 200
pd.options.display.max_columns = 200

NOT_APLIC_STR = "NA_SS"
NOT_APLIC_NUM = -999.0

RANDOM_STATE = 2021

In [2]:
path = 'sqlite:///../data/processed/'

# Import protest and countries data (already merged)
engine = create_engine(path+'merged.db')
with engine.begin() as connection:
    df = pd.read_sql('SELECT * FROM merged', con=connection)

In [3]:
# Relace 'placeholder' NaN values, as defined by data dictionary (see raw data directory)
df.replace(NOT_APLIC_STR, np.nan, inplace=True)
df.replace(NOT_APLIC_NUM, np.nan, inplace=True)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15140 entries, 0 to 15139
Data columns (total 82 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   protest_id                          15140 non-null  int64  
 1   country                             15140 non-null  object 
 2   region                              15140 non-null  object 
 3   protestnumber                       15140 non-null  int64  
 4   protesterviolence                   15140 non-null  int64  
 5   protesteridentity                   15140 non-null  object 
 6   startyear                           15140 non-null  int64  
 7   startmonth                          15140 non-null  int64  
 8   startday                            15140 non-null  int64  
 9   duration_days                       15140 non-null  int64  
 10  participants                        15140 non-null  int64  
 11  participants_category_original      15140

In [4]:
# Set semi-arbitrary threshold for the maximum number of missing values to justify keeping
MAX_MISSING_VALUES = 250

# Copy main df to retain a full dataset
df_cut = df.copy()

# Determine the number of missing values in each column
na_counts_by_col = {}
for col in df_cut.columns:
    na_ct = df_cut[col].isna().sum()
    na_counts_by_col[col] = na_ct # Keep record in dictionary in case you want to investigate
    
    if na_ct > MAX_MISSING_VALUES:
        df_cut.drop(col, axis=1, inplace=True)
        
df_cut.dropna(inplace=True)
df_cut.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14670 entries, 0 to 15139
Data columns (total 41 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   protest_id                          14670 non-null  int64  
 1   country                             14670 non-null  object 
 2   region                              14670 non-null  object 
 3   protestnumber                       14670 non-null  int64  
 4   protesterviolence                   14670 non-null  int64  
 5   protesteridentity                   14670 non-null  object 
 6   startyear                           14670 non-null  int64  
 7   startmonth                          14670 non-null  int64  
 8   startday                            14670 non-null  int64  
 9   duration_days                       14670 non-null  int64  
 10  participants                        14670 non-null  int64  
 11  participants_category_original      14670

In [5]:
for note in df_cut.loc[df.response_accomodation==1, 'notes'][:20]:
    print(note, '\n\n')

canada s federal government has agreed to acquire a tract of land that has been the subject of armed dispute between mohawk indians and officials of a montreal suburb, quebec s minister of indian affairs announced today. at a press conference in montreal, the province s indian affairs minister, john ciaccia, said federal officials had approved the purchase or expropriation if necessary of 55 acres of woodland that mohawks of the kahnesatake community claim as ancestral land. the suburban town of oka, 18 miles west of montreal, planned to use it for an addition to a golf course. ciaccia did not say the land would be recognized as mohawk ancestral territory, but it was clear from his description that the federal action would be intended to block the golf course extension that the indians have been fighting. during the last week and a half of the four month golf course dispute, about 200 mohawks, armed with automatic assault rifles and other weapons, have faced off against hundreds of que

# Run it through a model!

In [6]:
response_cols = ['response_accomodation', 'response_arrests', 'response_beatings', 
                 'response_crowd-dispersal', 'response_ignore', 'response_killings', 'response_shootings']


drop_cols = ['protest_id', 'protesteridentity', 'participants_category_original', 'notes']
model_inputs = df_cut.drop(response_cols+drop_cols, axis=1)
model_inputs.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14670 entries, 0 to 15139
Data columns (total 30 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   country                             14670 non-null  object 
 1   region                              14670 non-null  object 
 2   protestnumber                       14670 non-null  int64  
 3   protesterviolence                   14670 non-null  int64  
 4   startyear                           14670 non-null  int64  
 5   startmonth                          14670 non-null  int64  
 6   startday                            14670 non-null  int64  
 7   duration_days                       14670 non-null  int64  
 8   participants                        14670 non-null  int64  
 9   participants_category_manufactured  14670 non-null  object 
 10  demand_labor-wage-dispute           14670 non-null  int64  
 11  demand_land-farm-issue              14670

In [7]:
df_res = df_cut[response_cols]

violent_cols = ['response_beatings', 'response_killings', 'response_shootings']
violent_response = df_cut[violent_cols].any(axis=1).astype('int')
accommodation = df_cut['response_accomodation']
accommodation_only = pd.Series((df_res.response_accomodation == 1) & (df_res.drop('response_accomodation', axis=1).sum(axis=1)==0)).astype('int')
ignore = df_cut['response_ignore']

# ***IMPORTANT LINE - CHOOSE WHICH TARGET TO USE***
y = accommodation

x_traintest, x_holdout, y_traintest, y_holdout = train_test_split(model_inputs, y, random_state=RANDOM_STATE)
x_train, x_test, y_train, y_test = train_test_split(x_traintest, y_traintest, random_state=RANDOM_STATE)

### Define models and parameter grids

In [8]:
# Set parameter grid to search across
param_grid_logreg = {'model__C': np.logspace(-1, 5, 20)}


param_grid_dt = {
    'model__max_depth': [3, 5, 7], 
    'model__criterion': ['gini', 'entropy'],
    'model__min_samples_split': [5, 10],
    'model__min_samples_leaf': [5, 10]} 

param_grid_rf = {
    'model__n_estimators': [25, 75],#, 150],
    'model__criterion': ['gini', 'entropy'],
    'model__max_depth': [3, 7],# [3, 6, 10],
    'model__min_samples_split': [5, 10],
    'model__min_samples_leaf': [3, 6]}


param_grid_knn = {
    'model__leaf_size': [25, 50, 75],
    'model__n_neighbors': [3, 5, 7, 9],
    'model__weights': ['uniform', 'distance']}#,
    #'model__metric': ['euclindean', 'manhattan', 'minkowski']} 
        
        
param_grid_xgb = {
    'model__learning_rate': [0.1, 0.2],
    'model__max_depth': [3, 7],
    #'model__min_child_weight': [1, 2],
    #'model__subsample': [0.5, 0.7],
    'model__n_estimators': [100, 150],
    'model__tree_method': ['exact', 'approx', 'hist']}


np.random.seed(RANDOM_STATE)
model_logreg = LogisticRegression(max_iter=5000)
model_dt = DecisionTreeClassifier()
model_rf = RandomForestClassifier()
model_knn = KNeighborsClassifier()
model_xgb = XGBClassifier(eval_metric='logloss', use_label_encoder=False)

grids = [param_grid_logreg, param_grid_dt, param_grid_rf, param_grid_knn, param_grid_xgb]
models = [model_logreg, model_dt, model_rf, model_knn, model_xgb]

In [9]:
def create_pipeline_and_run(model, param_grid, metric='accuracy'):
    np.random.seed(RANDOM_STATE)
    ohe = OneHotEncoder(handle_unknown='ignore')
    scaler = StandardScaler()
    smote = SMOTE()

    selector_object = make_column_selector(dtype_include='object')
    selector_numeric = make_column_selector(dtype_include='number')
    transformer = make_column_transformer((ohe, selector_object),
                                         (scaler, selector_numeric))


    pipe = Pipeline([('transformer', transformer),
                     ('smote', smote), 
                     ('model', model)])

    # Instantiate and fit grid search object
    grid = GridSearchCV(pipe, param_grid, return_train_score=True, scoring='f1', cv=3)
    grid.fit(x_train, y_train)
    pred = grid.best_estimator_.predict(x_test)
    print(f'{model}:')
    print(f'   - f1: {f1_score(y_test, pred)}')
    print(f'   - accuracy: {accuracy_score(y_test, pred)}')
    print(f'   - precision: {precision_score(y_test, pred)}')
    print(f'   - recall: {recall_score(y_test, pred)}')
    
    return grid.best_estimator_

### Dummy classifier results

In [10]:
from sklearn.dummy import DummyClassifier

for strategy in ["stratified", "uniform", "most_frequent"]:
    dummy_clf = DummyClassifier(strategy=strategy, random_state=RANDOM_STATE)
    dummy_clf.fit(x_train, y_train)
    
    pred_dummy = dummy_clf.predict(x_test)

    print(f'DUMMY SCORE ({strategy}):')
    print(f'   - f1 score: {f1_score(y_test, pred_dummy)}')
    print(f'   - accuracy: {accuracy_score(y_test, pred_dummy)}')
    print(f'   - precision: {precision_score(y_test, pred_dummy)}')
    print(f'   - recall: {recall_score(y_test, pred_dummy)}')

DUMMY SCORE (stratified):
   - f1 score: 0.111731843575419
   - accuracy: 0.826608505997819
   - precision: 0.11627906976744186
   - recall: 0.10752688172043011
DUMMY SCORE (uniform):
   - f1 score: 0.14894894894894894
   - accuracy: 0.48491457651763
   - precision: 0.08946608946608947
   - recall: 0.4444444444444444
DUMMY SCORE (most_frequent):
   - f1 score: 0.0
   - accuracy: 0.8985823336968375
   - precision: 0.0
   - recall: 0.0


  _warn_prf(average, modifier, msg_start, len(result))


### Run *only one* model

In [11]:
#create_pipeline_and_run(model_logreg, param_grid_logreg);

### Run *all models* defined above

In [None]:
pipes = []
for grid, model in zip(grids, models):
    pipe = create_pipeline_and_run(model, grid)
    pipes.append(pipe)