In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sqlalchemy import create_engine

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE


pd.options.display.max_rows = 200
pd.options.display.max_columns = 200
sns.set_theme(style="white")

UNKNOWN = 'unknown'
RANDOM_STATE = 2021

## Import protest data

In [2]:
engine_protests = create_engine('sqlite:///../data/processed/protests.db')


with engine_protests.begin() as connection:
    protests = pd.read_sql('SELECT * FROM protests', connection)
    
protests['startdate'] = pd.to_datetime(protests.start_datetime)
protests['enddate'] = pd.to_datetime(protests.end_datetime)

AttributeError: 'DataFrame' object has no attribute 'start_datetime'

## Narrow to select features

In [None]:
response_cols = ['response_accomodation', 'response_arrests', 'response_beatings', 
                 'response_crowd-dispersal', 'response_ignore', 'response_killings', 'response_shootings']

In [None]:
model_inputs = protests.drop(response_cols+['id', 'location', 'protesteridentity', 'start_datetime', 'end_datetime', 'sources', 'notes', 'endday', 'endmonth', 'endyear', 'participants_category_original', 'country'], axis=1)
model_inputs.info()

## Correlation matrix

In [None]:
# Compute the correlation matrix
corr = model_inputs.corr()

# Generate a mask for the upper triangle
mask = np.triu(np.ones_like(corr, dtype=bool))
plt.figure(figsize=(15, 10))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5});

In [None]:
violent_cols = ['response_beatings', 'response_killings', 'response_shootings']
violent_response = protests[violent_cols].any(axis=1).astype('int')
accommodation = protests['response_accomodation']

y = violent_response #IMPORTANT LINE - CHOOSE WHICH TARGET TO USE
x_train, x_test, y_train, y_test = train_test_split(model_inputs, y, random_state=RANDOM_STATE)

In [None]:
model_inputs.protestnumber.hist(bins=20)

In [None]:
model_inputs.startyear.hist(bins=30)

In [None]:
model_inputs.startmonth.hist(bins=12)

In [None]:
model_inputs.startday.hist(bins=31)

In [None]:
model_inputs.duration_days.hist()

In [None]:
model_inputs.duration_days.sort_values()[:14000].hist(bins=20)

In [None]:
model_inputs.participants.hist(bins=25)

In [None]:
model_inputs.participants.sort_values()[:13000].hist(bins=25)

In [None]:
model_inputs.participants_category_manufactured.value_counts()

In [None]:
demands = ['demand_labor-wage-dispute', 'demand_land-farm-issue', 'demand_police-brutality', 'demand_political-behavior/process', 'demand_price-increases/tax-policy', 'demand_removal-of-politician', 'demand_social-restrictions']

model_inputs[demands].sum()

In [None]:
protests[response_cols].sum()