In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import sys

# Custom functions

sys.path.insert(1, "../src")
from custom_plots import *

import shap

# Options
# Raise the pandas display limits to 200 rows / 200 columns so wide frames
# are shown without truncation.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
sns.set_theme(style="white")

# Convenience for working with external src code files
# (autoreload mode 2 picks up edits to imported modules without a kernel restart).
%load_ext autoreload
%autoreload 2

# Exploratory Data Analysis

## Import protest data

In [None]:
# Load the table produced by the Modeling notebook from the processed SQLite file.
engine = create_engine("sqlite:///../data/processed/all_data.db")
with engine.begin() as connection:
    df = pd.read_sql(sql="SELECT * FROM all_modeled_data", con=connection)

# String version of the target (1 -> "Revolution", anything else -> "No Revolution"),
# convenient as a hue/label in the plots.
df["target_categorical"] = np.where(
    df["target"] == 1, "Revolution", "No Revolution"
)
df.info()

#### Categorize all features for analysis

As a QC check, print any features that are not assigned to one of the bins below.

In [None]:
# Protester-demand indicator columns; they are also included in `binary` below.
demands = [
    "demand_labor-wage-dispute",
    "demand_land-farm-issue",
    "demand_police-brutality",
    "demand_political-behavior/process",
    "demand_price-increases/tax-policy",
    "demand_removal-of-politician",
    "demand_social-restrictions",
]

# Binary bucket: every demand indicator plus the other columns treated as binary.
binary = [
    *demands,
    "protesterviolence",
    "military",
    "legelec",
    "exelec",
    "defmin",
    "execnat",
    "oppmajh",
    "gq",
    "gqi",
    "auton",
]

# Columns analysed as continuous features.
continuous = [
    "protestnumber_log",
    "participants_log",
    "yrsoffc",
    "totalseats",
    "numvote",
    "duration_days_log",
    "termlimit",
    "maj",
    "herfgov",
    "checks",
    "stabs_strict",
    "xconst",
    "tensys_strict",
]

# Columns analysed as categorical features.
categorical = ["region", "system", "country", "execrel"]

# Numeric target and its string-labelled counterpart.
targets = ["target", "target_categorical"]

# QC: any column left after dropping every bucketed name has not been categorized.
# Note that drop() raises a KeyError if a bucketed name is absent from df.
bucketed = [*continuous, *categorical, *binary, *targets]
remainders = df.drop(columns=bucketed).columns

if not remainders.empty:
    print("Remaining columns to be assigned to various bucket:")
    for col in remainders:
        # Show the value distribution to help decide which bucket the column belongs in.
        print(col, "\n", df[col].value_counts(), "\n")
else:
    print("All features successfully bucketed")

## Correlation matrix

In [None]:
# Correlation matrix over the full frame, drawn by a project helper
# (presumably defined in ../src/custom_plots, which is star-imported above).
# NOTE(review): max_corr=0.5 looks like a cap/threshold on the displayed
# correlations — confirm against the helper's definition.
custom_plot_matrix(df, max_corr=0.5)

### Protestnumber

In [None]:
# Distribution of `protestnumber_log`. displot returns a grid object, so the
# underlying matplotlib figure is resized through its `.fig` attribute.
fig = sns.displot(df["protestnumber_log"])
fig.fig.set_size_inches(15, 6)  # width, height in inches
plt.title("Protestnumber");

### Protester Demands

In [None]:
# Horizontal bar chart of the column sums of the demand indicators.
ax = plt.figure(figsize=(12, 4)).gca()
bar = df[demands].sum()
# Strip the leading "demand_" prefix and show hyphens as spaces for readable labels.
cols = [name[len("demand_"):].replace("-", " ") for name in bar.index]
ax.barh(cols, bar.values)
ax.set_title("Protest demands")
ax.figure.savefig("../images/protest_demands.png");

**Target:** The correlation matrix below shows that the protester demand for "removal of politician" is the one most strongly correlated with an impending revolution.

In [None]:
# Correlation matrix restricted to the demand indicators and the target columns.
custom_plot_matrix(df[[*demands, *targets]], max_corr=0.2)

In [None]:
# Pairwise relationships between the continuous features; corner=True draws
# only the lower triangle of the grid.
sns.pairplot(data=df[continuous], corner=True);

In [None]:
# Violin plot of `military` against `protesterviolence`, split by the
# string-labelled target.
sns.catplot(
    data=df,
    x="protesterviolence",
    y="military",
    hue="target_categorical",
    kind="violin",
);

In [None]:
# Distribution of `protestnumber_log` for each value of `system`.
fig = sns.catplot(data=df, x="protestnumber_log", y="system", kind="violin")

# Resize the underlying matplotlib figure (width, height in inches).
fig.fig.set_size_inches(10, 5)

In [None]:
# Upper cut-off for `yrsoffc`: mean plus three standard deviations.
df["yrsoffc"].mean() + 3 * df["yrsoffc"].std()

In [None]:
# Upper cut-off for `totalseats` (mean + 3 standard deviations), followed by
# the counts of the values that exceed it.
totalseats = df["totalseats"]
high_end = totalseats.mean() + 3 * totalseats.std()
print("High end:", high_end)
totalseats[totalseats > high_end].value_counts()

In [None]:
# The 30 most frequent `totalseats` values.
df["totalseats"].value_counts().head(30)

In [None]:
# Distribution of `numvote`, drawn on an enlarged figure.
fig = sns.displot(df["numvote"])
fig.fig.set_size_inches(25, 12.5)  # width, height in inches

In [None]:
# Distribution of `totalseats`, drawn on an enlarged figure.
fig = sns.displot(df["totalseats"])
fig.fig.set_size_inches(25, 12.5)  # width, height in inches

In [None]:
# Number of rows (recorded protests) per region.
# value_counts() tallies the labels directly instead of materialising a one-hot
# matrix just to sum it; sort_index() keeps the bars in sorted label order.
plt.figure(figsize=(12, 4))
protest_ct = df["region"].value_counts().sort_index()
plt.barh(protest_ct.index, protest_ct.values)
plt.title("Number of *recorded* protests by region")
plt.savefig("../images/protests_by_region.png");

In [None]:
# Number of rows (recorded protests) per government system.
# Start a fresh figure explicitly so the bars never land on a figure left open
# by an earlier cell, and count labels with value_counts() rather than summing
# a one-hot matrix; sort_index() keeps the bars in sorted label order.
plt.figure()
protest_ct = df["system"].value_counts().sort_index()
plt.barh(protest_ct.index, protest_ct.values)
plt.title("Number of *recorded* protests by government type");