In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sqlalchemy import create_engine
import sys

# Custom functions

sys.path.insert(1, '../src')
from custom_plots import *


# Options
pd.options.display.max_rows = 200
pd.options.display.max_columns = 200
%matplotlib inline
sns.set_theme(style="white")

# Convenience for working with external src code files
%load_ext autoreload
%autoreload 2


# NOTE THAT SETTING THESE PARAMETERS ELIMINATES "MASK" FUNCTIONALITY ON CMAP
# FONT_SIZE=20
# sns.set(rc={
#     'figure.figsize': (df.shape[1]/2,df.shape[1]/3),
#     'font.size': FONT_SIZE,
#     'axes.titlesize': FONT_SIZE, 
#     'xtick.labelsize': FONT_SIZE, 
#     'ytick.labelsize': FONT_SIZE,
#     'axes.labelsize': FONT_SIZE})

## Import protest data

In [2]:
# Load the modeled dataset produced by the Modeling notebook
engine = create_engine('sqlite:///../data/processed/all_data.db')
with engine.begin() as connection:
    df = pd.read_sql('SELECT * FROM all_modeled_data', connection)

# Text label for the target: 1 -> "Revolution", anything else -> "No Revolution"
df['target_categorical'] = np.where(df.target == 1, 'Revolution', 'No Revolution')

# Boolean targets: alternative_target strictly below each threshold
# (thresholds are presumably in days — confirm against the Modeling notebook)
for horizon in (30, 90, 180, 365):
    df[f'target{horizon}'] = df.alternative_target < horizon

# Keep the raw alternative target under a new name, then drop the source columns
df['target_alltime'] = df.alternative_target
df = df.drop(columns=['target', 'alternative_target'])

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14558 entries, 0 to 14557
Data columns (total 34 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   region                             14558 non-null  object 
 1   protestnumber                      14558 non-null  int64  
 2   protesterviolence                  14558 non-null  int64  
 3   duration_days                      14558 non-null  int64  
 4   participants                       14558 non-null  int64  
 5   participants_category              14558 non-null  object 
 6   demand_labor-wage-dispute          14558 non-null  int64  
 7   demand_land-farm-issue             14558 non-null  int64  
 8   demand_police-brutality            14558 non-null  int64  
 9   demand_political-behavior/process  14558 non-null  int64  
 10  demand_price-increases/tax-policy  14558 non-null  int64  
 11  demand_removal-of-politician       14558 non-null  int

In [None]:
# Sort every column of df into one bucket by variable type.
demands = [
    'demand_labor-wage-dispute',
    'demand_land-farm-issue',
    'demand_police-brutality',
    'demand_political-behavior/process',
    'demand_price-increases/tax-policy',
    'demand_removal-of-politician',
    'demand_social-restrictions',
]
binary = demands + ['protesterviolence', 'finittrm', 'military', 'legelec', 'exelec']

continuous = [
    'protestnumber', 'duration_days', 'participants', 'yrsoffc', 'totalseats',
    'startyear', 'liec', 'eiec', 'numvote', 'oppvote', 'gq',
    'tensys_strict', 'stabs_strict',
]
categorical = ['region', 'participants_category', 'system']
targets = [
    'target30', 'target90', 'target180', 'target365',
    'target_categorical', 'target_alltime',
]

# Columns not claimed by any bucket. drop() raises KeyError if a listed
# column is missing from df, so this also checks the lists against the frame.
assigned = continuous + categorical + binary + targets
remainders = df.drop(columns=assigned).columns

if len(remainders) > 0:
    print('Remaining columns to be assigned to various bucket:')
    for col in remainders:
        print(col, '\n', df[col].value_counts(), '\n')
else:
    print("All features successfully bucketed")

## Correlation matrix

In [None]:
# Correlation matrix over the whole frame, drawn by a project helper.
# NOTE(review): custom_plot_matrix is not defined in this notebook — presumably
# it comes from the custom_plots wildcard import; the meaning of max_corr is
# not visible here — confirm against ../src/custom_plots.py.
custom_plot_matrix(df, max_corr=0.5)

### Protest number

In [None]:
# Histogram of protestnumber (an int64 column) with one bar per integer value.
# discrete=True gives unit-width bins centred on each integer. Do not use
# bins=(max - min): that yields one bin too few, because the last bin is
# closed on the right and the two largest values end up sharing a bar; it
# also asks for zero bins when every value is identical.
fig = sns.displot(df.protestnumber, discrete=True)  # displot returns a FacetGrid
fig.fig.set_size_inches(15, 6)  # width, height in inches
plt.title('Protestnumber');

### Protest start year

In [None]:
# Distribution of protest start years; binning is left to seaborn's default
fig = sns.displot(df.startyear)  # displot returns a FacetGrid
fig.fig.set_size_inches(10, 5)  # width, height in inches
plt.title('Startyear');

### Protest duration

In [None]:
# Ten most frequent protest durations
print('Value counts (top 10):\n', df.duration_days.value_counts().head(10))

# Boxplot of all durations
plt.figure()
df[['duration_days']].boxplot()
plt.title('All protest durations')

# Histogram of all durations. duration_days is an int64 column, so
# discrete=True draws one unit-width bar per integer value. Do not use
# bins=(max - min): it yields one bin too few, merging the two largest
# values into the final right-closed bin.
fig = sns.displot(df.duration_days, discrete=True)  # displot returns a FacetGrid
fig.fig.set_size_inches(10, 5)  # width, height in inches
plt.title('Duration (days)');


# Same histogram restricted to protests with duration_days > 1, so the
# remaining values are not dwarfed by the bars at 0 and 1
more_than_one_day = df.loc[df.duration_days > 1, 'duration_days']
fig = sns.displot(more_than_one_day, discrete=True)
fig.fig.set_size_inches(10, 5)
plt.title('Duration (days > 1)');

### Participant counts

In [None]:
# Twenty most frequent raw participant counts
print('Value counts (top 20):\n', df.participants.value_counts().head(20))

# Distribution of raw participant counts; binning is left to seaborn's default
fig = sns.displot(df.participants)  # displot returns a FacetGrid
fig.fig.set_size_inches(10, 5)  # width, height in inches
plt.title('Participant Count');

In [None]:
# Size buckets listed explicitly so the bars appear in this order rather than
# in value_counts() frequency order
categories = ['50-99', '100-999', '1000-1999', '2000-4999', '5000-9999', '10000+']

# Number of protests per bucket; a bucket missing from the data raises KeyError
lookup = dict(df.participants_category.value_counts())
participants = [lookup[category] for category in categories]

plt.figure(figsize=(8, 4))
plt.bar(categories, participants)
plt.title('Protest participants');

### Protester Demands

In [None]:
# Column sums of the demand columns (bucketed as binary earlier in the
# notebook), i.e. how many protests raised each demand if they are 0/1 flags
demand_totals = df[demands].sum()

plt.figure(figsize=(12, 4))
plt.barh(demand_totals.index, demand_totals.values)
plt.title('Protest demands');

 **Target:** We see that the protester demand for "removal of politician" is most strongly correlated with an impending revolution. 
 
 **Alternative target:** We see that the rem

In [None]:
# Correlation matrix restricted to the demand columns and the target columns.
# NOTE(review): custom_plot_matrix is not defined in this notebook — presumably
# it comes from the custom_plots wildcard import; the meaning of max_corr is
# not visible here — confirm against ../src/custom_plots.py.
custom_plot_matrix(df[demands+targets], max_corr=0.2)

In [None]:
# NOTE(review): `stop` is not defined anywhere in this notebook, so this cell
# presumably raises NameError on purpose to halt "Run All" before the
# exploratory cells below (unless custom_plots exports a `stop` — confirm).
# Consider an explicit `raise` with a message, or removing the scratch cells.
stop

In [None]:
# Violin plot of start year (x) for each outcome label (y)
sns.catplot(
    data=df,
    x='startyear',
    y='target_categorical',
    kind="violin",
)

In [None]:
# Pairwise plots across the columns of df.
# NOTE(review): df has 34 columns and 14558 rows per the df.info() output, so
# this grid is likely very slow and hard to read — consider passing a column
# subset via `vars=` or a row sample.
sns.pairplot(df)

In [None]:
# Violin plot of military (y) against protesterviolence (x), split by outcome label
sns.catplot(
    data=df,
    x='protesterviolence',
    y='military',
    hue='target_categorical',
    kind="violin",
)

# Alternative view, currently disabled:
# plt.scatter(data=df, x='startyear', y='participants')

In [None]:
# Swarm plot of start year for each value of the first demand column.
# NOTE(review): a swarm plot positions every row individually; with the
# 14558 rows reported by df.info() this may be slow — consider sampling.
fig = sns.catplot(x=demands[0], y="startyear", data=df, kind='swarm')  # FacetGrid
fig.fig.set_size_inches(12, 6)  # width, height in inches

# Must be called: a bare `plt.show` only names the function, so the cell
# would echo its repr instead of explicitly showing the figure.
plt.show()

In [None]:
# Scatter of the first demand column (x) against target_alltime (y), the
# unthresholded copy of alternative_target made when the data was loaded.
plt.scatter(df[demands[0]], df['target_alltime'])