### Exploratory Analysis - Mass Mobilization Protests

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import os, sys

from sklearn.preprocessing import OneHotEncoder

# Raise pandas display limits so wide/long frames are shown in full in cell output
pd.options.display.max_columns = 100
pd.options.display.max_rows = 300

# Raw-data directory, relative to the notebook's working directory
path_data = os.path.join('..', 'data', 'raw')

# Placeholder string used to fill missing text values in this notebook
UNKNOWN = 'unknown'

In [2]:
# Load the raw Mass Mobilization protests CSV from the raw-data directory
path_csv = os.path.join(path_data, 'Mass-Mobilization-Protests', 'mmALL_073120_csv.csv')
df = pd.read_csv(path_csv)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17145 entries, 0 to 17144
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     17145 non-null  int64  
 1   country                17145 non-null  object 
 2   ccode                  17145 non-null  int64  
 3   year                   17145 non-null  int64  
 4   region                 17145 non-null  object 
 5   protest                17145 non-null  int64  
 6   protestnumber          17145 non-null  int64  
 7   startday               15239 non-null  float64
 8   startmonth             15239 non-null  float64
 9   startyear              15239 non-null  float64
 10  endday                 15239 non-null  float64
 11  endmonth               15239 non-null  float64
 12  endyear                15239 non-null  float64
 13  protesterviolence      15758 non-null  float64
 14  location               15218 non-null  object 
 15  pa

In [4]:
def print_counts(series):
    """Return how often each distinct value occurs in ``series``.

    Despite the name, nothing is printed: the result of
    ``Series.value_counts()`` is returned so the notebook can render it
    as the cell output.
    """
    return series.value_counts()

In [5]:
print_counts(df.id)

512000000     1
4502005006    1
2202017017    1
2001995002    1
1352011003    1
             ..
2202002009    1
2001998017    1
4901991003    1
6402014010    1
701995005     1
Name: id, Length: 17145, dtype: int64

In [6]:
# countries = df.country.unique()
# countries.sort()
# countries

In [7]:
# print_counts(df[['ccode', 'country']])

In [8]:
# year = df.year.unique()
# year.sort()
# print(year)

# print_counts(df.year)

In [9]:
# print_counts(df.region)

In [10]:
# print_counts(df.protest)

In [11]:
# print_counts(df.protestnumber)

In [12]:
# print(df.loc[df.protestnumber == 0].protesterdemand1.value_counts())
# print(df.loc[df.protestnumber == 0].stateresponse1.value_counts())
# print(df.loc[df.protestnumber == 0].notes.value_counts())

In [13]:
# Keep only the rows flagged as an actual protest and renumber them from 0
df = (
    df.loc[df['protest'] == 1]
      .reset_index(drop=True)
)

##### Notes: drop all rows with df.protest == 0. This means no protest took place, which can be verified by investigating a sample of those entries. Since this analysis studies only protests, non-protest entries are not of interest.

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15239 entries, 0 to 15238
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     15239 non-null  int64  
 1   country                15239 non-null  object 
 2   ccode                  15239 non-null  int64  
 3   year                   15239 non-null  int64  
 4   region                 15239 non-null  object 
 5   protest                15239 non-null  int64  
 6   protestnumber          15239 non-null  int64  
 7   startday               15239 non-null  float64
 8   startmonth             15239 non-null  float64
 9   startyear              15239 non-null  float64
 10  endday                 15239 non-null  float64
 11  endmonth               15239 non-null  float64
 12  endyear                15239 non-null  float64
 13  protesterviolence      15239 non-null  float64
 14  location               15218 non-null  object 
 15  pa

In [15]:
# print_counts(df.startday)

In [16]:
# print_counts(df.startmonth)

In [17]:
# print_counts(df.startyear)

In [18]:
# print_counts(df.endday)

In [19]:
# print_counts(df.endmonth)

In [20]:
# print_counts(df.endyear)

In [21]:
# print_counts(df.protesterviolence)

In [22]:
# df.location.value_counts().shape

##### The 'location' column isn't standardized at all. Avoid using it unless absolutely necessary; the 'country' column should provide the necessary information for an MVP.

Fill missing values (NaN) with UNKNOWN.

In [23]:
df['location'] = df.location.fillna(UNKNOWN)

In [24]:
# df.info()

In [25]:
# df.participants.isna()

In [26]:
# #print(df.participants_category.value_counts())
# df.loc[df.participants.isna()]

### Fill in missing "Participants Category" values using the "Participants" column
This takes some manual changes. Every value that can be converted from string to int is converted automatically; everything that can't is added to a dictionary to be handled manually. This *drastically* reduces the need for manual work.

In [27]:
df.loc[13542]

id                                                              7502017004
country                                                              India
ccode                                                                  750
year                                                                  2017
region                                                                Asia
protest                                                                  1
protestnumber                                                            4
startday                                                              27.0
startmonth                                                             5.0
startyear                                                           2017.0
endday                                                                27.0
endmonth                                                               5.0
endyear                                                             2017.0
protesterviolence        

In [28]:
# Gather array of unique 'participants' values
participants_unique = df.participants.dropna().unique()

# (old, new) substitutions applied IN ORDER to a lower-cased participants string.
# Longer tokens must come before tokens they contain ('campaigner' before
# 'campaign'); otherwise the shorter one fires first and leaves residue ("er")
# that blocks the integer conversion. The final ('s', '') strips plural endings.
_PARTICIPANT_REPLACEMENTS = [
    ('+', ''), (',', ''), ('>', ''), ('<', ''),
    ('about', ''), ('around', ''), ('almost', ''),
    ('more than', ''), ('less than', ''), ('at least', ''),
    ('over', ''), ('nearly', ''), ('up to', ''),
    ('people', ''), ('some', ''), ('estimated', ''),
    ('protester', ''), ('construction worker', ''), ('member', ''),
    ('citizen', ''), ('parent', ''), ('local', ''),
    ('demonstrator', ''), ('teacher', ''), ('activist', ''),
    ('supporter', ''), ('villager', ''),
    ('campaigner', ''), ('campaign', ''),
    ('driver', ''), ('resident', ''), ('participant', ''),
    (' of ', ''), ('_', ''), (' to ', '-'), ('s', ''),
]


def _clean_participant_text(raw):
    """Lower-case ``raw`` and strip the filler words/characters listed above."""
    cleaned = str(raw).lower()
    for old, new in _PARTICIPANT_REPLACEMENTS:
        cleaned = cleaned.replace(old, new)
    return cleaned


def _parse_participant_count(raw):
    """Convert one messy 'participants' value to an int, or return None.

    Tried in order:
      1. ``int(raw)`` on the untouched value.
      2. ``int(...)`` on the cleaned text.
      3. The cleaned text split on '-' ("x-y"): the average of x and y,
         rounded down.

    Only conversion failures (ValueError/TypeError, or IndexError when there
    is no second range bound) are treated as "cannot convert"; anything else
    propagates instead of being silently swallowed.
    """
    try:
        return int(raw)
    except (TypeError, ValueError):
        pass

    cleaned = _clean_participant_text(raw)
    try:
        return int(cleaned)
    except ValueError:
        pass

    bounds = cleaned.split('-')
    try:
        return (int(bounds[0]) + int(bounds[1])) // 2
    except (IndexError, ValueError):
        return None


converter = {}
unable_to_convert = {}

# Create dictionary to convert messy strings to ints
for participant in participants_unique:
    parsed_count = _parse_participant_count(participant)
    if parsed_count is not None:
        converter[participant] = parsed_count
    else:
        # Collect the leftovers so they can be cleaned manually
        print("Couldn't convert: {}  /  {}".format(participant, _clean_participant_text(participant)))
        unable_to_convert[participant] = UNKNOWN

print("MANUALLY CONVERT THE ABOVE VALUES (ALGORITMS COULDN'T SOLVE)")

Couldn't convert: hundreds  /  hundred
Couldn't convert: dozens  /  dozen
Couldn't convert: a group  /  a group
Couldn't convert: busloads  /  buload
Couldn't convert: widespread  /  widepread
Couldn't convert: thousands  /  thouand
Couldn't convert: several hundred  /  everal hundred
Couldn't convert: tens of thousands  /  tenthouand
Couldn't convert: more than 1500 detained  /   1500 detained
Couldn't convert: 230 families  /  230 familie
Couldn't convert: several dozen  /  everal dozen
Couldn't convert: Tens of thousands  /  tenthouand
Couldn't convert: hundreds   /  hundred 
Couldn't convert: hundreds of thousands  /  hundredthouand
Couldn't convert: 164 arrested  /  164 arreted
Couldn't convert: Thousands  /  thouand
Couldn't convert: several dozen arrests  /  everal dozen arret
Couldn't convert: several thousand  /  everal thouand
Couldn't convert: more than 400 arrested  /   400 arreted
Couldn't convert: Hundreds of thousands  /  hundredthouand
Couldn't convert: 20 arrested  /  

In [29]:
df.loc[13542]

id                                                              7502017004
country                                                              India
ccode                                                                  750
year                                                                  2017
region                                                                Asia
protest                                                                  1
protestnumber                                                            4
startday                                                              27.0
startmonth                                                             5.0
startyear                                                           2017.0
endday                                                                27.0
endmonth                                                               5.0
endyear                                                             2017.0
protesterviolence        

In [30]:
# Manually define remaining conversions
# The CSV's first column holds the original 'participants' string and the second
# column the hand-entered count (as used by the zip below).
path_cleaning_dict = os.path.join('..', 'data', 'processed', 'clean_protester_count.csv')
cleaning_dict = pd.read_csv(path_cleaning_dict)
for original, clean in zip(cleaning_dict.iloc[:, 0], cleaning_dict.iloc[:, 1]):
    try:
        converter[original] = int(clean)
    except (TypeError, ValueError, OverflowError):
        # Best effort: rows whose count is blank (NaN) or non-numeric are skipped.
        # Only conversion errors are ignored, so unrelated failures still surface.
        continue

In [31]:
df['participants_category'].value_counts()

100-999       3204
50-99         2508
2000-4999     1580
>10000        1470
5000-10000     642
1000-1999      483
Name: participants_category, dtype: int64

In [32]:
df.loc[13542]

id                                                              7502017004
country                                                              India
ccode                                                                  750
year                                                                  2017
region                                                                Asia
protest                                                                  1
protestnumber                                                            4
startday                                                              27.0
startmonth                                                             5.0
startyear                                                           2017.0
endday                                                                27.0
endmonth                                                               5.0
endyear                                                             2017.0
protesterviolence        

In [34]:
df.loc[13542]['participants']#.map(converter)

'Widespread protests throughout the valley'

In [35]:
# Map every raw 'participants' string to its cleaned integer count. Strings with no
# entry in `converter` become <NA>, hence the nullable 'Int64' dtype.
# Note: this builds a standalone Series; no column is added to df here.
participant_ct_clean_df = df['participants'].map(converter).astype('Int64')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15239 entries, 0 to 15238
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     15239 non-null  int64  
 1   country                15239 non-null  object 
 2   ccode                  15239 non-null  int64  
 3   year                   15239 non-null  int64  
 4   region                 15239 non-null  object 
 5   protest                15239 non-null  int64  
 6   protestnumber          15239 non-null  int64  
 7   startday               15239 non-null  float64
 8   startmonth             15239 non-null  float64
 9   startyear              15239 non-null  float64
 10  endday                 15239 non-null  float64
 11  endmonth               15239 non-null  float64
 12  endyear                15239 non-null  float64
 13  protesterviolence      15239 non-null  float64
 14  location               15239 non-null  object 
 15  pa

In [45]:
participant_ct_clean_df.loc[13542]

<NA>

In [55]:
print(participant_ct_clean_df.value_counts())
print('NA values:', participant_ct_clean_df.isna().sum())

50        3173
100       2610
1000      2316
10000      976
200        453
          ... 
290000       1
3300         1
71           1
52500        1
180000       1
Name: participants, Length: 320, dtype: Int64
NA values: 986


In [74]:
# Key step: isolate the protests whose 'participants' value the converter could not
# turn into a count, but which still have a 'participants_category' that the count
# could potentially be derived from.
na_counts = df.loc[participant_ct_clean_df.isna()]

print(na_counts.participants_category.value_counts())
na_counts_WITH_category = na_counts.loc[~na_counts.participants_category.isna()]
na_counts_WITH_category

100-999       341
2000-4999     218
5000-10000    212
50-99         107
>10000         76
1000-1999      19
Name: participants_category, dtype: int64


Unnamed: 0,id,country,ccode,year,region,protest,protestnumber,startday,startmonth,startyear,endday,endmonth,endyear,protesterviolence,location,participants_category,participants,protesteridentity,protesterdemand1,protesterdemand2,protesterdemand3,protesterdemand4,stateresponse1,stateresponse2,stateresponse3,stateresponse4,stateresponse5,stateresponse6,stateresponse7,sources,notes
35,202016002,Canada,20,2016,North America,1,2,10.0,2.0,2016.0,10.0,2.0,2016.0,0.0,Montreal Pierre Elliot Trudeau International ...,100-999,hundreds,taxi drivers against uber,labor wage dispute,,,,ignore,,,,,,,"cabbies right to back down. the toronto star, ...",taxi drivers who were threatening to block aft...
37,202016004,Canada,20,2016,North America,1,4,20.0,3.0,2016.0,4.0,4.0,2016.0,0.0,"Toronto, Ontario",100-999,hundreds,black lives matter,police brutality,,,,crowd dispersal,,,,,,,"1. police, demonstrators clash at andrew loku ...","ontario s special investigations unit, the civ..."
38,202016005,Canada,20,2016,North America,1,5,24.0,3.0,2016.0,24.0,3.0,2016.0,0.0,"Toronto, Ontario",50-99,dozens,women,"political behavior, process",,,,arrests,,,,,,,"1. in toronto, former star radio host is acqui...","ghomeshi, who first gained fame as a member of..."
40,202016007,Canada,20,2016,North America,1,7,14.0,4.0,2016.0,14.0,4.0,2016.0,0.0,Toronto,50-99,a group,aboriginal activists,"political behavior, process",,,,ignore,,,,,,,standing up for attawapiskat. the toronto star...,a group of aboriginal activists and their supp...
43,202016010,Canada,20,2016,North America,1,10,13.0,10.0,2016.0,11.0,11.0,2016.0,0.0,Labrador,100-999,busloads,indigenous residents,"political behavior, process",,,,arrests,,,,,,,1. protests against muskrat falls hydro projec...,protests. hunger strikes. sit ins that disrupt...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15026,8402017002,Philippines,840,2017,Asia,1,2,15.0,8.0,2017.0,15.0,8.0,2017.0,0.0,Manila,2000-4999,Thousands,protesters organized by the roman catholic chu...,police brutality,,,,ignore,,,,,,,as anti drug push s toll grows in the philippi...,in a dimly lit events room in a church compoun...
15167,8502016002,Indonesia,850,2016,Asia,1,2,16.0,8.0,2016.0,16.0,8.0,2016.0,0.0,"Soewondo Air Force Base, Medan, North Sumatra",100-999,Hundreds,journalists,"political behavior, process",police brutality,,,ignore,,,,,,,indonesian journalists slam military brutality...,hundreds of journalists staged a peaceful rall...
15168,8502016003,Indonesia,850,2016,Asia,1,3,29.0,9.0,2016.0,29.0,9.0,2016.0,0.0,Jakarta,2000-4999,Thousands,union workers,"price increases, tax policy","political behavior, process",,,ignore,,,,,,,"thousands protest unfair tax amnesty, the nati...",thousands of indonesian union workers marched ...
15174,8502017002,Indonesia,850,2017,Asia,1,2,1.0,5.0,2017.0,1.0,5.0,2017.0,0.0,Jakarta,2000-4999,Thousands,workers,"political behavior, process",,,,ignore,,,,,,,"from france to indonesia, marking may day with...",thousands of workers marched toward the presid...


In [None]:
na_counts_WITH_category

In [None]:
# TODO (work in progress -- start here):
#   1. Apply convert_range_to_average to the 'participants_category' of every
#      protest in na_counts_WITH_category.
#   2. Update the count values in the dataframe with these new averages.

def convert_range_to_average(string):
    """Return the integer midpoint of a range written as "x-y".

    Both bounds are converted to ints before averaging and the average is
    truncated to an int (e.g. '50-99' -> 74).

    Raises:
        ValueError: if ``string`` is not exactly two integers separated by a
            single '-' (for example the open-ended category '>10000').
    """
    # Split "x-y" format
    bounds = str.split(string, '-')
    if len(bounds) != 2:
        raise ValueError("Expected a range of the form 'x-y', got: {!r}".format(string))

    # Convert both "x" and "y" to ints before averaging
    low, high = int(bounds[0]), int(bounds[1])
    return int((low + high) / 2)

In [69]:
# Build {category label: [low, high]} with inclusive integer bounds.
category_labels = df.participants_category.dropna().unique()
ranges = [label.split('-') for label in category_labels]
categories_dict = dict(zip(category_labels, ranges))
# The open-ended top bucket has no "x-y" form, so its bounds are set by hand
categories_dict['>10000'] = [10001, 10**15]
categories_dict = {
    label: [int(bounds[0]), int(bounds[1])]
    for label, bounds in categories_dict.items()
}

In [70]:
categories_dict

{'50-99': [50, 99],
 '100-999': [100, 999],
 '2000-4999': [2000, 4999],
 '1000-1999': [1000, 1999],
 '5000-10000': [5000, 10000],
 '>10000': [10001, 1000000000000000]}

In [None]:
ranges

In [None]:
# Overview: histogram of all cleaned participant counts (missing values dropped)
plt.figure()
participant_ct_clean_df.dropna().sort_values().hist(bins=25);
plt.title('Participant Turnout Hist')


# Detail: many narrow bins with the x-axis limited to 0-50,000 to show the lower values
plt.figure()
participant_ct_clean_df.dropna().sort_values().hist(bins=2000)
plt.xlim([0, 50000])
plt.title('Participant Turnout Hist (Lower Values)');

In [None]:
df.info()

In [None]:
categories_dict

In [None]:
df.loc[13542]

In [None]:
categories_dict

In [None]:
participant_ct_clean_df.loc[13542]

In [None]:
df.loc[13542]


In [None]:
df.participants_category.value_counts()

In [None]:
# NOTE(review): `stop` is an undefined name, so this cell raises NameError --
# presumably a deliberate way to halt "Run All" here. Remove before sharing.
stop

In [None]:
def pick_bucket(x):
    """Return the label of the participant-count bucket that contains ``x``.

    Buckets come from the module-level ``categories_dict`` (label ->
    [low, high], both bounds inclusive) and are checked in dict order.
    Missing input, or a value that falls in no bucket, yields ``np.nan``.
    """
    if not pd.isna(x):
        for label, bounds in categories_dict.items():
            if bounds[0] <= x <= bounds[1]:
                return label
    return np.nan

# Bucket every cleaned count; going through a plain array gives the frame a fresh 0..n-1 index
participant_cat_clean_df = pd.DataFrame(
    {'participants_category': np.array(participant_ct_clean_df.apply(pick_bucket))}
)

In [None]:
df.info()

In [None]:
participant_cat_clean_df.loc[13542]

#### Move on beyond Protester Counts

Next: protesteridentity

In [None]:
# Replace unknowns with unknown for consistency.
# Assign the result back instead of calling replace(..., inplace=True) on the
# attribute-accessed column: that form mutates a temporary and is a no-op under
# pandas Copy-on-Write.
df['protesteridentity'] = (
    df['protesteridentity']
    .replace('unspecified', UNKNOWN)
    .fillna(UNKNOWN)
)

df.protesteridentity.value_counts()

### Protester Demands

In [None]:
# Print the value frequencies of each demand column, with a divider between columns
for position, demand_column in enumerate(
        ['protesterdemand1', 'protesterdemand2', 'protesterdemand3', 'protesterdemand4']):
    if position > 0:
        print('-----')
    print(df[demand_column].value_counts())

In [None]:
# Collect the distinct (non-missing) values of each of the four demand columns
demands = np.concatenate([
    df[column].dropna().unique()
    for column in ('protesterdemand1', 'protesterdemand2',
                   'protesterdemand3', 'protesterdemand4')
])
# De-duplicate to check against the 7 demands listed in the data manual
set(demands)

In [None]:
demand_cols = ['protesterdemand1', 'protesterdemand2', 'protesterdemand3', 'protesterdemand4']

# Replace '.' with np.nan
# Assigned back in one step: df[col].replace(..., inplace=True) operates on a
# temporary column object and does nothing under pandas Copy-on-Write.
df[demand_cols] = df[demand_cols].replace('.', np.nan)

In [None]:
df.info()

In [None]:
# `sparse` was renamed `sparse_output` in scikit-learn 1.2 (old name later removed);
# try the new keyword first and fall back for older installs.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Learn the categories from ALL demand columns at once. Fitting on the first
# column only makes transform() raise on any demand that never appears in
# 'protesterdemand1'.
ohe.fit(df[demand_cols].values.reshape(-1, 1))

# A protest carries a demand if it appears in ANY of the demand columns, so the
# per-column indicator matrices are OR-ed together.
demand_array = np.zeros((len(df), len(ohe.categories_[0])), dtype=bool)
for demand_col in demand_cols:
    fit = ohe.transform(df[demand_col].values.reshape(-1, 1))
    demand_array = np.logical_or(demand_array, fit)
    print('Protester demand count:', np.sum(demand_array))

# get_feature_names was replaced by get_feature_names_out in newer scikit-learn
if hasattr(ohe, 'get_feature_names_out'):
    col_names = ohe.get_feature_names_out(['demands'])
else:
    col_names = ohe.get_feature_names(['demands'])

# Clean column names
remove_commas = lambda x: str.replace(x, ', ', '/')
remove_spaces = lambda x: str.replace(x, ' ', '-')
col_names = list(map(remove_spaces, map(remove_commas, col_names)))

# Store as dataframe (same index as df so a later column-wise concat lines up)
demand_df = pd.DataFrame(demand_array, dtype='int', columns=col_names, index=df.index)
# The NaN indicator column only marks "no demand in this slot"; drop it if present
demand_df = demand_df.drop(columns='demands_nan', errors='ignore')

print('Final shape:', demand_df.shape)
print('Column names:', demand_df.columns)

### State responses

In [None]:
# column names containing state response data
response_cols = ['stateresponse1', 'stateresponse2', 'stateresponse3', 'stateresponse4',
                 'stateresponse5', 'stateresponse6', 'stateresponse7']

# Replace '.' with np.nan
# Assigned back in one step: df[col].replace(..., inplace=True) operates on a
# temporary column object and does nothing under pandas Copy-on-Write.
df[response_cols] = df[response_cols].replace('.', np.nan)

In [None]:
# Check the distinct state responses against the 7 listed in the data manual:
# pool the values of every response column into one array, then de-duplicate.
demands = np.concatenate([np.array([])] + [df[col] for col in response_cols])
print(set(demands))
# Expected: the 7 response types, plus missing-value placeholders that are dropped later

In [None]:
# `sparse` was renamed `sparse_output` in scikit-learn 1.2 (old name later removed);
# try the new keyword first and fall back for older installs.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Learn the categories from ALL response columns at once. Fitting on the first
# column only makes transform() raise on any response that never appears in
# 'stateresponse1'.
ohe.fit(df[response_cols].values.reshape(-1, 1))

# get_feature_names was replaced by get_feature_names_out in newer scikit-learn
if hasattr(ohe, 'get_feature_names_out'):
    raw_col_names = ohe.get_feature_names_out(['responses'])
else:
    raw_col_names = ohe.get_feature_names(['responses'])
print('Column_names:', raw_col_names)

# A protest received a response if it appears in ANY of the response columns,
# so the per-column indicator matrices are OR-ed together.
response_array = np.zeros((len(df), len(ohe.categories_[0])), dtype=bool)
for col in response_cols:
    fit = ohe.transform(df[col].values.reshape(-1, 1))
    response_array = np.logical_or(response_array, fit)
    print('State response count:', np.sum(response_array))

print('\nPre-cleaning column names:', raw_col_names)

# Clean column names
remove_spaces = lambda x: str.replace(x, ' ', '-')
col_names = list(map(remove_spaces, raw_col_names))
print('\nPost-cleaning column names:', col_names)

# Store as dataframe (same index as df so a later column-wise concat lines up)
response_df = pd.DataFrame(response_array, dtype='int', columns=col_names, index=df.index)
# The NaN indicator column only marks "no response in this slot"; drop it if present
response_df = response_df.drop(columns='responses_nan', errors='ignore')

print('\nFinal shape:', response_df.shape)
print('\nColumn names:', response_df.columns)

In [None]:
df.columns

### Investigate "sources" and "notes" columns

In [None]:
#df.sources.value_counts()

In [None]:
#df.notes.value_counts()

##### As can be seen above, the 'sources' and 'notes' columns don't contain standardized text. It is best to fill NAs so that rows aren't dropped just because these columns aren't filled in.

In [None]:
df['notes'] = df['notes'].fillna(UNKNOWN)
df['sources'] = df['sources'].fillna(UNKNOWN)

### Combine cleaned columns into one dataframe

In [None]:
df.info()

In [None]:
demand_cols

In [None]:
response_cols

In [None]:
exclude_cols = ['participants_category', 'participants']+demand_cols+response_cols
df.drop(exclude_cols, axis=1).info()

In [None]:
# Start from df without the raw participant/demand/response columns ...
df_clean = df.drop(exclude_cols, axis=1)

# ... then append the cleaned replacements column-wise. pd.concat(axis=1) aligns on
# the index, so every piece must share df's index.
# NOTE(review): assumes all pieces carry the same 0..n-1 index as df -- confirm.
df_clean = pd.concat([df_clean, participant_ct_clean_df, participant_cat_clean_df, demand_df, response_df], axis=1)

df_clean.info()

In [None]:
df_clean['participants_category'].isna().sum()

In [None]:
temp = df_clean.loc[df_clean['participants_category'].isna()]
temp.id

In [None]:
df_clean['protesteridentity'].value_counts()

In [None]:
# Scratch copy of df with missing protester identities filled, inspected via .info()
df_temp = df.copy()

df_temp['protesteridentity'] = df_temp['protesteridentity'].fillna(UNKNOWN)
df_temp.info()

In [None]:
temp.id.index

In [None]:
#df.loc[temp.id.index]

In [None]:
# Carry the original (raw) participant columns over from df under *_old names so
# they sit next to the cleaned versions in df_clean
df_clean['participants_category_old'] = df['participants_category']
df_clean['participants_old'] = df['participants']

In [None]:
df_clean.info()

In [None]:
# Narrow view holding only the id plus the cleaned and original participant columns
participant_cols = ['id', 'participants', 'participants_category', 'participants_category_old', 'participants_old']

part = df_clean[participant_cols]

part.info()

In [None]:
part.loc[part.participants_category.isna()]

In [None]:
# TODO: double-check that the participant counts and categorizations are correct for index 13542 as a QC.