### Exploratory Analysis - Mass Mobilization Protests

In [63]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as sk
import os, sys
import math

from sklearn.preprocessing import OneHotEncoder

pd.options.display.max_columns = 100
pd.options.display.max_rows = 300

path_data = os.path.join('..', 'data', 'raw')

UNKNOWN = 'unknown'

In [2]:
path_csv = os.path.join(path_data, 'Mass-Mobilization-Protests', 'mmALL_073120_csv.csv')
df = pd.read_csv(path_csv)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17145 entries, 0 to 17144
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     17145 non-null  int64  
 1   country                17145 non-null  object 
 2   ccode                  17145 non-null  int64  
 3   year                   17145 non-null  int64  
 4   region                 17145 non-null  object 
 5   protest                17145 non-null  int64  
 6   protestnumber          17145 non-null  int64  
 7   startday               15239 non-null  float64
 8   startmonth             15239 non-null  float64
 9   startyear              15239 non-null  float64
 10  endday                 15239 non-null  float64
 11  endmonth               15239 non-null  float64
 12  endyear                15239 non-null  float64
 13  protesterviolence      15758 non-null  float64
 14  location               15218 non-null  object 
 15  pa

In [4]:
def print_counts(series):
    unique = series.value_counts()
    return unique

In [5]:
print_counts(df.id)

512000000     1
4502005006    1
2202017017    1
2001995002    1
1352011003    1
             ..
2202002009    1
2001998017    1
4901991003    1
6402014010    1
701995005     1
Name: id, Length: 17145, dtype: int64

In [6]:
# countries = df.country.unique()
# countries.sort()
# countries

In [7]:
# print_counts(df[['ccode', 'country']])

In [8]:
# year = df.year.unique()
# year.sort()
# print(year)

# print_counts(df.year)

In [9]:
# print_counts(df.region)

In [10]:
# print_counts(df.protest)

In [11]:
# print_counts(df.protestnumber)

In [12]:
# print(df.loc[df.protestnumber == 0].protesterdemand1.value_counts())
# print(df.loc[df.protestnumber == 0].stateresponse1.value_counts())
# print(df.loc[df.protestnumber == 0].notes.value_counts())

In [13]:
df = df.loc[df.protest == 1].copy()
df.reset_index(inplace=True, drop=True)

##### Notes: drop all rows with df.protest == 0. This means no protest took place, which can be verified by investigating a sample of those entries. Since this analysis studies only protests, non-protest entries are not of interest.

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15239 entries, 0 to 15238
Data columns (total 31 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     15239 non-null  int64  
 1   country                15239 non-null  object 
 2   ccode                  15239 non-null  int64  
 3   year                   15239 non-null  int64  
 4   region                 15239 non-null  object 
 5   protest                15239 non-null  int64  
 6   protestnumber          15239 non-null  int64  
 7   startday               15239 non-null  float64
 8   startmonth             15239 non-null  float64
 9   startyear              15239 non-null  float64
 10  endday                 15239 non-null  float64
 11  endmonth               15239 non-null  float64
 12  endyear                15239 non-null  float64
 13  protesterviolence      15239 non-null  float64
 14  location               15218 non-null  object 
 15  pa

In [15]:
# print_counts(df.startday)

In [16]:
# print_counts(df.startmonth)

In [17]:
# print_counts(df.startyear)

In [18]:
# print_counts(df.endday)

In [19]:
# print_counts(df.endmonth)

In [20]:
# print_counts(df.endyear)

In [21]:
# print_counts(df.protesterviolence)

In [22]:
# df.location.value_counts().shape

##### 'Locations' column isn't at all standardized. Avoid using unless absolutely necessary. The 'Country' column should provide the necessary information for an MVP.

Fill na's with UNKNOWN

In [23]:
df['location'] = df.location.fillna(UNKNOWN)

In [24]:
# df.info()

In [25]:
# df.participants.isna()

In [26]:
# #print(df.participants_category.value_counts())
# df.loc[df.participants.isna()]

### Fill in missing "Participants Category" value using the "Participants" column. 
This takes some manual  changes. Everything that can be changed from string to int is done automatically. Everything that can't be done automatically is added to a dictionary to be done manually. This *drastically* reduces any need for manual work

In [27]:
df.loc[13542]

id                                                              7502017004
country                                                              India
ccode                                                                  750
year                                                                  2017
region                                                                Asia
protest                                                                  1
protestnumber                                                            4
startday                                                              27.0
startmonth                                                             5.0
startyear                                                           2017.0
endday                                                                27.0
endmonth                                                               5.0
endyear                                                             2017.0
protesterviolence        

In [249]:
# Convert range in format "a-b" to average(a, b)
def convert_range_to_average(range_string):

    if range_string is np.nan:
        return np.nan
    
    # Split "x-y" format
    range_list = str.split(range_string, '-')
    
    if len(range_list)==1:
        avg = range_list[0].replace('<', '').replace('>', '')
    
    elif len(range_list)==2:
        # Convert both "x" and "y" to ints before averaging
        avg = np.mean([int(range_list[0]), int(range_list[1])])
    
    else:
        raise error

    # Return average
    return int(avg)

In [368]:
# ----- CREATE 'CONVERTER' DICTIONARY -----
# also create dictionary of 'participant' values that couldn't be converted

# Gather array of unique 'participants' values
participants_unique = df.participants.dropna().unique()

# Create empty dictionaries to hold lookups
converter = {}

# Create dictionary to convert messy strings to ints
#print("COULDN'T CONVERT THE FOLLOWING VALUES:\n")
for participant in participants_unique:

    # Else remove commonly unnecessary characters
    participant_clean = str.lower(participant)\
                           .replace('+', '')\
                           .replace(',', '')\
                           .replace('>', '')\
                           .replace('<', '')\
                           .replace('about', '')\
                           .replace('around', '')\
                           .replace('almost','')\
                           .replace('more than','')\
                           .replace('less than', '')\
                           .replace('at least', '')\
                           .replace('over', '')\
                           .replace('nearly', '')\
                           .replace('up to', '')\
                           .replace('people', '')\
                           .replace('some', '')\
                           .replace('estimated', '')\
                           .replace('protester', '')\
                           .replace('construction worker', '')\
                           .replace('member', '')\
                           .replace('citizen', '')\
                           .replace('parent', '')\
                           .replace('local', '')\
                           .replace('demonstrator', '')\
                           .replace('teacher', '')\
                           .replace('activist', '')\
                           .replace('supporter', '')\
                           .replace('villager', '')\
                           .replace('campaign', '')\
                           .replace('campaigner', '')\
                           .replace('driver', '')\
                           .replace('resident', '')\
                           .replace('participant', '')\
                           .replace(' of ', '')\
                           .replace('_', '')\
                           .replace(' to ', '-')#\
                           #.replace('s', '')
    # Try converting clean value
    try:

        # Append to converter dictionary if successful
        converter[participant] = int(participant_clean)

    # Else convert values of format "x-y"
    except:
        try:

            converter[participant] = convert_range_to_average(participant_clean)

        # Else create dict to manually clean
        except:
            #print("Couldn't convert: {}  /  {}".format(participant, participant_clean))
            pass



In [369]:
# Identify the rows that weren't able to be converted using the dict above. 
# Estimate whether or not it is practical top use "participants_category"
# to average values within a range and use that value within participants_clean

temp = df[['id', 'participants', 'participants_category']].copy()
temp['participants_clean'] = df['participants'].map(converter)#.astype('Int64')

nas = temp.loc[temp['participants_clean'].isna()]
print('Remaining rows:', nas.shape[0])
print('Examples include:')


nas.sort_values(by='participants').head(25)

Remaining rows: 4171
Examples include:


Unnamed: 0,id,participants,participants_category,participants_clean
1410,1012017014,"""Rocked by protests""",100-999,
15087,8501998019,.,,
1633,1302005002,.,,
15100,8501999009,.,,
11631,6602002005,.,,
12975,7321990014,.,,
11615,6601992002,.,,
6651,3641990010,1 million,>10000,
1144,1011992007,"1,000,000s",,
1143,1011992006,"1,000,000s",,


In [370]:
# Apply dictionary to convert original participant counts
counts_converted = df['participants'].map(converter)#.astype('Int64')

# Create a series to hold the averages converted from the range column
range_as_average = df['participants_category'].apply(convert_range_to_average)

# Choose the count value unless it is NaN, in which case use avg value
chose_non_nan = lambda avg, count: avg if math.isnan(float(count)) else count

participants = pd.DataFrame(map(chose_non_nan, range_as_average, counts_converted), columns=['participants'])
print('Total remaining NaN values:', participants.participants.isna().sum())

Total remaining NaN values: 798


In [371]:
df['participants_clean'] = participants

#### YAY the above worked to fill in missing Participant values from Participant Categories values

In [372]:
participant_categories_dirty = df.participants_category
participant_categories_dirty

0              NaN
1              NaN
2              NaN
3              NaN
4              NaN
           ...    
15234      100-999
15235    1000-1999
15236        50-99
15237        50-99
15238      100-999
Name: participants_category, Length: 15239, dtype: object

In [373]:
categories_dict = df.participants_category.dropna().unique()
ranges = [str.split(cat, '-') for cat in categories_dict]
categories_dict = dict(zip(categories_dict, ranges))
categories_dict['>10000'] = [10001, 10**15]
for cat in categories_dict:
    lo = int(categories_dict[cat][0])
    hi = int(categories_dict[cat][1])
    categories_dict[cat] = [lo, hi]



def pick_bucket(x):
    for key in categories_dict:
        lim_low = categories_dict[key][0]
        lim_hi = categories_dict[key][1]
        #print(x)
        if (x >= lim_low) and (x <= lim_hi):
            return key
    return np.nan

participant_categories_applied = pd.DataFrame(map(pick_bucket, participants.participants), columns = ['participants_category'])
df['participant_categories_applied'] = participant_categories_applied

participant_categories_applied

Unnamed: 0,participants_category
0,
1,1000-1999
2,100-999
3,
4,100-999
...,...
15234,100-999
15235,1000-1999
15236,50-99
15237,50-99


In [379]:
categories_dict

{'50-99': [50, 99],
 '100-999': [100, 999],
 '2000-4999': [2000, 4999],
 '1000-1999': [1000, 1999],
 '5000-10000': [5000, 10000],
 '>10000': [10001, 1000000000000000]}

#### Now, check if the above categories match the Categories column in df as a QC

In [374]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15239 entries, 0 to 15238
Data columns (total 33 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              15239 non-null  int64  
 1   country                         15239 non-null  object 
 2   ccode                           15239 non-null  int64  
 3   year                            15239 non-null  int64  
 4   region                          15239 non-null  object 
 5   protest                         15239 non-null  int64  
 6   protestnumber                   15239 non-null  int64  
 7   startday                        15239 non-null  float64
 8   startmonth                      15239 non-null  float64
 9   startyear                       15239 non-null  float64
 10  endday                          15239 non-null  float64
 11  endmonth                        15239 non-null  float64
 12  endyear                         

In [375]:
df_participants = df[['participants_category', 'participants', 'participants_clean', 'participant_categories_applied']]
df_participants

Unnamed: 0,participants_category,participants,participants_clean,participant_categories_applied
0,,1000s,,
1,,1000,1000.0,1000-1999
2,,500,500.0,100-999
3,,100s,,
4,,950,950.0,100-999
...,...,...,...,...
15234,100-999,100+,100.0,100-999
15235,1000-1999,About 1000,1000.0,1000-1999
15236,50-99,50+,50.0,50-99
15237,50-99,50+,50.0,50-99


In [378]:
df_participants.loc[(df_participants.participants_category != df_participants.participant_categories_applied) & (~df_participants.participants_category.isna())]

Unnamed: 0,participants_category,participants,participants_clean,participant_categories_applied
57,100-999,50+,50.0,50-99
242,100-999,50+,50.0,50-99
256,>10000,tens of thousands,10000.0,5000-10000
268,>10000,tens of thousands,10000.0,5000-10000
316,>10000,tens of thousands,10000.0,5000-10000
...,...,...,...,...
15023,>10000,"About 12,000 protesters",10000.0,5000-10000
15025,>10000,"At least 10,000 people",10000.0,5000-10000
15169,>10000,Tens of thousands,10000.0,5000-10000
15177,>10000,About 10000,10000.0,5000-10000


In [None]:
participant_ct_clean_df.loc[13542]

In [None]:
print(participant_ct_clean_df.value_counts())
print('NA values:', participant_ct_clean_df.isna().sum())

In [None]:
# THIS CELL IS KEY IS ISOLATING THE PROTESTS THAT WEREN'T CLASSIFIED BY THE CONVERTER YET HAVE POTENTIAL OF BEING CONVERTED BY THEIR CATEGORY
na_counts = df.loc[participant_ct_clean_df.isna()]

print(na_counts.participants_category.value_counts())
na_counts_WITH_category = na_counts.loc[~na_counts.participants_category.isna()]
na_counts_WITH_category

In [None]:
na_counts_WITH_category.participants_category.apply(convert_range_to_average)

In [None]:
for item in categories_dict:
    print(convert_range_to_average(item))

In [None]:
categories_dict

In [None]:
categories_dict

In [None]:
ranges

In [None]:
plt.figure()
participant_ct_clean_df.dropna().sort_values().hist(bins=25);
plt.title('Participant Turnout Hist')


plt.figure()
participant_ct_clean_df.dropna().sort_values().hist(bins=2000)
plt.xlim([0, 50000])
plt.title('Participant Turnout Hist (Lower Values)');

In [None]:
df.info()

In [None]:
categories_dict

In [None]:
df.loc[13542]

In [None]:
categories_dict

In [None]:
participant_ct_clean_df.loc[13542]

In [None]:
df.loc[13542]


In [None]:
df.participants_category.value_counts()

In [None]:
stop

In [None]:
def pick_bucket(x):
    if pd.isna(x):
        return np.nan
    
    for key in categories_dict:
        lim_low = categories_dict[key][0]
        lim_hi = categories_dict[key][1]
        
        if (x >= lim_low) and (x <= lim_hi):
            return key
    return np.nan

participant_cat_clean_df = pd.DataFrame(np.array(participant_ct_clean_df.apply(pick_bucket)))
participant_cat_clean_df.columns = ['participants_category']

In [None]:
df.info()

In [None]:
participant_cat_clean_df.loc[13542]

#### Move on beyond Protester Counts

Next: protesteridentity

In [None]:
# Replace unknowns with unknown for consistency
df.protesteridentity.replace('unspecified', UNKNOWN, inplace=True)
df['protesteridentity'] = df['protesteridentity'].fillna(UNKNOWN)


df.protesteridentity.value_counts()

### Protester Demands

In [None]:
print(df.protesterdemand1.value_counts())
print('-----')
print(df.protesterdemand2.value_counts())
print('-----')
print(df.protesterdemand3.value_counts())
print('-----')
print(df.protesterdemand4.value_counts())

In [None]:
# Create list of unique demands
demands = np.concatenate([df.protesterdemand1.dropna().unique(), 
                          df.protesterdemand2.dropna().unique(),
                          df.protesterdemand3.dropna().unique(), 
                          df.protesterdemand4.dropna().unique()])
# Verify there are only 7 demands as indicated by data manual
set(demands)

In [None]:
demand_cols = ['protesterdemand1', 'protesterdemand2', 'protesterdemand3', 'protesterdemand4']

# Replace '.' with np.nan
for demand_col in demand_cols:
    df[demand_col].replace('.', np.nan, inplace=True)

In [None]:
df.info()

In [None]:
ohe = OneHotEncoder(sparse=False)


for demand_col in demand_cols:
    if demand_col == demand_cols[0]: #first iteration
        fit = ohe.fit_transform(df[demand_col].values.reshape(-1, 1))
        demand_array = np.array(fit)     
        print('Protester demand count:', np.sum(demand_array))
    else:
        fit = ohe.transform(df[demand_col].values.reshape(-1, 1))
        demand_array = np.logical_or(demand_array, fit)
        print('Protester demand count:', np.sum(demand_array))
        


# Clean column names
remove_commas = lambda x: str.replace(x, ', ', '/')
remove_spaces = lambda x: str.replace(x, ' ', '-')
col_names = ohe.get_feature_names(['demands'])
col_names = list(map(remove_spaces, map(remove_commas, col_names)))

# Store as dataframe 
demand_df = pd.DataFrame(demand_array, dtype='int', columns=col_names)
demand_df.drop('demands_nan', axis=1, inplace=True)

print('Final shape:', demand_df.shape)
print('Column names:', demand_df.columns)

### State responses

In [None]:
# column names containing state response data
response_cols = ['stateresponse1', 'stateresponse2', 'stateresponse3', 'stateresponse4', 
             'stateresponse5', 'stateresponse6', 'stateresponse7']

# Replace '.' with np.nan
for col in response_cols:
    df[col].replace('.', np.nan, inplace=True)

In [None]:
# Verify there are only 7 responses as indicated by data manual
# Create list of unique demands

demands = np.array([])
for col in response_cols:
    demands = np.concatenate([demands, df[col]])
print(set(demands))
# We see that there are 7 demands, plus placeholder '.' and 'nan' values to drop later

In [None]:
ohe = OneHotEncoder(sparse=False)

for col in response_cols:
    if col == response_cols[0]: #first iteration
        fit = ohe.fit_transform(df[col].values.reshape(-1, 1))
        response_array = np.array(fit)     
        print('State response count:', np.sum(response_array))
        print('Column_names:', ohe.get_feature_names(['responses']))
    else:
        fit = ohe.transform(df[col].values.reshape(-1, 1))
        response_array = np.logical_or(response_array, fit)
        print('State response count:', np.sum(response_array))
        
print('\nPre-cleaning column names:', ohe.get_feature_names(['responses']))
        
        
# # Clean column names
remove_spaces = lambda x: str.replace(x, ' ', '-')
col_names = ohe.get_feature_names(['responses'])
col_names = list(map(remove_spaces, col_names))
print('\nPost-cleaning column names:', col_names)

# Store as dataframe 
response_df = pd.DataFrame(response_array, dtype='int', columns=col_names)
response_df.drop('responses_nan', axis=1, inplace=True)

print('\nFinal shape:', response_df.shape)
print('\nColumn names:', response_df.columns)

In [None]:
df.columns

### Investigate "sources" and "notes" columns

In [None]:
#df.sources.value_counts()

In [None]:
#df.notes.value_counts()

##### As can be seen above, the 'source' and 'notes' columns don't contain standardized text. It is best to fill na's such that rows aren't dropped just because these columns aren't filled in

In [None]:
df['notes'] = df['notes'].fillna(UNKNOWN)
df['sources'] = df['sources'].fillna(UNKNOWN)

### Combine cleaned columns into one dataframe

In [None]:
df.info()

In [None]:
demand_cols

In [None]:
response_cols

In [None]:
exclude_cols = ['participants_category', 'participants']+demand_cols+response_cols
df.drop(exclude_cols, axis=1).info()

In [None]:
df_clean = df.drop(exclude_cols, axis=1)

df_clean = pd.concat([df_clean, participant_ct_clean_df, participant_cat_clean_df, demand_df, response_df], axis=1)

df_clean.info()

In [None]:
df_clean['participants_category'].isna().sum()

In [None]:
temp = df_clean.loc[df_clean['participants_category'].isna()]
temp.id

In [None]:
df_clean['protesteridentity'].value_counts()

In [None]:
df_temp = df.copy()

df_temp['protesteridentity'] = df_temp['protesteridentity'].fillna(UNKNOWN)
df_temp.info()

In [None]:
temp.id.index

In [None]:
#df.loc[temp.id.index]

In [None]:
df_clean['participants_category_old'] = df['participants_category']
df_clean['participants_old'] = df['participants']

In [None]:
df_clean.info()

In [None]:
participant_cols = ['id', 'participants', 'participants_category', 'participants_category_old', 'participants_old']

part = df_clean[participant_cols]

part.info()

In [None]:
part.loc[part.participants_category.isna()]

In [None]:
NOTE TO SELF - DOUBLE CHECK THAT THE PARTICIPANT COUNTS AND CATEGORIZATIONS ARE CORRECT FOR INDEX 13542 AS A QC