In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
csv_files = ["data/"+f for f in os.listdir("data/") if f.endswith('.csv')]

In [3]:
csv_files

["data/2017-01-21 Women's March.csv",
 "data/2018-01-Women's March.csv",
 'data/2017-08.csv',
 'data/2017-09.csv',
 'data/2017-02.csv',
 'data/2017-03.csv',
 'data/2017-07.csv',
 'data/2017-06.csv',
 'data/2017-12.csv',
 'data/2017-04.csv',
 'data/2017-10.csv',
 'data/2017-11.csv',
 'data/2017-05.csv',
 'data/2018-01-Other.csv']

In [111]:
def mentions_trump(claim):
    if 'Ivanka' in claim:
        return False
    if 'Trump' in claim or 'impeach' in claim.lower():
        return True
    else:
        return False

def mentions_race(claim):
    return any(word in claim.lower() for word in ['racial', 'racism', 'racist', 'race relations',
                                                  'anti-semi', 'black lives matter', 'white supre',
                                                  'nazi', 'white nationalist', 'white lives', 'confedera',
                                                 'of color'])

def blm(actor):
    try:
        return "black lives matter" in actor.lower()
    except AttributeError:
        return False

def immigration(claim):
    return any(w in claim.split() for w in ('ICE', 'DACA')) or any(x in claim.lower() for x in ('dreamers',
                                                                                     'deportation', 'border wall',
                                                                                    'dream act', 'immigra',
                                                                                    'sanctuary', 'travel ban',
                                                                                    'muslim ban'))

def sexual_assault(claim):
    if "rape" in claim.split():
        return True
    else:
        return any(word in claim.lower() for word in ['domestic violence', 'sexual', 'metoo',
                                                      #'roy moore',
                                                     'take back the night'])

def police_brutality(claim):
    return any(w in claim.split() for w in ('cop')) or any(x in claim.lower() for x in ('police',
                                                                                       'agent',
                                                                                       'officer'))
def guns(claim):
    return any(w in claim.split() for w in ('gun', 'guns', 'NRA')) or any(x in claim.lower() for x in ('campus carry',
                                                                                              'pro-gun',
                                                                                              'anti-gun',
                                                                                              'concealed carry'))

def environment(claim):
    return any(w in claim.split() for w in ('EPA', 'DAPL')) or any(x in claim.lower() for x in ('climate change',
                                                                                               'fracking',
                                                                                               'pipeline',
                                                                                               'pipe-line',
                                                                                               'fossil fuel',
                                                                                               'energy',
                                                                                               'fracturing',
                                                                                               'flint',
                                                                                               'dakota access',
                                                                                               'keystone',
                                                                                               'radioactive waste',
                                                                                               'pollution',
                                                                                               'paris agreement'))

def lgbt(claim):
    return 'gay' in claim.split() or any(x in claim.lower() for x in ('transgender',
                                                                       'pro-gay',
                                                                       'anti-gay',
                                                                       'lgbt',
                                                                      'homopho',
                                                                      'homosex'))

def reproductive(claim):
    return any(x in claim.lower() for x in ('reproduc', 'pro-life', 'abortion', 'pro-choice',
                                            'pro life', 'pro choice', 'planned parenthood'))



In [112]:
def process_file(file):

    raw_df = pd.read_csv(file)
    
    # Get year and month from filename
    raw_df['Year'] = file[5:9]
    raw_df['Month'] = file[10:12]
    
    # Convert 'EstimateHigh' if it's not coded as numeric
    if raw_df['EstimateHigh'].dtype=='O':
        raw_df['EstimateHigh'] = pd.to_numeric(raw_df['EstimateHigh'], errors='coerce')
    
    # Make avg estimate out of estimatelow and estimatehigh
    raw_df['Avg_Estimate'] = (raw_df['EstimateLow'] + raw_df['EstimateHigh']) / 2

    # Replace with avg where BestGuess is None
    raw_df['BestGuess'] = raw_df['BestGuess'].where(raw_df['BestGuess']==None,raw_df['Avg_Estimate'])
    
    # Find the column "Pro/Anti" and rename it to "ProAnti"
    p_a = next(x for x in raw_df.columns.values if x.startswith('Pro'))
    raw_df.rename(columns={p_a: 'ProAnti'}, inplace=True)
    
    if 'CityTown' in raw_df.columns.values:
        raw_df.rename(columns={'CityTown': 'City/Town'}, inplace=True)
        
    if 'protest' in raw_df.columns.values:
        raw_df.rename(columns={'protest': 'EventType'}, inplace=True)
    
    # Select columns of interest
    raw_df = raw_df[['City/Town', 'StateTerritory', 'Year', 'Month', 'Date', 'BestGuess', 'Actor',
                     'Claim', 'ProAnti', 'EventType']]
    
    # Drop all rows where BestGuess or Claim or StateTerritory is NaN
    raw_df = raw_df.dropna(subset=['BestGuess', 'Claim', 'StateTerritory', 'Date'])
    
    raw_df['Trump'] = raw_df['Claim'].apply(lambda x: mentions_trump(x))
    raw_df['Race'] = raw_df['Claim'].apply(lambda x: mentions_race(x)) | raw_df['Actor'].apply(lambda x: blm(x))
    raw_df['Immigration'] = raw_df['Claim'].apply(lambda x: immigration(x))
    raw_df['MeToo'] = raw_df['Claim'].apply(lambda x: sexual_assault(x))
    raw_df['Police'] = raw_df['Claim'].apply(lambda x: police_brutality(x))
    raw_df['Guns'] = raw_df['Claim'].apply(lambda x: guns(x))
    raw_df['Environment'] = raw_df['Claim'].apply(lambda x: environment(x))
    raw_df['LGBT'] = raw_df['Claim'].apply(lambda x: lgbt(x))
    raw_df['Reproductive'] = raw_df['Claim'].apply(lambda x: reproductive(x))
    
    return raw_df

In [113]:
# Ignore Women's March files for now
raw_dfs = [process_file(f) for f in csv_files[2:]]

In [114]:
full_df = pd.concat(raw_dfs, ignore_index=True)

In [115]:
full_df.shape

(5969, 19)

In [116]:
full_df.columns.values

array(['City/Town', 'StateTerritory', 'Year', 'Month', 'Date', 'BestGuess',
       'Actor', 'Claim', 'ProAnti', 'EventType', 'Trump', 'Race',
       'Immigration', 'MeToo', 'Police', 'Guns', 'Environment', 'LGBT',
       'Reproductive'], dtype=object)

In [117]:
full_df['DateTime'] = pd.to_datetime(full_df['Date'], format='%Y-%m-%d', errors='coerce')

In [118]:
# Deal with weird dates

def change_datetime(row):
    
    # Check if DateTime year/month confirms with year, month that we got from .csv filename:
    if float(row['Year'])==float(row['DateTime'].year) and float(row['Month'])==float(row['DateTime'].month):
        return row['DateTime']
    
    # If no match, piece together new date string based on filename and last 2 digits of 'Date' column
    # Return as datetime object
    else:
        date_str = '-'.join([str(row['Year']), str(row['Month']), row['Date'][-2:]])
        return pd.to_datetime(date_str, format='%Y-%m-%d')

# Replace datetime column
full_df['DateTime'] = full_df.apply(lambda x: change_datetime(x),axis=1)

In [119]:
# full_df['MonthYear'] = full_df['Year'] + '-' + full_df['Month']

# Calculate number of days since the first date in the df
full_df['Day'] = (full_df['DateTime'] - np.min(full_df['DateTime'])).dt.days

In [120]:
full_df

Unnamed: 0,City/Town,StateTerritory,Year,Month,Date,BestGuess,Actor,Claim,ProAnti,EventType,...,Race,Immigration,MeToo,Police,Guns,Environment,LGBT,Reproductive,DateTime,Day
0,Austin,TX,2017,08,2017-08-01,250.0,general protestors,Protesting legislature's proposed agenda,0,protest,...,False,False,False,False,False,False,False,False,2017-08-01,181
1,Austin,TX,2017,08,2017-08-01,225.0,Texas Impacts,Oppose Texas's anti-transgender bathroom bill,1,rally,...,False,False,False,False,False,False,True,False,2017-08-01,181
2,Austin,TX,2017,08,2017-08-01,320.0,First Baptist Church of Austin; Friends Church...,Oppose Texas's anti-transgender bathroom bill,1,rally,...,False,False,False,False,False,False,True,False,2017-08-01,181
3,Cold Spring,NY,2017,08,2017-08-01,5.0,Cold Spring Brewing workers,Protesting working conditions,0,protest; walk-out,...,False,False,False,False,False,False,False,False,2017-08-01,181
4,Fort Worth,TX,2017,08,2017-08-01,400.0,United Fort Worth,Protesting TX's anti-sanctuary city bill,1,protest,...,False,True,False,False,False,False,False,False,2017-08-01,181
5,Fort Worth,TX,2017,08,2017-08-01,2.0,general protestors,Counter-protest; supporting TX's anti-sanctuar...,2,protest,...,False,True,False,False,False,False,False,False,2017-08-01,181
6,Hendersonville,NC,2017,08,2017-08-01,80.0,Progressive Organized Women of Hendersonville,Campaign rally for Democratic candidate Philli...,1,rally,...,False,False,False,False,False,False,False,False,2017-08-01,181
7,Marquette,MI,2017,08,2017-08-01,200.0,Michigan Nurses Association,Protesting policy of understaffing hospital wo...,0,protest,...,False,False,False,False,False,False,False,False,2017-08-01,181
8,New York,NY,2017,08,2017-08-01,24.0,Black Alliance for Just Immigration,Night Out for Safety and Liberation,0,rally,...,False,False,False,False,False,False,False,False,2017-08-01,181
9,Philadelphia,PA,2017,08,2017-08-01,50.0,Heeding Cheyney's call,Rally to save Cheyney University,0,rally,...,False,False,False,False,False,False,False,False,2017-08-01,181


In [121]:
np.max(full_df['Day'])

364

In [122]:
# Remove rows with BestGuess is less than 50
full_df = full_df[full_df['BestGuess'] > 49]

In [123]:
full_df.shape

(3468, 21)

In [124]:
just_immigration = full_df[full_df['Immigration']][['Actor', 'Claim', 'ProAnti', 'BestGuess',
                                                   'DateTime', 'Day', 'City/Town', 'StateTerritory']]
just_immigration.shape

(524, 8)

In [125]:
just_race = full_df[full_df['Race']][['Actor', 'Claim', 'ProAnti', 'BestGuess',
                                                   'DateTime', 'Day', 'City/Town', 'StateTerritory']]
just_race.shape

(365, 8)

In [126]:
just_trump = full_df[full_df['Trump']][['Actor', 'Claim', 'ProAnti', 'BestGuess',
                                                   'DateTime', 'Day', 'City/Town', 'StateTerritory']]
just_trump.shape

(612, 8)

In [127]:
just_metoo = full_df[full_df['MeToo']][['Actor', 'Claim', 'ProAnti', 'BestGuess',
                                                   'DateTime', 'Day', 'City/Town', 'StateTerritory']]
just_metoo.shape

(23, 8)

In [128]:
just_police = full_df[full_df['Police']][['Actor', 'Claim', 'ProAnti', 'BestGuess',
                                                   'DateTime', 'Day', 'City/Town', 'StateTerritory']]
just_police.shape

(96, 8)

In [129]:
just_guns = full_df[full_df['Guns']][['Actor', 'Claim', 'ProAnti', 'BestGuess',
                                                   'DateTime', 'Day', 'City/Town', 'StateTerritory']]
just_guns.shape

(41, 8)

In [130]:
just_environment = full_df[full_df['Environment']][['Actor', 'Claim', 'ProAnti', 'BestGuess',
                                                   'DateTime', 'Day', 'City/Town', 'StateTerritory']]
just_environment.shape

(75, 8)

In [131]:
just_lgbt = full_df[full_df['LGBT']][['Actor', 'Claim', 'ProAnti', 'BestGuess',
                                                   'DateTime', 'Day', 'City/Town', 'StateTerritory']]
just_lgbt.shape

(53, 8)

In [132]:
just_reproductive = full_df[full_df['Reproductive']][['Actor', 'Claim', 'ProAnti', 'BestGuess',
                                                   'DateTime', 'Day', 'City/Town', 'StateTerritory']]
just_reproductive.shape

(149, 8)

In [135]:
temp = just_reproductive
# temp[['Claim', 'ProAnti']]
# temp['Claim'].values

In [136]:
np.max(full_df['Day'])

364

In [137]:
np.max(just_trump['BestGuess'])

11600.0

In [138]:
np.min(just_trump['BestGuess'])

49.5

In [139]:
# Write out

just_immigration.to_csv('immigration.csv', index=False)
just_race.to_csv('race.csv', index=False)
just_trump.to_csv('trump.csv', index=False)
just_metoo.to_csv('metoo.csv', index=False)
just_police.to_csv('police.csv', index=False)
just_guns.to_csv('guns.csv', index=False)
just_environment.to_csv('environment.csv', index=False)
just_lgbt.to_csv('lgbt.csv', index=False)
just_reproductive.to_csv('reproductive.csv', index=False)