In [1]:
import numpy as np
import pandas as pd
from collections import defaultdict
import datetime

# 1. Load data

In [2]:
# Read the scraped statements table; the first CSV column becomes the row index
statements_csv = '/gh/data/politifact/20190701scrape/raw_processed.csv'
df = pd.read_csv(statements_csv, index_col=0)
df.head()

Unnamed: 0,name,statement,source,date,finding,comment,personalities_page,article_page
0,Donald Trump,"""The Obama administration was begging for a me...",— PolitiFact National,"on Monday, July 1st, 2019",False,Obama set high bar for meetings,/personalities/donald-trump/,/truth-o-meter/statements/2019/jul/01/donald-t...
1,Facebook posts,Six Flags is offering five free passes and unl...,— PolitiFact Facebook fact-checks,"on Monday, July 1st, 2019",False,Summer scam-o-rama,/personalities/facebook-posts/,/facebook-fact-checks/statements/2019/jul/01/f...
2,Beto O'Rourke,"""The Census is predicting a 6% undercount that...",— PolitiFact Texas,"on Monday, July 1st, 2019",Half-True,Confuses key details,/personalities/beto-orourke/,/texas/statements/2019/jul/01/beto-orourke/oro...
3,Elizabeth Warren,"""Prison phone companies charge as much as $25 ...",— PolitiFact National,"on Monday, July 1st, 2019",Mostly True,"In some jails, talk isn't cheap",/personalities/elizabeth-warren/,/truth-o-meter/statements/2019/jul/01/elizabet...
4,Ta-Nehisi Coates,"""The typical black family in this country has ...",— PunditFact,"on Monday, July 1st, 2019",True,Racial gap has persisted for years,/personalities/ta-nehisi-coates/,/punditfact/statements/2019/jul/01/ta-nehisi-c...


# 2. Change some columns

In [3]:
# Swap the personalities URL for a short identity ID.
# A page such as '/personalities/donald-trump/' ends with '/', so the ID is the
# second-to-last piece once the path is split on '/'.
df['identity_id'] = df['personalities_page'].map(lambda page: page.split('/')[-2])
df.drop(columns=['personalities_page'], inplace=True)

In [4]:
# Fix date and source columns
df.reset_index(drop=True, inplace=True)

# Dates look like "on Monday, July 1st, 2019": keep the text after the weekday's "y, "
date_strings = [raw_date.split('y, ')[1] for raw_date in df['date']]
df['date'] = pd.to_datetime(date_strings)

# Sources look like "— PolitiFact National": keep the text after the leading dash
source_names = [raw_source.split('— ')[1] for raw_source in df['source']]
df['source'] = source_names

In [5]:
# Convert each "finding" label into an ordinal truth score (5 = 'True' ... 0 = 'Pants on Fire!').
# Labels are listed from most to least truthful, so the score is 5 minus the list position.
finding_labels = ['True', 'Mostly True', 'Half-True', 'Mostly False', 'False', 'Pants on Fire!']
truth_score_dict = {label: 5 - position for position, label in enumerate(finding_labels)}

# Series.map leaves NaN for any finding that is not a key of truth_score_dict
df['truth_score'] = df['finding'].map(truth_score_dict)

In [12]:
# Preview the reworked frame: identity_id replaces personalities_page, date and source are
# cleaned, and truth_score has been added.
df.head()

Unnamed: 0,name,statement,source,date,finding,comment,article_page,identity_id,truth_score
0,Donald Trump,"""The Obama administration was begging for a me...",PolitiFact National,2019-07-01,False,Obama set high bar for meetings,/truth-o-meter/statements/2019/jul/01/donald-t...,donald-trump,1.0
1,Facebook posts,Six Flags is offering five free passes and unl...,PolitiFact Facebook fact-checks,2019-07-01,False,Summer scam-o-rama,/facebook-fact-checks/statements/2019/jul/01/f...,facebook-posts,1.0
2,Beto O'Rourke,"""The Census is predicting a 6% undercount that...",PolitiFact Texas,2019-07-01,Half-True,Confuses key details,/texas/statements/2019/jul/01/beto-orourke/oro...,beto-orourke,3.0
3,Elizabeth Warren,"""Prison phone companies charge as much as $25 ...",PolitiFact National,2019-07-01,Mostly True,"In some jails, talk isn't cheap",/truth-o-meter/statements/2019/jul/01/elizabet...,elizabeth-warren,4.0
4,Ta-Nehisi Coates,"""The typical black family in this country has ...",PunditFact,2019-07-01,True,Racial gap has persisted for years,/punditfact/statements/2019/jul/01/ta-nehisi-c...,ta-nehisi-coates,5.0


In [8]:
# Write the processed statements back to disk (row index is not written)
processed_statements_csv = '/gh/data/politifact/20190701scrape/processed_v2.csv'
df.to_csv(processed_statements_csv, index=False)

# 3. Define type of identity (e.g. person, republican organization)
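* The `person` label is only assigned to identities with at least 6 fact checks; party organizations are matched by name regardless of how many fact checks they have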

In [9]:
# Manually classify some non-person identities with at least 6 fact checks.
# Party organizations are found by a case-insensitive substring match on the name;
# na=False treats a missing name as "no match" so the result is always a valid boolean mask.
lowercase_names = df['name'].str.lower()
name_types = {'internet': ['Bloggers', 'Facebook posts', 'Chain email', 'Viral image', 'Doonesbury'],
              'democrat_org': df['name'][lowercase_names.str.contains('democrat', na=False)].unique(),
              'republican_org': df['name'][lowercase_names.str.contains('republica', na=False)].unique(),
              'other_political_org': ['National Rifle Association', 'Crossroads GPS', 'Americans for Prosperity',
                                       'U.S. Chamber of Commerce', 'Greater Wisconsin Committee',
                                       'American Crossroads', 'Senate Majority PAC',
                                       'MoveOn.org', 'Priorities USA Action', 'YourNewsWire.com',
                                       'Congressional Leadership Fund',
                                       'Senate Leadership Fund', 'One Wisconsin Now', 'Americans United for Change',
                                       'AFSCME', 'Club for Growth', 'Winning Our Future', 'Restore Our Future',
                                       'Wisconsin Manufacturers and Commerce', 'PuppetStringNews.com', 'Americans For Tax Reform',
                                       'TheLastLineOfDefense.org', 'American Bridge 21st Century', 'Planned Parenthood', 'Progress Texas',
                                       'EMILY\'s List', 'Sacramento Bee Editorial Board', 'Florida Chamber of Commerce', 'Ending Spending Action Fund',
                                       'National Right to Life Committee', 'Government is Not God PAC'
                                       ]
             }

# Invert {type: [names]} into {name: type}; a name listed under several types keeps the last one
name_to_type = {name: type_label for type_label, names in name_types.items() for name in names}

# Every remaining identity with at least `thresh_count` fact checks is declared a person
thresh_count = 6
counts_by_name = df['name'].value_counts()
names_above_thresh = counts_by_name[counts_by_name >= thresh_count].index

# Add person keys for people that were checked (those with >= 6 reviews)
known_people = [n for n in names_above_thresh if n not in name_to_type]
name_to_type_people = {p: 'person' for p in known_people}
name_to_type = {**name_to_type, **name_to_type_people}

# 4. Make identity table
* Note: this could be greatly improved by making better use of the title (for organizations) and by incorporating the bio

In [10]:
# Read the scraped titles and bios; the first CSV column becomes the row index
personalities_csv = '/gh/data/politifact/20190707scrape_personalities/raw_processed.csv'
df_person_info = pd.read_csv(personalities_csv, index_col=0)

# add some demographic information
def get_party(x):
    """Infer a one-letter party code from a free-text title.

    The match is a case-insensitive substring search, checked in the order
    republican -> democrat -> independent, so the first keyword found wins.

    Parameters
    ----------
    x : str or missing value
        Title text, e.g. 'Republican from Virginia'.

    Returns
    -------
    str or None
        'R', 'D' or 'I' when the matching keyword appears in the title;
        None when no keyword appears or when `x` is not a string
        (e.g. None or NaN from a missing CSV field).
    """
    # Missing titles arrive as None/NaN, which have no .lower(); treat them as "no party"
    if not isinstance(x, str):
        return None

    lowered = x.lower()
    for keyword, code in (('republican', 'R'), ('democrat', 'D'), ('independent', 'I')):
        if keyword in lowered:
            return code
    return None

df_person_info['party_title'] = df_person_info['title'].apply(get_party)

# Titles such as 'Democrat from Ohio' carry the state after ' from '.
# The membership test uses the same ' from ' separator as the split, so a title that merely
# contains the letters "from" (or is not a string at all) yields None instead of raising.
df_person_info['state'] = [
    title.split(' from ')[1] if isinstance(title, str) and ' from ' in title else None
    for title in df_person_info['title']
]

# Clean: expose the 'person' column as 'identity_id' (added as a new last column, then
# the original is dropped, so the column order differs from a plain rename)
df_person_info['identity_id'] = df_person_info['person']
df_person_info.drop('person', axis=1, inplace=True)

# Get person's name
# NOTE(review): if one identity_id appears under several names in df, this left merge
# repeats that identity's row once per name — confirm the pairs are unique.
df_names = df[['name', 'identity_id']].drop_duplicates()
df_person_info = df_person_info.merge(df_names, on='identity_id', how='left')

# Create column of name type; names absent from name_to_type are left as NaN
df_person_info['identity_type'] = df_person_info['name'].map(name_to_type)
df_person_info.head()

Unnamed: 0,title,bio,party_title,state,identity_id,name,identity_type
0,Our Texas Values feeds,"Established in 2012, Texas Values is an Austin...",,,texas-values,Texas Values,
1,Democrat from Ohio,Teresa Fedor is a Democrat in the Ohio House o...,D,Ohio,teresa-fedor,Teresa Fedor,
2,Our Susana Mendoza feeds,Susana Mendoza is the Illinois comptroller. Me...,,,susana-mendoza,Susana Mendoza,
3,Republican from Virginia,"Stephen Martin, R-Chesterfield, represents the...",R,Virginia,stephen-martin,Stephen Martin,
4,None from Ohio,Brent Larkin writes a weekly column for The Pl...,,Ohio,brent-larkin,Brent Larkin,


In [13]:
# Write the identity table to disk (row index is not written)
processed_identities_csv = '/gh/data/politifact/20190707scrape_personalities/processed_v2.csv'
df_person_info.to_csv(processed_identities_csv, index=False)
