# Project Milestone 2

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# load dataset
factcheck = pd.read_json("politifact_factcheck_data.json", lines=True)
factcheck

Unnamed: 0,verdict,statement_originator,statement,statement_date,statement_source,factchecker,factcheck_date,factcheck_analysis_link
0,true,Barack Obama,John McCain opposed bankruptcy protections for...,6/11/2008,speech,Adriel Bettelheim,6/16/2008,https://www.politifact.com/factchecks/2008/jun...
1,false,Matt Gaetz,"""Bennie Thompson actively cheer-led riots in t...",6/7/2022,television,Yacob Reyes,6/13/2022,https://www.politifact.com/factchecks/2022/jun...
2,mostly-true,Kelly Ayotte,"Says Maggie Hassan was ""out of state on 30 day...",5/18/2016,news,Clay Wirestone,5/27/2016,https://www.politifact.com/factchecks/2016/may...
3,false,Bloggers,"""BUSTED: CDC Inflated COVID Numbers, Accused o...",2/1/2021,blog,Madison Czopek,2/5/2021,https://www.politifact.com/factchecks/2021/feb...
4,half-true,Bobby Jindal,"""I'm the only (Republican) candidate that has ...",8/30/2015,television,Linda Qiu,8/30/2015,https://www.politifact.com/factchecks/2015/aug...
...,...,...,...,...,...,...,...,...
21147,mostly-false,Donald Trump,Says the large trade deficit with Japan stems ...,8/13/2019,speech,Jon Greenberg,8/15/2019,https://www.politifact.com/factchecks/2019/aug...
21148,false,Donald Trump Jr.,"""Tens of thousands"" of people leave New York e...",11/1/2019,social_media,Jill Terreri Ramos,11/8/2019,https://www.politifact.com/factchecks/2019/nov...
21149,mostly-false,Chris Abele,"""I have fought for our shared values without b...",1/4/2011,news,Dave Umhoefer,1/13/2011,https://www.politifact.com/factchecks/2011/jan...
21150,false,Bloggers,"""Germany halts all Covid-19 vaccines, says the...",8/27/2021,blog,Ciara O'Rourke,9/9/2021,https://www.politifact.com/factchecks/2021/sep...


## Drop Unneeded Features

The first data preperation step I will take is removing uneeded features. Because my project is focused on two features, statement_source and statement, I can remove all other features. This includes, factchecker, factcheck_date, factcheck_analysis_link, statement_date, and statement_originator.

A small note concerning the last two features I listed. If time permits, it would be interesting to do a time series analysis using statement_date to see how rate of veracity changes over time. Similarly, if one were able to search through statement_originator and attatch a rough political alignment, one would be able to see what role political affiliation plays in predicting if a statement will be true/false. That said, at the moment these steps are not within the scope of the project so will be removed.

In [7]:
# remove uneeded features
factcheck2 = factcheck.drop(["statement_originator", "factchecker", 
                             "factcheck_date", "factcheck_analysis_link",
                             "statement_date"], axis=1)
factcheck2

Unnamed: 0,verdict,statement,statement_source
0,true,John McCain opposed bankruptcy protections for...,speech
1,false,"""Bennie Thompson actively cheer-led riots in t...",television
2,mostly-true,"Says Maggie Hassan was ""out of state on 30 day...",news
3,false,"""BUSTED: CDC Inflated COVID Numbers, Accused o...",blog
4,half-true,"""I'm the only (Republican) candidate that has ...",television
...,...,...,...
21147,mostly-false,Says the large trade deficit with Japan stems ...,speech
21148,false,"""Tens of thousands"" of people leave New York e...",social_media
21149,mostly-false,"""I have fought for our shared values without b...",news
21150,false,"""Germany halts all Covid-19 vaccines, says the...",blog


##  Encoding Target Values

As of now the verdict column (our target) is comprised of six ordinal categorical possibilities: true, mostly true, half true, mostly false, false, and pants-fire. I will encode these verdicts into 0 if category is false and 1 if verdict is true and train the model on these simplified target values. If time allows, I will encode those values into 0-5 to better capture the gradient of veracity of the original column.

In [36]:
# encoding verdicts according to true/false
factcheck2["verdict_code"] = [0  if verdict == "mostly false" or verdict == "false"
                              or verdict == "pants-fire" else 1 for verdict in factcheck2.verdict]
factcheck2

Unnamed: 0,verdict,statement,statement_source,verdict_code
0,true,John McCain opposed bankruptcy protections for...,speech,1
1,false,"""Bennie Thompson actively cheer-led riots in t...",television,0
2,mostly-true,"Says Maggie Hassan was ""out of state on 30 day...",news,1
3,false,"""BUSTED: CDC Inflated COVID Numbers, Accused o...",blog,0
4,half-true,"""I'm the only (Republican) candidate that has ...",television,1
...,...,...,...,...
21147,mostly-false,Says the large trade deficit with Japan stems ...,speech,1
21148,false,"""Tens of thousands"" of people leave New York e...",social_media,0
21149,mostly-false,"""I have fought for our shared values without b...",news,1
21150,false,"""Germany halts all Covid-19 vaccines, says the...",blog,0


## Create Dummy Variables out of Statement Source

For processing in the model, we must transform the statement_source column into numerical data. I will accomplish this using the pd.get_dummies() method.

In [56]:
# create dummies
dummies = pd.get_dummies(factcheck2.statement_source)
dummies

Unnamed: 0,advertisement,blog,campaign,email,meeting,news,other,radio,social_media,speech,statement,television,testimony
0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21147,0,0,0,0,0,0,0,0,0,1,0,0,0
21148,0,0,0,0,0,0,0,0,1,0,0,0,0
21149,0,0,0,0,0,1,0,0,0,0,0,0,0
21150,0,1,0,0,0,0,0,0,0,0,0,0,0


In [82]:
# rename statement columnd
dummies = dummies.rename(columns={"statement":"statement_s"})
dummies

Unnamed: 0,advertisement,blog,campaign,email,meeting,news,other,radio,social_media,speech,statement_s,television,testimony
0,0,0,0,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,0,1,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21147,0,0,0,0,0,0,0,0,0,1,0,0,0
21148,0,0,0,0,0,0,0,0,1,0,0,0,0
21149,0,0,0,0,0,1,0,0,0,0,0,0,0
21150,0,1,0,0,0,0,0,0,0,0,0,0,0


In [83]:
# drop og column to data frame
factcheck3 = factcheck2.drop("statement_source", axis=1)

In [84]:
# add dummies to data frame
factcheck3 = pd.concat([factcheck3, dummies], axis=1)
factcheck3

Unnamed: 0,verdict,statement,verdict_code,advertisement,blog,campaign,email,meeting,news,other,radio,social_media,speech,statement_s,television,testimony
0,true,John McCain opposed bankruptcy protections for...,1,0,0,0,0,0,0,0,0,0,1,0,0,0
1,false,"""Bennie Thompson actively cheer-led riots in t...",0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,mostly-true,"Says Maggie Hassan was ""out of state on 30 day...",1,0,0,0,0,0,1,0,0,0,0,0,0,0
3,false,"""BUSTED: CDC Inflated COVID Numbers, Accused o...",0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,half-true,"""I'm the only (Republican) candidate that has ...",1,0,0,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21147,mostly-false,Says the large trade deficit with Japan stems ...,1,0,0,0,0,0,0,0,0,0,1,0,0,0
21148,false,"""Tens of thousands"" of people leave New York e...",0,0,0,0,0,0,0,0,0,1,0,0,0,0
21149,mostly-false,"""I have fought for our shared values without b...",1,0,0,0,0,0,1,0,0,0,0,0,0,0
21150,false,"""Germany halts all Covid-19 vaccines, says the...",0,0,1,0,0,0,0,0,0,0,0,0,0,0


## Process Statements for Analysis

In order for the sentiment analysis model to process the statements we must preprocess them into a bag of words matrix. The follow steps are taken to accomplish this: converting to lowercase, removing punctuation & special characters, removing stop words, and stemming words.

In [90]:
def clean_statements(statements):
    """prepares series of text data for sentiment analysis by:
       converting to lowercase, removing punctuation & special characters, 
       removing stop words, and stemming words. Joins tokenized words together 
       at the end for use with Vectorizers"""
    # convert all text to lowercase
    lowercase = statements.str.lower()
    # remove punctuation and special characters
    clean = [re.sub('[^A-Za-z0-9 ]+', '', statement) for statement in lowercase]
    # english stop words
    stop_words = stopwords.words("english")
    # tokenize review into words
    token_clean = [word_tokenize(review) for review in clean]
    # remove stop words
    cleaner = [[word for word in review if word not in stop_words ] for review in token_clean]
    # stem words
    porter = PorterStemmer()
    stemmed = [[porter.stem(word) for word in review] for review in cleaner]
    # join words back together
    cleaned = [" ".join(review) for review in stemmed]
    return cleaned

In [93]:
# create cleaned reviews for use with bag of words & tfidf
cleaned = clean_statements(factcheck3.statement)

In [94]:
# view cleaned statements
cleaned

['john mccain oppos bankruptci protect famili bankruptci medic expens couldnt pay',
 'benni thompson activ cheerl riot 90',
 'saysmaggi hassan state 30 day last three month',
 'bust cdc inflat covid number accus violat feder law',
 'im republicancandid actual reduc size govern',
 'actual 30 countri practic birthright citizenship',
 'husband never gotten penni money farm',
 'go strictli number crime across board last year 10 percent decreas seriou crime',
 'american peopl say dont touch social secur dont touch medicar dont cut defens that 84 percent feder budget',
 'sinc 1978 ceo compens rose 1000 119 averag worker',
 'say accomplish includ fiscal respons budget agreement control state spend',
 'say presid obama deal allow iran produc nuclear weapon',
 'say donald trump say climat chang hoax invent chines',
 'least 450000 ballot key state miracul mark joe biden candid',
 '70 percent american adult commit crime could lead imprison',
 'say stimulu bill sent tax credit oversea 12 billion s

In [97]:
# add cleaned into og dataframe
factcheck3["statement_processed"] = cleaned
factcheck3

Unnamed: 0,verdict,statement,verdict_code,advertisement,blog,campaign,email,meeting,news,other,radio,social_media,speech,statement_s,television,testimony,statement_processed
0,true,John McCain opposed bankruptcy protections for...,1,0,0,0,0,0,0,0,0,0,1,0,0,0,john mccain oppos bankruptci protect famili ba...
1,false,"""Bennie Thompson actively cheer-led riots in t...",0,0,0,0,0,0,0,0,0,0,0,0,1,0,benni thompson activ cheerl riot 90
2,mostly-true,"Says Maggie Hassan was ""out of state on 30 day...",1,0,0,0,0,0,1,0,0,0,0,0,0,0,saysmaggi hassan state 30 day last three month
3,false,"""BUSTED: CDC Inflated COVID Numbers, Accused o...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,bust cdc inflat covid number accus violat fede...
4,half-true,"""I'm the only (Republican) candidate that has ...",1,0,0,0,0,0,0,0,0,0,0,0,1,0,im republicancandid actual reduc size govern
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21147,mostly-false,Says the large trade deficit with Japan stems ...,1,0,0,0,0,0,0,0,0,0,1,0,0,0,say larg trade deficit japan stem send million...
21148,false,"""Tens of thousands"" of people leave New York e...",0,0,0,0,0,0,0,0,0,1,0,0,0,0,ten thousand peopl leav new york everi week
21149,mostly-false,"""I have fought for our shared values without b...",1,0,0,0,0,0,1,0,0,0,0,0,0,0,fought share valu without ideologu partisan
21150,false,"""Germany halts all Covid-19 vaccines, says the...",0,0,1,0,0,0,0,0,0,0,0,0,0,0,germani halt covid19 vaccin say unsaf longer r...
