In [58]:
import pandas as pd
import numpy as np
import os
import time

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils.np_utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping


import dill as pickle

In [3]:
os.getcwd()  # Get the current working directory (cwd)

'/Users/ryanmackie/Documents/DSI_coursework/Submissions/Projects/capstone-master/part_02'

In [4]:
os.chdir('/Users/ryanmackie/Documents/DSI_coursework/Submissions/Projects/capstone-master/part_03')

In [5]:
os.getcwd()

'/Users/ryanmackie/Documents/DSI_coursework/Submissions/Projects/capstone-master/part_03'

In [63]:
df = pd.read_csv('./complete_politifact_news.csv')

print(f'The dataframe has {df.shape[0]} rows and {df.shape[1]} columns.')
df.head(6)

The dataframe has 17730 rows and 7 columns.


Unnamed: 0,Story,Statement,Article,Date,Source,Label,date
0,An ad from a group opposing a constitutional a...,Approving the constitutional amendment on Illi...,https://www.politifact.com/factchecks/2020/oct...,"October 4, 2020",Coalition to Stop the Proposed Tax Hike Amendment,half-true,2020/oct/04
1,As Donald Trump fights to recover from the cor...,Says Donald Trump is not at Walter Reed Nation...,https://www.politifact.com/factchecks/2020/oct...,"October 4, 2020",Facebook posts,pants-fire,2020/oct/04
2,Even with President Donald Trump hospitalized ...,Says Joe Biden stays in his basement.,https://www.politifact.com/factchecks/2020/oct...,"October 4, 2020",Jason MIller,false,2020/oct/04
3,Gov. Phil Scott came out firing when the issue...,"“That 5% (wealth tax) starts at $159,000 per f...",https://www.politifact.com/factchecks/2020/oct...,"October 4, 2020",Phil Scott,true,2020/oct/04
4,"In one photo, President Donald Trump’s right b...","Says Donald Trump boarded Marine One ""with a p...",https://www.politifact.com/factchecks/2020/oct...,"October 4, 2020",Facebook posts,false,2020/oct/04
5,,Says he “strongly supports legalization for Dr...,https://www.politifact.com/factchecks/2020/oct...,"October 3, 2020",John Cornyn,half-true,2020/oct/03


In [64]:
df.isnull().sum()

Story        9
Statement    0
Article      0
Date         0
Source       0
Label        0
date         0
dtype: int64

In [66]:
df = df.dropna().reset_index(drop = True)

In [67]:
df = df.drop(columns = ['Statement','Article', 'Date', 'Source','date'])

In [68]:
df.head()

Unnamed: 0,Story,Label
0,An ad from a group opposing a constitutional a...,half-true
1,As Donald Trump fights to recover from the cor...,pants-fire
2,Even with President Donald Trump hospitalized ...,false
3,Gov. Phil Scott came out firing when the issue...,true
4,"In one photo, President Donald Trump’s right b...",false


In [69]:
df['Label'].value_counts()

false          3960
half-true      3174
mostly-true    3024
barely-true    2927
true           2288
pants-fire     2097
full-flop       159
half-flip        66
no-flip          26
Name: Label, dtype: int64

In [70]:
df = df[(df['Label']!='full-flop')&(df['Label']!='half-flip')&(df['Label']!='no-flip')].reset_index().drop(['index'], axis=1)

In [71]:
df['Label'].value_counts()

false          3960
half-true      3174
mostly-true    3024
barely-true    2927
true           2288
pants-fire     2097
Name: Label, dtype: int64

In [72]:
df['Label'] = df['Label'].map({'pants-fire':0, 'false':0, 'barely-true':0, 'half-true':1, 'mostly-true':1, 'true':1})

In [73]:
df.head()

Unnamed: 0,Story,Label
0,An ad from a group opposing a constitutional a...,1
1,As Donald Trump fights to recover from the cor...,0
2,Even with President Donald Trump hospitalized ...,0
3,Gov. Phil Scott came out firing when the issue...,1
4,"In one photo, President Donald Trump’s right b...",0


In [86]:
X = df['Story']
y = df['Label']

In [18]:
def to_words(series):
    ps = PorterStemmer()
    corpus = []
    start = time.time()
    for i in range(0, len(series)):
        letters = re.sub('[^a-zA-Z]', ' ',series[i])
        words = letters.lower().split()
        meaningful_words = [ps.stem(w) for w in words if not w in stopwords.words('english')]
        final = ' '.join(meaningful_words)
        corpus.append(final)
    end = time.time()
    print(f'Time Elapsed: {round(end - start, 2)} seconds')
    return corpus

In [19]:
vocab_size = round(20_000*1.3)

In [20]:
onehot = [one_hot(val,vocab_size)for val in to_words(X)]

Time Elapsed: 1985.78 seconds


In [1]:
maxlen = 0
for i in range(0, len(df['Story'])):
    number = len(df['Story'][i].split())
    if number > maxlen:
        maxlen = number

padded_onehot = pad_sequences(onehot, padding = 'pre', maxlen = maxlen)

In [108]:
X_final = np.array(padded_onehot)
y_final = np.array(y)

print(X_final.shape)
print(y_final.shape)

# X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size = .2, random_state = 33)

(17470, 2935)
(17470,)


In [109]:
model = Sequential()
model.add(Embedding(vocab_size,output_dim=100,input_length=maxlen))
model.add(Dropout(0.3))
model.add(LSTM(8))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['acc'])

In [110]:
model.fit(X_final, y_final, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7ff81ec227d0>

In [114]:
model.save('lstm_model.h5')

In [51]:
# def model_func(layer_three_neurons=32, layer_four_neurons=16, layer_one_dropout=0.5, layer_two_dropout=0.5, layer_three_dropout=0.5, layer_four_dropout=0.5):
#     model = Sequential()
#     model.add(Embedding(vocab_size,output_dim=40,input_length=500))
#     model.add(Dropout(layer_one_dropout))
#     model.add(LSTM(256))
#     model.add(Dropout(layer_two_dropout))
#     model.add(Dense(layer_three_neurons, activation='relu'))
#     model.add(Dropout(layer_three_dropout))
#     model.add(Dense(layer_four_neurons, activation='relu'))
#     model.add(Dropout(layer_four_dropout))
#     model.add(Dense(1, activation='sigmoid')) # We can also use "linear"
#     model.compile(loss='binary_crossentropy', optimizer='adam', metrics = ['acc'])
#     return model

# # define the model with the wrapper
# nn = KerasClassifier(build_fn=model_func, epochs=10, batch_size=512)

In [52]:
# # Params grid
# params = {
#     "epochs":[10, 20],
#     "layer_three_neurons":[20, 32],
#     "layer_four_neurons":[16, 32],
#     "layer_one_dropout":[0.1, 0.3, 0.5],
#     "layer_two_dropout":[0.1, 0.3, 0.5],
#     "layer_three_dropout":[0.1, 0.3, 0.5],
#     "layer_four_dropout":[0.1, 0.3, 0.5]
# }
# gs = GridSearchCV(estimator=nn, param_grid=params, cv=2) # I'm using cv=2 for the sake of time! 
# gs.fit(X_train, y_train)
# print(gs.best_score_)
# gs.best_params_