In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import nltk as nl
import string as s
import re

nl.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rafaelhernandez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train_data = pd.read_csv("fake_or_real_news_training.csv")
test_data = pd.read_csv("fake_or_real_news_test.csv")

In [11]:
train_data.iloc[2,2]

'U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.\n\nKerry said he expects to arrive in Paris Thursday evening, as he heads home after a week abroad. He said he will fly to France at the conclusion of a series of meetings scheduled for Thursday in Sofia, Bulgaria. He plans to meet the next day with Foreign Minister Laurent Fabius and President Francois Hollande, then return to Washington.\n\nThe visit by Kerry, who has family and childhood ties to the country and speaks fluent French, could address some of the criticism that the United States snubbed France in its darkest hour in many years.\n\nThe French press on Monday was filled with questions about why neither President Obama nor Kerry attended Sunday’s march, as about 40 leaders of other nations did. Obama was said to have stayed away because his own security needs can be taxing on a country, 

In [13]:
train_data.head(10)

Unnamed: 0,ID,title,text,label,X1,X2
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,,
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,,
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,,
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,,
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,,
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE,,
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE,,
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL,,
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL,,
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL,,


In [None]:
# t0 = nltk.DefaultTagger('NN')
# t1 = nltk.UnigramTagger(train_sents, backoff=t0) 
# t2 = nltk.BigramTagger(train_sents, backoff=t1) 
# t2.evaluate(test_sents)

In [None]:
# ps= nltk.corpus.treebank.parsed_sents() print(ps[0])
# ps[0].draw()

In [15]:
inspect = train_data["label"].value_counts()/len(train_data)
print(inspect.head(10))

REAL                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    

In [43]:
df_REAL = train_data[train_data.label == 'REAL']
df_REAL.shape

(1990, 7)

In [44]:
df_FAKE = train_data[train_data.label == 'FAKE']
df_FAKE.shape

(1976, 7)

In [53]:
train_data = pd.concat([df_REAL, df_FAKE], axis=0)

train_data.shape

(3966, 7)

## Data Preparation

In [54]:
stemmer = PorterStemmer()
words = stopwords.words("english")
train_data['cleaned'] = train_data['text'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

In [27]:
train_data.head()

Unnamed: 0,ID,title,text,label,X1,X2,cleaned
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,,,daniel greenfield shillman journal fellow free...
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,,,googl pinterest digg linkedin reddit stumbleup...
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,,,u s secretari state john f kerri said monday s...
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,,,kayde king kaydeek novemb the lesson tonight d...
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,,,it primari day new york front runner hillari c...


In [55]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 2))
final_features = vectorizer.fit_transform(train_data['cleaned']).toarray()
final_features.shape

(3966, 100118)

In [58]:
#first we split our dataset into testing and training set:
# this block is to split the dataset into training and testing set 
X = train_data['cleaned']
Y = train_data['label']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)
# instead of doing these steps one at a time, we can use a pipeline to complete them all at once
pipeline = Pipeline([('vect', vectorizer),
                     ('chi',  SelectKBest(chi2, k=1200)),
                     ('clf', RandomForestClassifier())])
# fitting our model and save it in a pickle for later use
model = pipeline.fit(X_train, y_train)
# with open('RandomForest.pickle', 'wb') as f:
#     pickle.dump(model, f)
ytest = np.array(y_test)
clf_Score = pipeline.score(X_test,y_test)
# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(ytest, model.predict(X_test)))
print(confusion_matrix(ytest, model.predict(X_test)))
print('Accuracy:', clf_Score)



              precision    recall  f1-score   support

        FAKE       0.84      0.90      0.86       477
        REAL       0.90      0.84      0.87       515

   micro avg       0.86      0.86      0.86       992
   macro avg       0.87      0.87      0.86       992
weighted avg       0.87      0.86      0.86       992

[[427  50]
 [ 84 431]]
Accuracy: 0.8649193548387096
