In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl
import nltk as nl
import string as s
import re

nl.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import hstack

from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics




[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rafaelhernandez/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
train_data = pd.read_csv("fake_or_real_news_training.csv")
test_data = pd.read_csv("fake_or_real_news_test.csv")

In [3]:
train_data.iloc[2,2]

'U.S. Secretary of State John F. Kerry said Monday that he will stop in Paris later this week, amid criticism that no top American officials attended Sunday’s unity march against terrorism.\n\nKerry said he expects to arrive in Paris Thursday evening, as he heads home after a week abroad. He said he will fly to France at the conclusion of a series of meetings scheduled for Thursday in Sofia, Bulgaria. He plans to meet the next day with Foreign Minister Laurent Fabius and President Francois Hollande, then return to Washington.\n\nThe visit by Kerry, who has family and childhood ties to the country and speaks fluent French, could address some of the criticism that the United States snubbed France in its darkest hour in many years.\n\nThe French press on Monday was filled with questions about why neither President Obama nor Kerry attended Sunday’s march, as about 40 leaders of other nations did. Obama was said to have stayed away because his own security needs can be taxing on a country, 

In [4]:
train_data.head(10)

Unnamed: 0,ID,title,text,label,X1,X2
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE,,
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE,,
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL,,
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE,,
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL,,
5,6903,"Tehran, USA","\nI’m not an immigrant, but my grandparents ...",FAKE,,
6,7341,Girl Horrified At What She Watches Boyfriend D...,"Share This Baylee Luciani (left), Screenshot o...",FAKE,,
7,95,‘Britain’s Schindler’ Dies at 106,A Czech stockbroker who saved more than 650 Je...,REAL,,
8,4869,Fact check: Trump and Clinton at the 'commande...,Hillary Clinton and Donald Trump made some ina...,REAL,,
9,2909,Iran reportedly makes new push for uranium con...,Iranian negotiators reportedly have made a las...,REAL,,


In [5]:
shifted_X1 = train_data.loc[(train_data['X1'] == 'REAL') | (train_data['X1'] == 'FAKE')]
fixed_X1 = pd.DataFrame(shifted_X1['title'].map(str) + ' ' + shifted_X1['text'].map(str) + ' ' + shifted_X1['label'].map(str))
fixed_X1.columns = ['concat']
fixed_X1['label'] = shifted_X1.X1
fixed_X1.head()

Unnamed: 0,concat,label
192,Election Day: No Legal Pot In Ohio Democrats ...,REAL
308,Who rode it best? Jesse Jackson mounts up to f...,FAKE
382,Black Hawk crashes off Florida human remains ...,REAL
660,Afghanistan: 19 die in air attacks on hospital...,REAL
889,Al Qaeda rep says group directed Paris magazin...,REAL


In [6]:
shifted_X2 = train_data.loc[(train_data['X2'] == 'REAL') | (train_data['X2'] == 'FAKE')]
fixed_X2 = pd.DataFrame(shifted_X2['title'].map(str) + ' ' + shifted_X2['text'].map(str) + ' ' + shifted_X2['label'].map(str) + ' ' + shifted_X2['X1'].map(str))
fixed_X2.columns = ['concat']
fixed_X2['label'] = shifted_X2.X2
fixed_X2.head()

Unnamed: 0,concat,label
2184,Planned Parenthood’s lobbying effort pay rais...,REAL
3537,Chart Of The Day: Since 2009—–Recovery For The...,FAKE


In [7]:
labeled = train_data.loc[(train_data['label'] == 'REAL') | (train_data['label'] == 'FAKE')]
fixed_labeled = pd.DataFrame(labeled['title'].map(str) + ' ' + labeled['text'].map(str))
fixed_labeled.columns = ['concat']
fixed_labeled['label'] = labeled.label
fixed_labeled.head()

Unnamed: 0,concat,label
0,You Can Smell Hillary’s Fear Daniel Greenfield...,FAKE
1,Watch The Exact Moment Paul Ryan Committed Pol...,FAKE
2,Kerry to go to Paris in gesture of sympathy U....,REAL
3,Bernie supporters on Twitter erupt in anger ag...,FAKE
4,The Battle of New York: Why This Primary Matte...,REAL


In [8]:
# df_REAL = train_data[train_data.label == 'REAL']
# df_REAL.shape

In [9]:
# df_FAKE = train_data[train_data.label == 'FAKE']
# df_FAKE.shape

In [10]:
# train_data = pd.concat([df_REAL, df_FAKE], axis=0)
# train_data.shape

In [11]:
# train_data = train_data.drop(['X1','X2','ID'], axis=1)
# train_data.head()

In [1]:
#hstack to join sparse matrices

In [12]:
concatenated = pd.concat([fixed_X1, fixed_X2, fixed_labeled], axis=0)
concatenated.shape

(3999, 2)

In [13]:
concatenated

Unnamed: 0,concat,label
192,Election Day: No Legal Pot In Ohio Democrats ...,REAL
308,Who rode it best? Jesse Jackson mounts up to f...,FAKE
382,Black Hawk crashes off Florida human remains ...,REAL
660,Afghanistan: 19 die in air attacks on hospital...,REAL
889,Al Qaeda rep says group directed Paris magazin...,REAL
911,Shallow 5.4 magnitude earthquake rattles centr...,FAKE
1010,ICE Agent Commits Suicide in NYC Leaves Note ...,FAKE
1043,Political Correctness for Yuengling Brewery W...,FAKE
1218,Poll gives Biden edge over Clinton against GOP...,REAL
1438,Russia begins airstrikes in Syria U.S. warns ...,REAL


## Baseline

In [14]:
# Extracting features from text files
baseline_data = concatenated.copy()
count_vect = CountVectorizer()
v_concat = count_vect.fit_transform(baseline_data.concat)
print(v_concat.shape)

(3999, 55777)


In [15]:
# Extracting Term Frequency times inverse document frequency.
tfidf_transformer = TfidfTransformer()
tf_v_concat = tfidf_transformer.fit_transform(v_concat)
print(tf_v_concat.shape) #Data type sparse matrix of type '<class 'numpy.float64'>'

(3999, 55777)


In [16]:
v_baseline = tf_v_concat.copy()
v_baseline

<3999x55777 sparse matrix of type '<class 'numpy.float64'>'
	with 1373526 stored elements in Compressed Sparse Row format>

### Random Forest

In [17]:
X = v_baseline
Y = concatenated['label']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
rf = RandomForestClassifier()
rf_clf = rf.fit(X_train, y_train)

y_test = np.array(y_test)
rf_Score = rf_clf.score(X_test,y_test)
# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(y_test, rf_clf.predict(X_test)))
print(confusion_matrix(y_test, rf_clf.predict(X_test)))
print('Accuracy:', rf_Score)



              precision    recall  f1-score   support

        FAKE       0.81      0.86      0.84       396
        REAL       0.86      0.80      0.83       404

   micro avg       0.83      0.83      0.83       800
   macro avg       0.83      0.83      0.83       800
weighted avg       0.83      0.83      0.83       800

[[342  54]
 [ 80 324]]
Accuracy: 0.8325


### Training Naive Bayes 

In [18]:
# Need to set a seed
X = v_baseline
Y = concatenated['label']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
clf_nb = MultinomialNB().fit(X_train, y_train)
predicted = clf_nb.predict(X_test)

y_test = np.array(y_test)
clf_Score = clf_nb.score(X_test,y_test)
# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(y_test, clf_nb.predict(X_test)))
print(confusion_matrix(y_test, clf_nb.predict(X_test)))
print('Accuracy:', clf_Score)


              precision    recall  f1-score   support

        FAKE       0.97      0.62      0.75       377
        REAL       0.74      0.99      0.85       423

   micro avg       0.81      0.81      0.81       800
   macro avg       0.86      0.80      0.80       800
weighted avg       0.85      0.81      0.80       800

[[232 145]
 [  6 417]]
Accuracy: 0.81125


### Support Vector Machine

In [19]:
X = v_baseline
Y = concatenated['label']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
text_clf_svm = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=123)

svm = text_clf_svm.fit(X_train, y_train)
predicted_svm = text_clf_svm.predict(X_test)

y_test = np.array(y_test)
svm_Score = svm.score(X_test,y_test)
# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(y_test, svm.predict(X_test)))
print(confusion_matrix(y_test, svm.predict(X_test)))
print('Accuracy:', svm_Score)

              precision    recall  f1-score   support

        FAKE       0.89      0.93      0.91       402
        REAL       0.93      0.88      0.90       398

   micro avg       0.91      0.91      0.91       800
   macro avg       0.91      0.91      0.91       800
weighted avg       0.91      0.91      0.91       800

[[375  27]
 [ 47 351]]
Accuracy: 0.9075




## Data Preparation

In [20]:
prep_data = concatenated.copy()

lemmatizer=WordNetLemmatizer()

words = stopwords.words("english")
prep_data['concat'] = prep_data['concat'].apply(lambda x:' '.join([lemmatizer.lemmatize(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())

#lemmatized_output = ' '.join([lemmatizer.lemmatize(w) for w in word_list])

#prep_data['concat'] = prep_data['concat'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())


In [21]:
prep_data.head()

Unnamed: 0,concat,label
192,election day no legal pot in ohio democrats lo...,REAL
308,who rode best jesse jackson mount fight pipeli...,FAKE
382,black hawk crash florida human remains found c...,REAL
660,afghanistan die air attack hospital u s invest...,REAL
889,al qaeda rep say group directed paris magazine...,REAL


In [22]:
prep_data = concatenated.copy()
stemmer = PorterStemmer()
#stemmer = SnowballStemmer(language='english') #try both Porter and Snowball

words = stopwords.words("english")
prep_data['concat'] = prep_data['concat'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())
#prep_data['text_cleaned'] = prep_data['text'].apply(lambda x: " ".join([stemmer.stem(i) for i in re.sub("[^a-zA-Z]", " ", x).split() if i not in words]).lower())




In [23]:
prep_data.head()

Unnamed: 0,concat,label
192,elect day no legal pot in ohio democrat lose i...,REAL
308,who rode best jess jackson mount fight pipelin...,FAKE
382,black hawk crash florida human remain found cn...,REAL
660,afghanistan die air attack hospit u s investig...,REAL
889,al qaeda rep say group direct pari magazin att...,REAL


TfidfVectorizer perform the tokenize,then count tokens, then transform the raw counts to TF/IDF Values. Test results with and without stop_words="english".

In [32]:
vectorizer = TfidfVectorizer(min_df= 3, stop_words="english", sublinear_tf=True, norm='l2', ngram_range=(1, 3))
#vectorizer = TfidfVectorizer(min_df= 3, sublinear_tf=True, norm='l2', ngram_range=(1, 3))


In [34]:
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],'tfidf__use_idf': (True, False),'clf-svm__alpha': (1e-2, 1e-3)}

X = prep_data['concat']
Y = prep_data['label']
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20)
pipeline = Pipeline([('vect', vectorizer),
                     #('chi',  SelectKBest(chi2, k=1200)),
                     #('clf', RandomForestClassifier()),
                     #('nb', MultinomialNB()),
                     ('svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=123))
                    ])

#gs_clf = GridSearchCV(pipeline, parameters, n_jobs=-1) #errors

# fitting our model and save it in a pickle for later use
model = pipeline.fit(X_train, y_train)
#model = gs_clf.fit(X_train, y_train)

# with open('RandomForest.pickle', 'wb') as f:
#     pickle.dump(model, f)

y_test = np.array(y_test)
clf_Score = model.score(X_test,y_test)

# confusion matrix and classification report(precision, recall, F1-score)
print(classification_report(y_test, model.predict(X_test)))
print(confusion_matrix(y_test, model.predict(X_test)))
print('Accuracy:', clf_Score)



              precision    recall  f1-score   support

        FAKE       0.90      0.95      0.93       397
        REAL       0.95      0.90      0.92       403

   micro avg       0.93      0.93      0.93       800
   macro avg       0.93      0.93      0.93       800
weighted avg       0.93      0.93      0.93       800

[[379  18]
 [ 41 362]]
Accuracy: 0.92625


In [35]:
# Perform 6-fold cross validation
scores = cross_val_score(pipeline, X, Y, cv=6)
print ("Cross validated scores:", scores)



Cross validated scores: [0.92353823 0.92803598 0.90854573 0.92953523 0.91891892 0.9112782 ]
