In [81]:
import pandas as pd
import numpy as np

# import machine learning libraries from sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# visualizations
import seaborn as sns
from matplotlib import pyplot as plt

In [101]:
# Load the news dataset, keeping only the three columns used below.
# dtype=object makes pandas read every column without dtype inference.
df = pd.read_csv(
    'data/all_news.csv',
    encoding="ISO-8859-1",
    usecols=['user', 'text', 'veracity'],
    dtype=object,
)

In [102]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,user,text,veracity
0,hyddrox,RT @nyinvesting: The story of Hillary's defens...,False
1,BBCNews,The UK High Street is losing another brand fol...,True
2,TheEconomist,"On hearing swear words, people's heart rates s...",True
3,TheEconomist,"For some passengers, the aesthetic qualities o...",True
4,WSJ,"The real Greek drama is about reforms, not deb...",True


In [103]:
### Preprocessing

# Drop every row that has a missing value in any of the loaded columns.
df = df.dropna()
# Cast all values to str. `astype(str)` replaces `DataFrame.applymap(str)`,
# which is deprecated in recent pandas releases, and performs the same
# element-wise string conversion.
df = df.astype(str)

In [104]:
# Positional 70/30 split (no shuffling): the first 70% of rows become the
# training frame and the remaining rows become the test frame.
split_at = int(len(df) * 0.7)
df_train = df.iloc[:split_at]
df_test = df.iloc[split_at:]

In [105]:
# Target labels: the 'veracity' column of each split.
y_train, y_test = df_train['veracity'], df_test['veracity']

### TF-IDF Vectorizer

In [106]:
# TF-IDF features over 1- to 3-grams. The vocabulary is fitted on the
# training text only and then reused to transform the test text.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
X_train = vectorizer.fit_transform(df_train['text'])
X_test = vectorizer.transform(df_test['text'])

In [107]:
# Sanity check: print the feature-matrix shape next to the label shape for
# each split so the row counts can be compared.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(193518, 2797359) (193518,)
(82937, 2797359) (82937,)


### Multinomial Naive Bayes Classifier

In [108]:
# Fit a multinomial Naive Bayes model on the training features, then
# predict a label for every row of the test features.
classifier = MultinomialNB().fit(X_train, y_train)
preds = classifier.predict(X_test)

In [109]:
# One prediction is expected per test label.
preds.shape[0] == y_test.shape[0]

True

### Predicted Values

In [110]:
# Add the model output as a 'prediction' column alongside the true labels.
df_test = df_test.assign(prediction=preds)

In [112]:
# Preview the first five test rows with their predictions.
df_test.head(n=5)

Unnamed: 0,user,text,veracity,prediction
193546,TheEconomist,The signs are that Britons will continue to sh...,True,True
193547,mil0blake,RT @windy_mills: He still thinks he invented t...,False,False
193548,tpartynews,"""Trump has nobody helping him other than the p...",False,False
193549,BBCNews,Cameron to face EU leaders for first time sinc...,True,True
193550,PBS,What the super PAC funded by Cards Against Hum...,True,True


In [113]:
# Fraction of test rows whose predicted label equals the true label.
score = accuracy_score(y_true=y_test, y_pred=preds)
score

0.97763362552298738

In [115]:
# First few test rows where the prediction disagrees with the true label.
df_test.loc[df_test['prediction'].ne(df_test['veracity'])].head()

Unnamed: 0,user,text,veracity,prediction
193630,WSJ,Ford to more than double Mexico production cap...,True,False
193638,BBCNews,Cash machine blast gang jailedhttp://bbc.in/2i...,True,False
193663,NPR,"In Visit With Seniors, This Boy Learned Lesson...",True,False
193675,BBCNews,Ferrari beat Mercedes in final practicehttp://...,True,False
193698,BBCNews,Jessica Lawson death: Family still waiting for...,True,False
