In [1]:
import re
import pandas as pd
import numpy as np

# import machine learning libraries from sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# visualizations
import seaborn as sns
from matplotlib import pyplot as plt

In [2]:
# Load the tweet corpus. Only the three columns used below are read, and
# dtype=object keeps every value as-is instead of letting pandas infer types.
df = pd.read_csv(
    'data/all_news.csv',
    encoding="ISO-8859-1",
    usecols=['user', 'text', 'veracity'],
    dtype=object,
)

In [3]:
# preview the first five rows of the loaded frame
df.head()

Unnamed: 0,user,text,veracity
0,patriotraphael,"RT @Conservatexian: News post: ""Liberals Call ...",False
1,CBSNews,North Korea poised to make unprecedented missi...,True
2,michellearry,RT @Midgespeaks: From your mouth to God's ears...,False
3,giselleevns,RT @cd3isme: What Does The Fox Say????? #MakeM...,False
4,tpartynews,'Stop trying to fix the police! Fix the ghetto...,False


In [4]:
# number of rows currently in df
len(df)

276490

In [6]:
### Preprocessing

# Clean the frame in a single chain:
#   1. drop rows that contain any missing value
#   2. drop exact duplicate rows
#   3. cast every column to str
# astype(str) is used instead of applymap(str): applymap is deprecated since
# pandas 2.1 (it emits a FutureWarning), and astype performs the same
# conversion without a per-cell Python callback.
df = df.dropna().drop_duplicates().astype(str)

In [7]:
# Positional 70/30 partition: the first 70% of rows become the training set,
# the remaining rows the test set.
# NOTE(review): rows are not shuffled, so any ordering present in the CSV
# carries straight into the split - confirm that this is intended.
split_at = int(len(df) * 0.7)
df_train = df.iloc[:split_at]
df_test = df.iloc[split_at:]

In [8]:
# target labels for each partition come from the 'veracity' column
y_train, y_test = df_train['veracity'], df_test['veracity']

### TF-IDF Vectorizer

In [9]:
# TF-IDF features over unigrams, bigrams and trigrams.
# The vectorizer is fitted on the training text only; the test text is then
# transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),
    stop_words='english',
    max_df=0.5,
    sublinear_tf=True,
)
X_train = vectorizer.fit_transform(df_train['text'])
X_test = vectorizer.transform(df_test['text'])

In [10]:
# sanity check: each feature matrix should have one row per label
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(features.shape, labels.shape)

(189396, 2420853) (189396,)
(81170, 2420853) (81170,)


### Multinomial Naive Bayes Classifier

In [11]:
# Fit a multinomial Naive Bayes model with default hyperparameters on the
# training features (fit() returns the estimator itself), then label the test rows.
clf = MultinomialNB().fit(X_train, y_train)
preds = clf.predict(X_test)

In [12]:
# is there exactly one prediction per test label?
len(y_test) == len(preds)

True

### Predicted Values

In [13]:
# add the model output as a 'prediction' column; assign() returns a new frame,
# which is rebound to df_test
df_test = df_test.assign(prediction = preds)

In [14]:
# preview the first five test rows, now including the 'prediction' column
df_test.head()

Unnamed: 0,user,text,veracity,prediction
192451,lazykstafford,babies and dogs(: http://t.co/S3zsklh2MO,False,False
192452,NBCNews,Georgia cops hunt for second suspect in basket...,True,True
192453,paulinett,RT @chinedudiokpa: people are really comparing...,False,False
192454,leroylovesusa,RT @TeamTrumpAZ: #RIP https://t.co/tT5dnYICCr,False,False
192455,BBCNews,"Wednesday's Telegraph: ""Pension cap to cost bi...",True,True


In [15]:
# fraction of test rows whose predicted label equals the true label
score = accuracy_score(y_true=y_test, y_pred=preds)
score

0.8406554145620303

In [16]:
# first ten test rows where the prediction disagrees with the true label
is_wrong = df_test['prediction'].ne(df_test['veracity'])
df_test[is_wrong].head(10)

Unnamed: 0,user,text,veracity,prediction
192457,NBCNews,"Drinking urine, eating twigs? Expert share fro...",True,False
192462,BBCNews,"Poverty costs UK £78bn a year, Joseph Rowntree...",True,False
192464,NBCNews,Trump names billionaire Carl Icahn as special ...,True,False
192465,NPR,Baby Bison That Was Placed In A Van By Tourist...,True,False
192473,BBCNews,New badger culling trials given go ahead acros...,True,False
192487,NBCNews,Colossal report on Iraq war to finally be rele...,True,False
192490,WSJ,"Shake-up in Trump transition team, social medi...",True,False
192491,PBS,@emilybethhill Monday morning.,True,False
192495,ABC,Giuliani on whether Trump will attack Bill Cli...,True,False
192502,CBSNews,Banned from Britain: From Martha Stewart to th...,True,False


In [17]:
def show_most_informative_features(vectorizer, clf, n=20):
    """Print the n lowest- and n highest-weighted features of a fitted model.

    Every printed row pairs one feature from the bottom of the weight ranking
    (left columns) with one from the top of the ranking (right columns), each
    shown as ``weight<TAB>feature name``.

    Parameters
    ----------
    vectorizer
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    clf
        Fitted classifier exposing ``feature_log_prob_`` (Naive Bayes models)
        or ``coef_`` (linear models).
    n : int, default 20
        Number of rows to print.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement and fall back only when running on an older release.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = names_getter()

    # MultinomialNB.coef_ was removed in scikit-learn 1.1. It used to mirror
    # feature_log_prob_: row 1 for a two-class model, row 0 otherwise, so the
    # same row is picked here to keep the printed numbers unchanged.
    log_prob = getattr(clf, "feature_log_prob_", None)
    if log_prob is not None:
        weights = log_prob[1] if len(log_prob) == 2 else log_prob[0]
    else:
        weights = clf.coef_[0]

    # Sorting (weight, name) tuples breaks weight ties alphabetically by name.
    coefs_with_fns = sorted(zip(weights, feature_names))
    # Left: n smallest in ascending order. Right: n largest in descending order.
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))

In [18]:
# print the lowest- and highest-weighted n-grams of the fitted model (default n=20 rows)
show_most_informative_features(vectorizer, clf)

	-14.7878	00 00          		-8.8146	com            
	-14.7878	00 00 00       		-8.9069	twitter        
	-14.7878	00 00 000      		-8.9091	twitter com    
	-14.7878	00 000         		-9.1895	pic twitter com
	-14.7878	00 000 honeymoon		-9.1895	pic twitter    
	-14.7878	00 04          		-9.1972	pic            
	-14.7878	00 04 08       		-9.2796	trump          
	-14.7878	00 16          		-9.2922	http           
	-14.7878	00 16 https    		-9.3642	new            
	-14.7878	00 21          		-9.4702	says           
	-14.7878	00 21 00       		-9.6291	pbs            
	-14.7878	00 51          		-9.6817	watch          
	-14.7878	00 51 cst      		-9.8244	police         
	-14.7878	00 automobile  		-9.8457	org            
	-14.7878	00 automobile executives		-9.8514	clinton        
	-14.7878	00 biz         		-9.8557	pbs org        
	-14.7878	00 biz https   		-9.8632	http pbs org   
	-14.7878	00 blacktwitter		-9.8632	http pbs       
	-14.7878	00 blacktwitter blackownedstores		-9.8648	year           
	-14

In [23]:
# read a second CSV (nbc_dataset/tweets.csv) with pandas' default parsing options
# NOTE(review): the execution count jumps from 18 to 23 here - confirm this
# cell still runs in order on a fresh kernel
prop = pd.read_csv('nbc_dataset/tweets.csv')

In [51]:
# show the 'text' value of the row labelled 28 with a single .loc lookup
prop.loc[28, 'text']

"When I remember it's #Friyay https://t.co/yjBTsaFaR2"