In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import itertools
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split as tts
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [2]:
# Load the fake-news training CSV and preview its first two rows.
df = pd.read_csv('/kaggle/input/fake-news/train.csv')
df.head(2)

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0


In [3]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [4]:
# Discard every row that has at least one missing value, then renumber the rows.
# reset_index() is called without drop=True, so the previous index is kept as
# a new 'index' column in the frame.
df = df.dropna(axis=0).reset_index()

In [5]:
# Show (rows, columns) of the frame after the missing-value rows were dropped.
df.shape

(18285, 6)

In [6]:
# Re-check the per-column missing-value counts after cleaning.
df.isna().sum()

index     0
id        0
title     0
author    0
text      0
label     0
dtype: int64

In [7]:
# Feature frame: every column of df except the target ('label') and 'id'.
x = df.drop(columns=['label', 'id'])
x.head(2)

Unnamed: 0,index,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...


In [8]:
# Target vector: the 'label' column of df.
y = df.loc[:, 'label']
y.head()

0    1
1    0
2    1
3    1
4    1
Name: label, dtype: int64

In [9]:
# Features and target should have the same number of rows.
(x.shape, y.shape)

((18285, 4), (18285,))

In [10]:
# Work on an independent (deep) copy so text preprocessing reads from msgs, not df.
msgs = df.copy(deep=True)
msgs.shape

(18285, 6)

In [11]:
# Single PorterStemmer instance; the corpus-building cell calls ps.stem() on each kept token.
ps = PorterStemmer()

In [12]:
# Build the cleaned title corpus. For every title: replace non-letters with
# spaces, lowercase, split on whitespace, drop English stop words, Porter-stem
# the remaining tokens and join them back into one string.
#
# The stop-word list is loaded once into a set. Calling
# stopwords.words('english') inside the comprehension re-read the list for
# every single token and did a linear list lookup each time.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of label-indexing
# msgs['title'][i], so the loop does not depend on the index being 0..n-1.
for title in msgs['title']:
    tokens = re.sub('[^a-zA-Z]', ' ', title).lower().split()
    corpus.append(' '.join(ps.stem(w) for w in tokens if w not in stop_words))
corpus[:5]

['hous dem aid even see comey letter jason chaffetz tweet',
 'flynn hillari clinton big woman campu breitbart',
 'truth might get fire',
 'civilian kill singl us airstrik identifi',
 'iranian woman jail fiction unpublish stori woman stone death adulteri']

In [13]:
# Bag-of-words over 1- to 4-grams, limited to 5000 features, converted from
# the sparse result to a dense array.
# NOTE(review): the vocabulary is fitted on the whole corpus, before the
# train/test split that happens in a later cell.
cv = CountVectorizer(ngram_range=(1, 4), max_features=5000)
x = cv.fit_transform(corpus).toarray()

In [14]:
# The vectorized features and the target must still line up row for row.
(x.shape, y.shape)

((18285, 5000), (18285,))

In [15]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable.
xtr, xte, ytr, yte = tts(x, y, random_state=0, test_size=0.2)

In [16]:
# Peek at the first few vocabulary entries.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old method otherwise.
names_fn = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
list(names_fn()[:5])



['abandon', 'abc', 'abc news', 'abduct', 'abe']

In [17]:
# Wrap the dense count matrix in a DataFrame whose columns are the n-gram names.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old method otherwise.
names_fn = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
count_df = pd.DataFrame(x, columns=list(names_fn()))
count_df.shape

(18285, 5000)

In [18]:
# Baseline: multinomial Naive Bayes with its default hyper-parameters,
# trained on the training split and scored on the held-out split.
cls = MultinomialNB().fit(xtr, ytr)
yp = cls.predict(xte)
print(accuracy_score(yte, yp))

0.9004648619086683


In [19]:
# from sklearn.linear_model import LogisticRegression
# lr = LogisticRegression()
# lr.fit(xtr, ytr)
# y_lr = lr.predict(xte)
# print(accuracy_score(yte, y_lr))

In [20]:
# from sklearn.ensemble import RandomForestClassifier
# rfc = RandomForestClassifier()
# rfc.fit(xtr, ytr)
# y_rfc = rfc.predict(xte)
# print(accuracy_score(yte, y_rfc))

In [21]:
# Same model as the baseline but with the smoothing parameter alpha set to 0.9;
# this rebinds cls and yp, which later cells read.
cls = MultinomialNB(alpha=0.9).fit(xtr, ytr)
yp = cls.predict(xte)
print(accuracy_score(yte, yp))

0.9010117582718075


In [22]:
# The 20 n-grams with the lowest log-probability under class index 0.
# feature_log_prob_[0] is the row for the classifier's first class; sorting
# ascending puts the most negative (least likely for that class) values first.
# NOTE(review): presumably label 0 means "real"/reliable news, which would make
# these the tokens that lean most towards fake — confirm against the dataset description.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; use get_feature_names_out() when the installed version has
# it and fall back to the old method otherwise.
names_fn = getattr(cv, 'get_feature_names_out', None) or cv.get_feature_names
sorted(zip(cls.feature_log_prob_[0], names_fn()))[:20]



[(-11.641260044187877, 'access pipelin protest'),
 (-11.641260044187877, 'acknowledg emf'),
 (-11.641260044187877, 'acknowledg emf damag'),
 (-11.641260044187877, 'acquit'),
 (-11.641260044187877, 'adhd'),
 (-11.641260044187877, 'airstrik kill'),
 (-11.641260044187877, 'al nusra'),
 (-11.641260044187877, 'alien muslim'),
 (-11.641260044187877, 'american concern'),
 (-11.641260044187877, 'american concern elect'),
 (-11.641260044187877, 'american concern elect violenc'),
 (-11.641260044187877, 'american peopl defeat'),
 (-11.641260044187877, 'american peopl defeat oligarchi'),
 (-11.641260044187877, 'ariel noyola'),
 (-11.641260044187877, 'ariel noyola rodr'),
 (-11.641260044187877, 'ariel noyola rodr guez'),
 (-11.641260044187877, 'ask question'),
 (-11.641260044187877, 'auf'),
 (-11.641260044187877, 'avail'),
 (-11.641260044187877, 'babi powder')]