## Fake News Classifier

### Dataset: https://www.kaggle.com/c/fake-news/data

In [1]:
import re
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.shape

(20800, 5)

In [5]:
df.dropna(inplace = True)
df.shape

(18285, 5)

In [6]:
df.reset_index(inplace = True, drop = True)
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


### Using Bag of Words

In [7]:
# text preprocessing on title of each news
ps = PorterStemmer()
corpus = []
for i in range(len(df)):
    sentence = re.sub('^a-zA-Z', ' ', df['title'][i])
    sentence = sentence.lower()
    words = sentence.split()
    words = [ps.stem(word) for word in words if word not in stopwords.words('english')]
    sentence = ' '.join(words)
    corpus.append(sentence)

In [8]:
# creating bag of words model
cv = CountVectorizer(max_features = 5000, ngram_range = (1, 3))
X = cv.fit_transform(corpus).toarray()
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

#### Multinomial Naive Bayes

In [9]:
classifier = MultinomialNB()
max_score = 0
for alpha in np.arange(0, 1, 0.1):
    sub_classifier = MultinomialNB(alpha = alpha)
    sub_classifier.fit(X_train, y_train)
    y_preds = sub_classifier.predict(X_test)
    score = accuracy_score(y_test, y_preds)
    if max_score < score:
        max_score = score
        classifier = sub_classifier
    print(f"Alpha: {alpha}, Score: {score}")

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha: 0.0, Score: 0.8846727423363712
Alpha: 0.1, Score: 0.8990886495443248
Alpha: 0.2, Score: 0.9000828500414251
Alpha: 0.30000000000000004, Score: 0.899917149958575
Alpha: 0.4, Score: 0.899751449875725
Alpha: 0.5, Score: 0.8990886495443248
Alpha: 0.6000000000000001, Score: 0.8995857497928749
Alpha: 0.7000000000000001, Score: 0.8992543496271748
Alpha: 0.8, Score: 0.8995857497928749
Alpha: 0.9, Score: 0.8994200497100249


#### Passive Aggressive Classifier

In [10]:
linear_clf = PassiveAggressiveClassifier()
linear_clf.fit(X_train, y_train)
preds = linear_clf.predict(X_test)
score = accuracy_score(y_test, preds)
print(f"Score: {score}")

Score: 0.9193040596520299


#### Real & Fake Features

In [11]:
feature_names = cv.get_feature_names()

In [12]:
# most real features
sorted(zip(feature_names, classifier.coef_[0]), reverse = True)[:20]

[('не', -8.955919475263002),
 ('на', -8.500443946580177),
 ('zuckerberg', -9.441427291044704),
 ('zu', -9.169493575561061),
 ('zone new york', -12.214016013284484),
 ('zone new', -12.214016013284484),
 ('zone', -8.28219038056016),
 ('zionist', -8.955919475263002),
 ('zika viru', -9.816120740486113),
 ('zika', -8.630497074828375),
 ('zero', -9.169493575561061),
 ('zealand', -9.169493575561061),
 ('youtub', -8.780028808799338),
 ('youth', -8.500443946580177),
 ('yourself', -9.816120740486113),
 ('young', -7.703156506767634),
 ('you', -7.043532018246333),
 ('yorker', -9.441427291044704),
 ('york time', -8.630497074828375),
 ('york new york', -12.214016013284484)]

In [13]:
# most fake features
sorted(zip(feature_names, classifier.coef_[0]))[:20]

[('000', -6.793481014012198),
 ('000 email', -9.169493575561061),
 ('000 job', -12.214016013284484),
 ('00pm', -8.385374616795389),
 ('00pm water', -8.630497074828375),
 ('00pm water cooler', -8.630497074828375),
 ('01', -9.169493575561061),
 ('10', -6.506905748535608),
 ('10 000', -9.441427291044704),
 ('10 year', -10.42225654405643),
 ('100', -7.649667821816648),
 ('100 000', -9.169493575561061),
 ('100 day', -10.42225654405643),
 ('100 million', -10.42225654405643),
 ('1000', -9.169493575561061),
 ('100percentfedup', -9.169493575561061),
 ('100percentfedup com', -9.169493575561061),
 ('11', -6.910711105225408),
 ('11 16', -8.630497074828375),
 ('11 2016', -8.630497074828375)]

### Using TFIDF

In [14]:
# text preprocessing on text of each news
ps = PorterStemmer()
corpus = []
for i in range(len(df)):
    sentence = re.sub('^a-zA-Z', ' ', df['text'][i])
    sentence = sentence.lower()
    words = sentence.split()
    words = [ps.stem(word) for word in words if word not in stopwords.words('english')]
    sentence = ' '.join(words)
    corpus.append(sentence)

In [15]:
# creating bag of words model
tf = CountVectorizer(max_features = 5000, ngram_range = (1, 3))
X = tf.fit_transform(corpus).toarray()
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 0)

#### Multinomial Naive Bayes

In [16]:
classifier = MultinomialNB()
max_score = 0
for alpha in np.arange(0, 1, 0.1):
    sub_classifier = MultinomialNB(alpha = alpha)
    sub_classifier.fit(X_train, y_train)
    y_preds = sub_classifier.predict(X_test)
    score = accuracy_score(y_test, y_preds)
    if max_score < score:
        max_score = score
        classifier = sub_classifier
    print(f"Alpha: {alpha}, Score: {score}")

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha: 0.0, Score: 0.9106876553438277
Alpha: 0.1, Score: 0.9090306545153273
Alpha: 0.2, Score: 0.9088649544324772
Alpha: 0.30000000000000004, Score: 0.9083678541839271
Alpha: 0.4, Score: 0.9085335542667772
Alpha: 0.5, Score: 0.908036454018227
Alpha: 0.6000000000000001, Score: 0.908036454018227
Alpha: 0.7000000000000001, Score: 0.908036454018227
Alpha: 0.8, Score: 0.907870753935377
Alpha: 0.9, Score: 0.907705053852527


#### Passive Aggressive Classifier

In [17]:
linear_clf = PassiveAggressiveClassifier()
linear_clf.fit(X_train, y_train)
preds = linear_clf.predict(X_test)
score = accuracy_score(y_test, preds)
print(f"Score: {score}")

Score: 0.9373653686826844


#### Real & Fake Features

In [18]:
feature_names = cv.get_feature_names()

In [19]:
# most real features
sorted(zip(feature_names, classifier.coef_[0]), reverse = True)[:20]

[('не', -8.547847523524862),
 ('на', -8.056937209459923),
 ('zuckerberg', -8.935168379509491),
 ('zu', -8.223521185205836),
 ('zone new york', -8.269362544554067),
 ('zone new', -8.817846492120491),
 ('zone', -9.396084792153626),
 ('zionist', -8.353857222896579),
 ('zika viru', -9.141850653769533),
 ('zika', -8.914633777067792),
 ('zero', -8.59101969539006),
 ('zealand', -9.719484947207235),
 ('youtub', -8.712855048251352),
 ('youth', -9.332368977767558),
 ('yourself', -9.20502955539103),
 ('young', -10.102477199462916),
 ('you', -9.834997834328968),
 ('yorker', -7.905252622127882),
 ('york time', -8.999430313727832),
 ('york new york', -10.893604788381458)]

In [20]:
# most fake features
sorted(zip(feature_names, classifier.coef_[0]))[:20]

[('000', -9.931328943267296),
 ('000 email', -6.842331480397781),
 ('000 job', -10.089231972712913),
 ('00pm', -10.376914045164256),
 ('00pm water', -7.057954761740763),
 ('00pm water cooler', -9.624174767402993),
 ('01', -10.292830927953855),
 ('10', -10.171470070949773),
 ('10 000', -7.926211473148722),
 ('10 year', -9.79499249971531),
 ('100', -7.3475434124631045),
 ('100 000', -7.9216831635239755),
 ('100 day', -11.161868774975233),
 ('100 million', -8.556334137402178),
 ('1000', -8.471930323819652),
 ('100percentfedup', -7.7774785116331735),
 ('100percentfedup com', -9.63247357021768),
 ('11', -10.157285435957835),
 ('11 16', -8.173858547518805),
 ('11 2016', -8.326395542955199)]