# Objective
To predict from the Title and the Text of an Article if the news is Real or Fake.

In [208]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import pickle

In [209]:
import pandas as pd

In [210]:
data=pd.read_csv('fake_or_real_news.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,title,text,label
0,8476,You Can Smell Hillary’s Fear,"Daniel Greenfield, a Shillman Journalism Fello...",FAKE
1,10294,Watch The Exact Moment Paul Ryan Committed Pol...,Google Pinterest Digg Linkedin Reddit Stumbleu...,FAKE
2,3608,Kerry to go to Paris in gesture of sympathy,U.S. Secretary of State John F. Kerry said Mon...,REAL
3,10142,Bernie supporters on Twitter erupt in anger ag...,"— Kaydee King (@KaydeeKing) November 9, 2016 T...",FAKE
4,875,The Battle of New York: Why This Primary Matters,It's primary day in New York and front-runners...,REAL


In [211]:
data.shape

(6335, 4)

In [250]:
y=data['label']
x=data['text']+data['title']  #Merging both of the columns to obtain a single col

In [213]:
len(x)

6335

In [214]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.33,random_state=44) 

In order to preprocess(tokenize) text we will use a CountVectorizer and also find the Tfidf for the text data.
We will utilize the method which gives us the best result.

In [215]:
#nltk.download('wordnet')

In [216]:
import nltk
from nltk.stem import WordNetLemmatizer
#Bag-of-words
cv=CountVectorizer(stop_words="english")
tf = TfidfVectorizer(stop_words='english',max_df=0.7) #Removes Stopwords
#Turns words into their base form
wl=WordNetLemmatizer()

In [241]:
#Creating pickles for the transformers
wl=WordNetLemmatizer()
pickle.dump(wl,open("wl.pkl","wb"))
tf = TfidfVectorizer(stop_words='english',max_df=0.7) #Removes Stopwords
tf.fit(x_train)
pickle.dump(tf,open("tfidf.pkl","wb"))

In [254]:
c=x[4]
c=pd.Series(c)
c=c.apply(wl.lemmatize)
c=tf.transform(c)
print(nb.predict(c))

['REAL']


In [220]:
x_train=x_train.apply(wl.lemmatize)
x_test=x_test.apply(wl.lemmatize)

In [221]:
x_train_count=cv.fit_transform(x_train)
x_test_count=cv.transform(x_test)
x_train_tfidf=tf.fit_transform(x_train)
x_test_tfidf=tf.transform(x_test)

In [222]:
print(cv.get_feature_names()[0:10])

['00', '000', '0000', '000000031', '000035', '000billion', '000ft', '000x', '001', '0011']


# Using Naive Bayes for text classification
Naive Bayes model works well with text.

# Hyperparameter tuning

In [223]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
# Create the list of alphas: alphas
import numpy as np
alphas = np.arange(0,1,0.1) #From 0 to 1 with 0.1 intervals

# Define train_and_predict()
def train_and_predict(alpha):
    # Instantiate the classifier: nb_classifier
    nb_classifier = MultinomialNB(alpha=alpha)
    # Fit to the training data
    nb_classifier.fit(x_train_tfidf,y_train)
    # Predict the labels: pred
    pred = nb_classifier.predict(x_test_tfidf)
    # Compute accuracy: score
    score = metrics.accuracy_score(y_test,pred)
    return score

# Iterate over the alphas and print the corresponding score
for alpha in alphas:
    print('Alpha: ', alpha)
    print('Score: ', train_and_predict(alpha))
    print()

Alpha:  0.0


  'setting alpha = %.1e' % _ALPHA_MIN)


Score:  0.8943089430894309

Alpha:  0.1
Score:  0.9105691056910569

Alpha:  0.2
Score:  0.9019607843137255

Alpha:  0.30000000000000004
Score:  0.8957436633189861

Alpha:  0.4
Score:  0.890961262553802

Alpha:  0.5
Score:  0.8880918220946915

Alpha:  0.6000000000000001
Score:  0.882831181252989

Alpha:  0.7000000000000001
Score:  0.877092300334768

Alpha:  0.8
Score:  0.8713534194165471

Alpha:  0.9
Score:  0.8689622190339551



We can see that the optimum value of alpha is 0.1.

In [224]:
nb=MultinomialNB(alpha=0.1) #Multinomial Naive Bayes model
nb2=MultinomialNB(alpha=0.1)

In [225]:
x_test_count

<2091x56885 sparse matrix of type '<class 'numpy.int64'>'
	with 553473 stored elements in Compressed Sparse Row format>

In [226]:
nb.fit(x_train_count,y_train)
pred=nb.predict(x_test_count)
metrics.accuracy_score(y_test,pred)

0.9086561453849833

In [227]:
nb2.fit(x_train_tfidf,y_train)
pred2=nb2.predict(x_test_tfidf)
metrics.accuracy_score(y_test,pred)

0.9086561453849833

So, both the techniques give us a similar result.

Also, our predictor works reasonably well i.e. with an accuracy of above 90% on the test set.

In [228]:
metrics.confusion_matrix(y_test,pred)

array([[910, 107],
       [ 84, 990]], dtype=int64)

# Interpreting the model

In [229]:

# Get the class labels: class_labels
class_labels = nb.classes_

# Extract the features: feature_names
feature_names = tf.get_feature_names()

# Zip the feature names together with the coefficient array and sort by weights: feat_with_weights
feat_with_weights = sorted(zip(nb.coef_[0], feature_names))

In [230]:
# Print the first class label and the top 30 feat_with_weights entries (indicative of FAKE news)
class_labels[0], feat_with_weights[:30]  

('FAKE',
 [(-16.08754726423953, '000035'),
  (-16.08754726423953, '000billion'),
  (-16.08754726423953, '0011'),
  (-16.08754726423953, '004s'),
  (-16.08754726423953, '005'),
  (-16.08754726423953, '005s'),
  (-16.08754726423953, '00684'),
  (-16.08754726423953, '006s'),
  (-16.08754726423953, '007'),
  (-16.08754726423953, '007s'),
  (-16.08754726423953, '008s'),
  (-16.08754726423953, '00am'),
  (-16.08754726423953, '00pm'),
  (-16.08754726423953, '013c2812c9'),
  (-16.08754726423953, '015'),
  (-16.08754726423953, '016'),
  (-16.08754726423953, '01am'),
  (-16.08754726423953, '020'),
  (-16.08754726423953, '022'),
  (-16.08754726423953, '02714'),
  (-16.08754726423953, '02870'),
  (-16.08754726423953, '030'),
  (-16.08754726423953, '031'),
  (-16.08754726423953, '032'),
  (-16.08754726423953, '0325'),
  (-16.08754726423953, '033'),
  (-16.08754726423953, '03747'),
  (-16.08754726423953, '039'),
  (-16.08754726423953, '03eb'),
  (-16.08754726423953, '04pm')])

In [231]:
# Print the second class label and the bottom 30 feat_with_weights entries (indicative of REAL news)
class_labels[1], feat_with_weights[-30:]

('REAL',
 [(-6.2395195689077685, 'going'),
  (-6.233722756722502, 'news'),
  (-6.225871779517841, 'white'),
  (-6.221190610041609, 'gop'),
  (-6.212407747218889, 'bush'),
  (-6.16518889095315, 'democratic'),
  (-6.1486443064824465, 'cruz'),
  (-6.125743634880566, 'year'),
  (-6.125272070220313, 'presidential'),
  (-6.118693408272149, 'voters'),
  (-6.113088797832822, 'republicans'),
  (-6.08100685664732, 'political'),
  (-6.008385197631241, 'percent'),
  (-5.968504882037868, 'house'),
  (-5.966893992075728, 'sanders'),
  (-5.949343384036576, 'like'),
  (-5.942644115601743, 'states'),
  (-5.934819655667917, 'just'),
  (-5.904876149506519, 'time'),
  (-5.765645473309027, 'party'),
  (-5.664343724591964, 'republican'),
  (-5.571824366398973, 'campaign'),
  (-5.51492350110496, 'new'),
  (-5.467904263094537, 'president'),
  (-5.467415755232057, 'people'),
  (-5.448068582151567, 'obama'),
  (-5.426272452663007, 'state'),
  (-4.89771000298504, 'clinton'),
  (-4.525822111335696, 'trump'),
  (-

So, this model makes some sense. Most of the words indicating REAL news are related to politics while the ones indicating FAKE news are pretty much gibberish.

There is still some room for improvement in this model. We can eliminate a few small words (let's say less than 2 letters) maybe even single digit numbers and see how the model behaves.

We can also try some other ML algorithms to see how the accuracy changes. 

In [232]:
#Saving the model using pickle
pickle.dump(nb2,open('model.pkl','wb'))