In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, roc_auc_score
from sklearn.pipeline import Pipeline

import seaborn as sns, matplotlib.pyplot as plt

import re

import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [None]:
# Read the fake-news articles from the Kaggle dataset and preview the first rows
fake = pd.read_csv(
    filepath_or_buffer='/kaggle/input/fake-and-real-news-dataset/Fake.csv'
)
fake.head(n=5)

In [None]:
# Every fake article gets the positive class: label = 1
fake = fake.assign(label=1)
fake.head(n=5)

In [None]:
# Read the genuine-news articles from the Kaggle dataset and preview the first rows
true = pd.read_csv(
    filepath_or_buffer='/kaggle/input/fake-and-real-news-dataset/True.csv'
)
true.head(n=5)

In [None]:
# Every genuine article gets the negative class: label = 0
true = true.assign(label=0)
true.head(n=5)

In [None]:
# Number of rows in the true and fake frames, in that order
true.shape[0], fake.shape[0]

In [None]:
# Keep the first 1000 fake and the first 1000 true articles (2000 rows in total)
# and stack them into a single frame.
df = pd.concat([fake.iloc[:1000, :], true.iloc[:1000, :]], axis=0)

# Shuffle all rows so the two classes are interleaved. The fixed random_state
# keeps the row order -- and therefore the train/test split and every score
# computed from it -- identical across "Restart & Run All" executions.
df = df.sample(frac=1, random_state=100).reset_index(drop=True)
df.head()

In [None]:
# Merge the title and the body into a single 'text' column and keep only the
# columns used for modelling ('text' and 'label').
# The ' ' separator matters: without it the last word of the title is glued to
# the first word of the body, which creates a bogus token and loses two real ones.
df = df.assign(text=df['title'] + ' ' + df['text'])[['text', 'label']]
df.head()

In [None]:
# Model input is the merged article text; the target is the fake(1)/true(0) label
X, y = df['text'], df['label']

## Processing

In [None]:
# Text preprocessing: turn every raw article into a space-joined string of
# stemmed, lower-case, non-stop-word tokens. The results are collected in
# `corpus`, in the same order as `X`.
corpus = []
ps = PorterStemmer()

# Load the English stop words once, as a set. Calling stopwords.words('english')
# inside the token loop re-reads the list for every single token and then scans
# it linearly; a set built up front gives the same filtering at a fraction of
# the cost.
stop_words = set(stopwords.words('english'))

for document in X:
    # Replace everything except ASCII letters with a space, then lower-case
    cleaned = re.sub('[^a-zA-Z]', ' ', document).lower()
    # Word tokenizing
    tokens = nltk.word_tokenize(cleaned)
    # Drop English stop words and stem whatever remains
    stems = [ps.stem(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(stems))

In [None]:
# 70/30 train/test split of the processed corpus, stratified on the label so
# both splits keep the same fake/true ratio; seeded for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    corpus,
    y,
    train_size=0.7,
    stratify=y,
    random_state=100,
)

In [None]:
# Two candidate models. Each one is a pipeline whose first step turns the text
# into bag-of-words counts and whose second step is a naive Bayes classifier.
mnb = Pipeline(steps=[
    ('cnt_vec', CountVectorizer()),
    ('mnb', MultinomialNB()),
])

bnb = Pipeline(steps=[
    ('cnt_vec', CountVectorizer()),
    ('bnb', BernoulliNB()),
])

# Shared 5-fold stratified splitter (shuffled, seeded) used by every CV run below
folds = StratifiedKFold(n_splits=5, random_state=52, shuffle=True)

In [None]:
# Mean cross-validated score of the multinomial naive Bayes pipeline
cv_mnb = cross_val_score(estimator=mnb, X=X_train, y=y_train, cv=folds)
cv_mnb.mean()

In [None]:
# Mean cross-validated score of the Bernoulli naive Bayes pipeline
cv_bnb = cross_val_score(estimator=bnb, X=X_train, y=y_train, cv=folds)
cv_bnb.mean()

In [None]:
# Hyperparameter tuning to find the best model and parameters.
# The outer one-step pipeline exists so that the whole inner pipeline
# (vectorizer + naive Bayes model) can be swapped as a single hyperparameter.
classifier = Pipeline([('classifier', mnb)])

# ngram_range is (min_n, max_n) and requires min_n <= max_n. (2, 1) is
# deliberately absent: it is not a valid range, so candidates using it can never
# be fitted and only add failed fits and warnings to the search.
ngram_ranges = [(1, 1), (1, 2), (2, 2)]

# Both candidate pipelines name their vectorizer step 'cnt_vec', so one grid
# covers every (pipeline, ngram_range) combination.
hyp = {'classifier': [mnb, bnb],
       'classifier__cnt_vec__ngram_range': ngram_ranges}

grid = GridSearchCV(estimator=classifier, param_grid=hyp, cv=folds, n_jobs=-1, scoring='accuracy',
                   verbose=3, return_train_score=True)
grid.fit(X_train, y_train)

In [None]:
# Winning pipeline configuration and its mean cross-validated accuracy
(grid.best_estimator_, grid.best_score_)

In [None]:
# Keep a handle on the best pipeline found by the grid search.
# NOTE(review): GridSearchCV above is created without a `refit` argument, so with
# scikit-learn's default (refit=True) this estimator should already be fitted on
# the whole of X_train -- confirm before relying on it.
model = grid.best_estimator_

In [None]:
# Fit the selected pipeline on the training split, predict the held-out test
# split and report the test accuracy as a percentage
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)
test_accuracy_pct = 100 * accuracy_score(y_test, y_test_pred)
print('Accuracy of test data =', test_accuracy_pct, '%')

In [None]:
# Confusion matrix of the test predictions (rows = actual label, columns =
# predicted label; 0 = true news, 1 = fake news).
conf = confusion_matrix(y_test, y_test_pred)

# fmt='d' prints the cell counts as plain integers; the default format would
# render counts of 100 or more in scientific notation (e.g. 1.4e+02).
sns.heatmap(conf, annot=True, fmt='d')
plt.xlabel('Predicted label')
plt.ylabel('Actual label')
plt.title('Confusion matrix (test data)')
plt.show()

In [None]:
# Predicted probability of the positive class (label 1 = fake news) for every
# test article: column 1 of predict_proba belongs to the second class label.
y_test_proba = model.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test, y_test_proba)
fpr, tpr, thresh = roc_curve(y_test, y_test_proba)

# AUC is a fraction in [0, 1], so it is shown as-is instead of being followed
# by a misleading '%' sign.
plt.plot(fpr, tpr, label='AUC Score = {:.2f}'.format(auc))
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.title('ROC curve (test data)')
plt.legend(loc='lower right')
plt.show()

In [None]:
# Most class-indicative words according to the fitted naive Bayes model.
# `model` wraps the winning inner pipeline under the step name 'classifier'.
best_pipeline = model['classifier']
vectorizer = best_pipeline['cnt_vec']
# The naive Bayes step is named 'mnb' or 'bnb' depending on which pipeline won
# the grid search, so take the last step instead of hard-coding one name.
nb_model = best_pipeline.steps[-1][1]

# get_feature_names() is gone from recent scikit-learn releases; prefer the
# newer accessor and fall back to the old one on older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# feature_log_prob_ has one row per class holding log P(word | class). The
# difference between the fake (label 1) row and the true (label 0) row is large
# and positive for words typical of fake news, and large and negative for words
# typical of true news. The column keeps the name 'coef' for compatibility.
class_labels = list(nb_model.classes_)
fake_row = class_labels.index(1)
true_row = class_labels.index(0)
log_prob_ratio = (nb_model.feature_log_prob_[fake_row]
                  - nb_model.feature_log_prob_[true_row])

imp = pd.DataFrame({'features': feature_names, 'coef': log_prob_ratio})

# Highest ratios -> fake-news words; lowest (most negative) -> true-news words
top20 = imp.sort_values('coef', ascending=False).iloc[:20, :]
down20 = imp.sort_values('coef').iloc[:20, :]

plt.figure(figsize=(20,8))
plt.bar(top20.features, top20.coef)
plt.title('Top 20 words in false news', fontsize=24)
plt.ylabel('log P(word | fake) - log P(word | true)')
plt.xticks(rotation=90)
plt.show()

plt.figure(figsize=(20,8))
plt.bar(down20.features, down20.coef)
plt.title('Top 20 words in true news', fontsize=24)
plt.ylabel('log P(word | fake) - log P(word | true)')
plt.xticks(rotation=90)
plt.show()