In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

!pip install neattext

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import re
import neattext as nt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the Kaggle input directory.
for path in (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
):
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the real-news and fake-news CSV files into two separate frames.
trueNews, fakeNews = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/fake-and-real-news-dataset/True.csv',
        '/kaggle/input/fake-and-real-news-dataset/Fake.csv',
    )
)

In [None]:
# Display the column index of both raw frames as a tuple.
trueNews.columns, fakeNews.columns

In [None]:
# Tag every row with the name of the file it came from; a scalar is
# broadcast to all rows of the frame.
trueNews['label'] = 'True'
fakeNews['label'] = 'Fake'

In [None]:
# First rows of the real-news frame (now including the 'label' column).
trueNews.head()

In [None]:
# Last rows of the real-news frame.
trueNews.tail()

In [None]:
# First rows of the fake-news frame (now including the 'label' column).
fakeNews.head()

In [None]:
# Last rows of the fake-news frame.
fakeNews.tail()

In [None]:
# Stack real news on top of fake news and renumber the rows 0..n-1 in one step.
df = pd.concat([trueNews, fakeNews], ignore_index=True)

In [None]:
# First rows of the combined frame.
df.head()

In [None]:
# Last rows of the combined frame.
df.tail()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each fully duplicated row.
# The index is not renumbered here, so gaps remain where rows were dropped.
df = df.drop_duplicates()

In [None]:
# Character counts of the raw title and of the raw article body.
df['titleLength'] = df['title'].map(len)
df['textLength'] = df['text'].map(len)

In [None]:
# Class balance: number of articles per label.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x='label', data=df, ax=ax);

In [None]:
# Subject distribution among the articles labelled 'True'.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x='subject', data=df[df['label'] == 'True'], ax=ax);

In [None]:
# Subject distribution among the articles labelled 'Fake'.
fig, ax = plt.subplots(figsize=(10, 8))
sns.countplot(x='subject', data=df[df['label'] == 'Fake'], ax=ax);

In [None]:
# Clean the titles: lowercase, turn '/' into a space, then run the neattext
# cleaners one after another in a single chained expression.
# NOTE(review): special characters are stripped before dates, emojis and URLs
# are removed, so those later steps may find little left to match - confirm
# that this ordering is intended.
df['preProcessedTitle'] = (
    df['title']
    .apply(str.lower)
    .apply(lambda s: s.replace('/', ' '))
    .apply(nt.remove_special_characters)
    .apply(nt.remove_dates)
    .apply(nt.remove_numbers)
    .apply(nt.remove_stopwords)
    .apply(nt.remove_punctuations)
    .apply(nt.remove_emojis)
    .apply(nt.remove_urls)
)

In [None]:
# Clean the article bodies with the same sequence of steps used for titles:
# lowercase, turn '/' into a space, then the neattext cleaners in order.
# NOTE(review): special characters are stripped before dates, emojis and URLs
# are removed, so those later steps may find little left to match - confirm
# that this ordering is intended.
df['preProcessedText'] = (
    df['text']
    .apply(str.lower)
    .apply(lambda s: s.replace('/', ' '))
    .apply(nt.remove_special_characters)
    .apply(nt.remove_dates)
    .apply(nt.remove_numbers)
    .apply(nt.remove_stopwords)
    .apply(nt.remove_punctuations)
    .apply(nt.remove_emojis)
    .apply(nt.remove_urls)
)

In [None]:
# Inspect the frame after the cleaned title/text columns were added.
df.head()

In [None]:
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
# Single stemmer instance shared by the stemming cells below.
ps = PorterStemmer()

In [None]:
# Stem every whitespace-separated token and glue the stems back together
# with single spaces, for both the cleaned titles and the cleaned bodies.
df['preProcessedTitle'] = df['preProcessedTitle'].apply(
    lambda doc: ' '.join(ps.stem(tok) for tok in doc.split())
)
df['preProcessedText'] = df['preProcessedText'].apply(
    lambda doc: ' '.join(ps.stem(tok) for tok in doc.split())
)

In [None]:
# Inspect the frame after stemming.
df.head()

In [None]:
# Number of rows left after de-duplication.
len(df)

In [None]:
# Binary target: 1 for label 'True', 0 for label 'Fake'.
# The previous get_dummies(..., drop_first=True) call produced the same
# indicator (it drops 'Fake' and keeps the 'True' column) but returned a
# one-column DataFrame whose dtype depends on the pandas version (uint8 or
# bool). An explicit comparison yields a plain integer Series everywhere.
df['class'] = (df['label'] == 'True').astype(int)

In [None]:
# Hold out 20% of the rows for testing. The seed is fixed (same value as the
# later split on the combined column) so that Restart & Run All reproduces
# the same partition and therefore the same scores.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
# (rows, columns) of the training and test partitions.
train.shape, test.shape

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# TFIDF + MultinomialNB with 'preProcessedTitle' as feature

In [None]:
# TF-IDF features feeding a multinomial Naive Bayes classifier.
pipeTfidf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('nb', MultinomialNB()),
])
# Search over unigrams, uni+bigrams and uni+bi+trigrams.
param_grid_tfidf = {'tfidf__ngram_range': [(1, n) for n in (1, 2, 3)]}

In [None]:
# Title features and binary targets for the train and test partitions.
x_trainTitle, x_testTitle = train['preProcessedTitle'], test['preProcessedTitle']
y_train, y_test = train['class'], test['class']

In [None]:
# Grid-search the n-gram range of the TF-IDF pipeline on the titles.
TfidfSearch = GridSearchCV(estimator=pipeTfidf, param_grid=param_grid_tfidf)
TfidfSearch.fit(x_trainTitle, y_train)

In [None]:
# Report the best grid-search score (as a percentage) and the parameters
# that achieved it.
# NOTE: per the scikit-learn docs, GridSearchCV.best_score_ is the mean
# cross-validated score of the best estimator, not a training-set score.
print(f"Best Training Score (tfidf): {TfidfSearch.best_score_*100} %")
print(f"Best Parameters (tfidf): {TfidfSearch.best_params_}")

In [None]:
# Predict on the held-out titles and draw the confusion matrix as a heatmap.
tfidfPredicts = TfidfSearch.predict(x_testTitle)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(confusion_matrix(y_test, tfidfPredicts), annot=True, fmt='d', ax=ax)

In [None]:
# Accuracy of the TF-IDF title model on the held-out set, as a percentage.
print(f"Test Score: {accuracy_score(y_test, tfidfPredicts)*100} %")

# CountVectorizer + MultinomialNB with 'preProcessedTitle' as feature

In [None]:
# Raw token counts feeding a multinomial Naive Bayes classifier.
pipeCountV = Pipeline([
    ('cv', CountVectorizer()),
    ('nb', MultinomialNB()),
])
# Search over unigrams, uni+bigrams and uni+bi+trigrams.
param_grid_cv = {'cv__ngram_range': [(1, n) for n in (1, 2, 3)]}

In [None]:
# Grid-search the n-gram range of the CountVectorizer pipeline on the titles.
CvSearch = GridSearchCV(estimator=pipeCountV, param_grid=param_grid_cv)
CvSearch.fit(x_trainTitle, y_train)

In [None]:
# Best grid-search score (as a percentage) and winning parameters for the
# CountVectorizer pipeline on titles.
print(f"Best Score (CV): {CvSearch.best_score_*100} %")
print(f"Best Parameters (CV): {CvSearch.best_params_}")

In [None]:
# Predict on the held-out titles with the CountVectorizer model and draw its
# confusion matrix. The matrix must be built from CvPredicts: the earlier
# version plotted tfidfPredicts, i.e. the previous model's predictions.
CvPredicts = CvSearch.predict(x_testTitle)
plt.figure(figsize=(10,8))
sns.heatmap(confusion_matrix(y_test, CvPredicts), annot=True, fmt='d')

In [None]:
# Accuracy of the CountVectorizer title model on the held-out set, as a percentage.
print(f"Test Score: {accuracy_score(y_test, CvPredicts)*100} %")

# TFIDF + MultinomialNB with 'preProcessedText' as feature

In [None]:
# Body-text features and binary targets for the train and test partitions.
x_trainText, x_testText = train['preProcessedText'], test['preProcessedText']
y_train, y_test = train['class'], test['class']

In [None]:
# Re-fit the existing TfidfSearch object on the article bodies; this replaces
# the results of the earlier fit on titles held by the same object.
TfidfSearch.fit(x_trainText, y_train)
# Best grid-search score (as a percentage) and the parameters that achieved it.
print(f"Best Training Score (tfidf): {TfidfSearch.best_score_*100} %")
print(f"Best Parameters (tfidf): {TfidfSearch.best_params_}")

In [None]:
# Predict on the held-out article bodies and draw the confusion matrix.
tfidfPredicts = TfidfSearch.predict(x_testText)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(confusion_matrix(y_test, tfidfPredicts), annot=True, fmt='d', ax=ax)

In [None]:
# Accuracy of the TF-IDF body-text model on the held-out set, as a percentage.
print(f"Test Score: {accuracy_score(y_test, tfidfPredicts)*100} %")

# CountVectorizer + MultinomialNB with 'preProcessedText' as feature

In [None]:
# Fresh grid search of the CountVectorizer pipeline, this time on article bodies.
CvSearch = GridSearchCV(estimator=pipeCountV, param_grid=param_grid_cv)
CvSearch.fit(x_trainText, y_train)
# Best grid-search score (as a percentage) and winning parameters.
print(f"Best Score (CV): {CvSearch.best_score_*100} %")
print(f"Best Parameters (CV): {CvSearch.best_params_}")

In [None]:
# Predict on the held-out article bodies and draw the confusion matrix.
CvPredicts = CvSearch.predict(x_testText)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(confusion_matrix(y_test, CvPredicts), annot=True, fmt='d', ax=ax)

In [None]:
# Accuracy of the CountVectorizer body-text model on the held-out set, as a percentage.
print(f"Test Score: {accuracy_score(y_test, CvPredicts)*100} %")

# CountVectorizer + MultinomialNB with 'preProcessedText' and 'preProcessedTitle' as feature

In [None]:
# Join the raw title and the raw body, separated by one space, into a single field.
df['combined'] = df['title']+' '+df['text']

In [None]:
# Clean the combined title+text field with the same steps used for the
# separate columns. The chain starts from df['combined'] itself: the earlier
# version restarted from df['text'], which silently discarded the titles and
# made this experiment identical to the text-only one.
df['combined'] = (
    df['combined']
    .apply(str.lower)
    .apply(lambda s: s.replace('/', ' '))
    .apply(nt.remove_special_characters)
    .apply(nt.remove_dates)
    .apply(nt.remove_numbers)
    .apply(nt.remove_stopwords)
    .apply(nt.remove_punctuations)
    .apply(nt.remove_emojis)
    .apply(nt.remove_urls)
)

In [None]:
# Stem every whitespace-separated token of the combined field and re-join
# the stems with single spaces.
df['combined'] = df['combined'].apply(
    lambda doc: ' '.join(ps.stem(tok) for tok in doc.split())
)

In [None]:
# Seeded 80/20 split of the combined feature and the binary target.
x_train, x_test, y_train, y_test = train_test_split(
    df['combined'],
    df['class'],
    random_state=42,
    test_size=0.2,
)

In [None]:
# Fresh grid search of the CountVectorizer pipeline on the combined field.
CvSearch = GridSearchCV(estimator=pipeCountV, param_grid=param_grid_cv)
CvSearch.fit(x_train, y_train)
# Best grid-search score (as a percentage) and winning parameters.
print(f"Best Score (CV): {CvSearch.best_score_*100} %")
print(f"Best Parameters (CV): {CvSearch.best_params_}")

In [None]:
# Predict on the held-out combined documents and draw the confusion matrix.
CvPredicts = CvSearch.predict(x_test)
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(confusion_matrix(y_test, CvPredicts), annot=True, fmt='d', ax=ax)

In [None]:
# Accuracy of the CountVectorizer model on the held-out combined documents, as a percentage.
print(f"Test Score: {accuracy_score(y_test, CvPredicts)*100} %")