# Import Libraries

In [None]:
!pip install neattext
import pandas as pd
import numpy as np
import neattext as nt
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from nltk.stem import PorterStemmer
from functools import partial
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Load The Data

In [None]:
# Kaggle input locations of the two source CSVs: one holds the fake articles,
# the other the real ones.
fake_path = '/kaggle/input/fake-and-real-news-dataset/Fake.csv'
true_path = '/kaggle/input/fake-and-real-news-dataset/True.csv'

In [None]:
# Load both CSVs with pandas' default parsing options; the tuple order fixes
# which path feeds which frame.
trueDf, fakeDf = map(pd.read_csv, (true_path, fake_path))

In [None]:
# Show the column names of the frame loaded from True.csv.
trueDf.columns

In [None]:
# Show the column names of the frame loaded from Fake.csv (compare with the cell above).
fakeDf.columns

# Add Labels (1 - True, 0 - Fake)

In [None]:
# Tag every row with its target label: 1 for real articles, 0 for fake ones.
# Assigning a scalar broadcasts it to all rows of the frame.
trueDf['class'] = 1
fakeDf['class'] = 0

In [None]:
# Preview the first rows of the real-news frame, now including the 'class' column.
trueDf.head()

In [None]:
# Preview the first rows of the fake-news frame, now including the 'class' column.
fakeDf.head()

# Merge trueDf and fakeDf

In [None]:
# Stack the real and fake frames into a single labelled dataset.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each
# input keeps its own row labels, so the merged frame would contain every
# label twice and label-based lookups (e.g. df.loc[0]) would return two rows.
df = pd.concat([trueDf, fakeDf], ignore_index=True)

In [None]:
# The per-class frames are fully contained in df now; drop the names so the
# memory can be reclaimed.
del trueDf, fakeDf

In [None]:
# Class balance of the merged data (1 = real, 0 = fake) before duplicates are removed.
df['class'].value_counts()

In [None]:
# Count rows that are exact repeats of an earlier row across all columns.
df.duplicated().sum()

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [None]:
# Class balance again, this time after the duplicate rows have been dropped.
df['class'].value_counts()

# EDA

### Preprocessing the text data (both title and text)

In [None]:
def preProcess(txt):
    """Normalise one piece of raw article text for bag-of-words modelling.

    The text is lower-cased, then URLs, dates, punctuation, special
    characters, numbers and stopwords are removed with neattext helpers.

    Args:
        txt: The raw title or body text. Non-string values (for example the
            NaN that pandas uses for a missing cell) are treated as empty.

    Returns:
        The cleaned text, or an empty string when ``txt`` is not a string.
    """
    # A missing cell reaches us as a float NaN, which has no .lower().
    if not isinstance(txt, str):
        return ''

    txt = txt.lower()

    # URLs and dates must go first: they are recognised through characters
    # such as ':', '/', '.' and '-'. Once punctuation and special characters
    # are stripped, those patterns can no longer match and the leftovers stay
    # in the text as junk tokens.
    # NOTE(review): exact patterns are defined by neattext - confirm there.
    txt = nt.remove_urls(txt)
    txt = nt.remove_dates(txt)

    txt = nt.remove_punctuations(txt)
    txt = nt.remove_special_characters(txt)
    txt = nt.remove_numbers(txt)
    txt = nt.remove_stopwords(txt)
    return txt

In [None]:
# Run the same cleaning routine over the headline and the body, element by
# element, and keep the results next to the raw columns.
df['preProcessTitle'] = df['title'].map(preProcess)
df['preProcessText'] = df['text'].map(preProcess)

### Creating wordcloud for 'text'

In [None]:
# Join every cleaned article body into one space-separated corpus string.
textTxt = ' '.join(df['preProcessText'].values)

# Build an 800x800 word cloud on a black background from that corpus.
wordcloudTxt = WordCloud(
    width=800,
    height=800,
    background_color='black',
    min_font_size=10,
).generate(textTxt)

# Display the cloud as a plain image: no axes, no padding around the figure.
plt.figure(figsize=(10, 8), facecolor=None)
plt.imshow(wordcloudTxt)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### Top 10 words in 'text'

In [None]:
# words_ maps each token of the body-text cloud to its frequency value.
# NOTE(review): WordCloud appears to store relative (normalised) frequencies,
# most frequent first, rather than raw counts - confirm against its docs.
words = list(wordcloudTxt.words_)
freq = list(wordcloudTxt.words_.values())

# Keep the first ten entries and plot them as a bar chart.
txtWords = pd.DataFrame({'words': words[:10], 'freq': freq[:10]})
plt.figure(figsize=(15, 8))
sns.barplot(x='words', y='freq', data=txtWords)

### Creating wordcloud for 'title'

In [None]:
# Join every cleaned headline into one space-separated corpus string.
titleTxt = ' '.join(df['preProcessTitle'].values)

# Build an 800x800 word cloud on a black background from the headlines.
wordcloudTitle = WordCloud(
    width=800,
    height=800,
    background_color='black',
    min_font_size=10,
).generate(titleTxt)

# Display the cloud as a plain image: no axes, no padding around the figure.
plt.figure(figsize=(10, 8), facecolor=None)
plt.imshow(wordcloudTitle)
plt.axis("off")
plt.tight_layout(pad=0)
plt.show()

### Top 10 words in 'title'

In [None]:
# words_ maps each token of the headline cloud to its frequency value.
# NOTE(review): WordCloud appears to store relative (normalised) frequencies,
# most frequent first, rather than raw counts - confirm against its docs.
words = list(wordcloudTitle.words_)
freq = list(wordcloudTitle.words_.values())

# Keep the first ten entries and plot them as a bar chart.
titleWords = pd.DataFrame({'words': words[:10], 'freq': freq[:10]})
plt.figure(figsize=(15, 8))
sns.barplot(x='words', y='freq', data=titleWords)

### True vs Fake news

In [None]:
# Bar chart of how many rows carry each label (1 = real, 0 = fake).
plt.figure(figsize=(10, 8))
sns.countplot(x='class', data=df);

### Subject w.r.t the class (True vs Fake news)

In [None]:
# Rows per subject, with each subject's bar split by label, to show how the
# subjects are distributed across real and fake news.
plt.figure(figsize=(10, 8))
sns.countplot(x='subject', hue='class', data=df);

# Stemming the text data

In [None]:
# One shared stemmer instance, reused for every token of every row.
ps = PorterStemmer()


def stem(x):
    """Return ``x`` with each whitespace-separated token replaced by its stem.

    Args:
        x: A string of tokens separated by whitespace.

    Returns:
        The stemmed tokens joined by single spaces; an empty string when
        ``x`` contains no tokens.
    """
    # str.join consumes the generator directly, so no intermediate list is built.
    return ' '.join(ps.stem(token) for token in x.split())

In [None]:
# Stem the cleaned body and headline, element by element, into new columns.
df['preProcessTextStem'] = df['preProcessText'].map(stem)
df['preProcessTitleStem'] = df['preProcessTitle'].map(stem)

In [None]:
# Preview the frame with the raw, cleaned and stemmed columns side by side.
df.head()

# Splitting the Data into Train - Test (80-20)

In [None]:
# Model input is the stemmed article body; the target is the 0/1 label column.
X = df['preProcessTextStem']
Y = df['class']

In [None]:
# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

#### Deleting dataframes that are no longer needed

In [None]:
# Only the train/test splits are used from here on; drop the full frame and
# the unsplit feature/target names.
del df, X, Y

# Comparing various Classification Models using Cross Validation

### TF-IDF with LogisticRegression, BernoulliNB and RandomForest

In [None]:
# Candidate: TF-IDF features into logistic regression (default settings),
# scored with 5-fold cross-validation on the training split only.
pipLogR = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('logR', LogisticRegression()),
])
score_logR = cross_val_score(pipLogR, X_train, y_train, cv=5)
print('Mean Score: ', score_logR.mean())

In [None]:
# Candidate: TF-IDF features into Bernoulli naive Bayes, scored with 5-fold
# cross-validation on the training split only.
# NOTE(review): BernoulliNB binarises its input by default, so the TF-IDF
# weights are presumably reduced to present/absent here - confirm whether a
# different naive Bayes variant was intended.
pipNb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('Nb', BernoulliNB()),
])
score_Nb = cross_val_score(pipNb, X_train, y_train, cv=5)
print('Mean Score: ', score_Nb.mean())

In [None]:
# Candidate: TF-IDF features into a random forest, scored with 5-fold
# cross-validation on the training split only.
# The forest is seeded (same seed as the train/test split) so this score is
# reproducible; unseeded, it changes on every run and the model comparison
# cannot be repeated.
pipRf = Pipeline([('tfidf', TfidfVectorizer()), ('Rf', RandomForestClassifier(random_state=42))])
score_Rf = cross_val_score(estimator=pipRf, X=X_train, y=y_train, cv=5)
print('Mean Score: ', score_Rf.mean())

### CountVectorizer with LogisticRegression, BernoulliNB and RandomForest

In [None]:
# Candidate: raw token counts into logistic regression, scored with 5-fold
# cross-validation on the training split only.
# max_iter is raised to 500 because the solver did not converge within the
# default 100 iterations on these features.
pipLogR_cv = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('logR', LogisticRegression(max_iter=500)),
])
score_logR_cv = cross_val_score(pipLogR_cv, X_train, y_train, cv=5)
print('Mean Score: ', score_logR_cv.mean())

In [None]:
# Candidate: raw token counts into Bernoulli naive Bayes, scored with 5-fold
# cross-validation on the training split only.
pipNb_cv = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('Nb', BernoulliNB()),
])
score_Nb_cv = cross_val_score(pipNb_cv, X_train, y_train, cv=5)
print('Mean Score: ', score_Nb_cv.mean())

In [None]:
# Candidate: raw token counts into a random forest, scored with 5-fold
# cross-validation on the training split only.
# The forest is seeded (same seed as the train/test split) so this score is
# reproducible; unseeded, it changes on every run and the model comparison
# cannot be repeated.
pipRf_cv = Pipeline([('cv', CountVectorizer()), ('Rf', RandomForestClassifier(random_state=42))])
score_Rf_cv = cross_val_score(estimator=pipRf_cv, X=X_train, y=y_train, cv=5)
print('Mean Score: ', score_Rf_cv.mean())

> Combination of CountVectorizer and LogisticRegression outperformed the others!

# Creating The Final Pipeline

In [None]:
# Rebuild the chosen combination (token counts + logistic regression, with the
# raised iteration cap) and fit it on the whole training split.
pipe = Pipeline(steps=[
    ('cv', CountVectorizer()),
    ('logR', LogisticRegression(max_iter=500)),
])
pipe.fit(X_train, y_train)

# Predicting the unseen test data

In [None]:
# Score the fitted pipeline on the held-out 20% split.
# NOTE(review): for a classifier pipeline this is presumably mean accuracy - confirm.
pipe.score(X_test, y_test)

In [None]:
# Predict labels for the held-out rows and draw the confusion matrix as a
# heatmap; fmt='d' prints each cell's count as an integer.
pred_y = pipe.predict(X_test)
plt.figure(figsize=(10, 8))
sns.heatmap(data=confusion_matrix(y_true=y_test, y_pred=pred_y), annot=True, fmt='d');

In [None]:
# Print the per-class text report for the held-out predictions.
print(classification_report(y_test, pred_y))