## Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re

import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, f1_score, classification_report, accuracy_score

## Loading the dataset

In [None]:
# Read the real-news file first and the fake-news file second.
true, fake = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fake-and-real-news-dataset/True.csv',
        '../input/fake-and-real-news-dataset/Fake.csv',
    )
)

In [None]:
# First three rows of the real-news frame
true.head(n=3)

In [None]:
# First three rows of the fake-news frame
fake.head(n=3)

In [None]:
# Tag every row with its class: 0 for the real-news frame, 1 for the fake-news frame
for frame, class_id in ((true, 0), (fake, 1)):
    frame['label'] = class_id

# Concatenating the datasets (the combined frame gets a fresh 0..n-1 index)
frames = [true, fake]
df = pd.concat(frames, ignore_index=True)

In [None]:
# Display the concatenated frame of true and fake articles
df

<b> The dataset has 44,898 records and 5 columns. </b>

In [None]:
# Print pandas' summary of the combined frame
df.info()

<b>Four columns are of object datatype and one column is integer.</b>

In [None]:
# Checking if any duplicate records are present

# Boolean mask from DataFrame.duplicated(), used to select the flagged rows
is_repeat = df.duplicated()
duplicate = df.loc[is_repeat]
duplicate

<b> There are 209 duplicate records in the dataset. </b>

In [None]:
# Removing duplicate records

# The first occurrence of each record is kept; `df` itself is modified
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Again check if any duplicate records are left

# An empty result here means the clean-up above removed every repeat
still_repeated = df.duplicated()
duplicate = df.loc[still_repeated]
duplicate

<b>Hence, all duplicate records have been removed.</b>

In [None]:
# Summary statistics restricted to the object-dtype columns
df.describe(include=['object'])

In [None]:
# Checking for null values

# Per-column count of missing entries
missing_per_column = df.isna().sum()
missing_per_column

<b>The dataset doesn't have any missing values.</b>

In [None]:
# Visualizing the distribution of true and fake news

# Name the column via `x=` together with `data=`: recent seaborn releases no
# longer accept the plotted variable as a bare positional argument.
sns.countplot(x='label', data=df)

<b> The number of fake news articles is slightly higher than the number of true news articles. </b>

## Text Preprocessing

In [None]:
# Expanding contractions

# Dictionary of English Contractions
contractions_dict = { "ain't": "are not","'s":" is","aren't": "are not",
                     "can't": "cannot","can't've": "cannot have",
                     "'cause": "because","could've": "could have","couldn't": "could not",
                     "couldn't've": "could not have", "didn't": "did not","doesn't": "does not",
                     "don't": "do not","hadn't": "had not","hadn't've": "had not have",
                     "hasn't": "has not","haven't": "have not","he'd": "he would",
                     "he'd've": "he would have","he'll": "he will", "he'll've": "he will have",
                     "how'd": "how did","how'd'y": "how do you","how'll": "how will",
                     "I'd": "I would", "I'd've": "I would have","I'll": "I will",
                     "I'll've": "I will have","I'm": "I am","I've": "I have", "isn't": "is not",
                     "it'd": "it would","it'd've": "it would have","it'll": "it will",
                     "it'll've": "it will have", "let's": "let us","ma'am": "madam",
                     "mayn't": "may not","might've": "might have","mightn't": "might not",
                     "mightn't've": "might not have","must've": "must have","mustn't": "must not",
                     "mustn't've": "must not have", "needn't": "need not",
                     "needn't've": "need not have","o'clock": "of the clock","oughtn't": "ought not",
                     "oughtn't've": "ought not have","shan't": "shall not","sha'n't": "shall not",
                     "shan't've": "shall not have","she'd": "she would","she'd've": "she would have",
                     "she'll": "she will", "she'll've": "she will have","should've": "should have",
                     "shouldn't": "should not", "shouldn't've": "should not have","so've": "so have",
                     "that'd": "that would","that'd've": "that would have", "there'd": "there would",
                     "there'd've": "there would have", "they'd": "they would",
                     "they'd've": "they would have","they'll": "they will",
                     "they'll've": "they will have", "they're": "they are","they've": "they have",
                     "to've": "to have","wasn't": "was not","we'd": "we would",
                     "we'd've": "we would have","we'll": "we will","we'll've": "we will have",
                     "we're": "we are","we've": "we have", "weren't": "were not","what'll": "what will",
                     "what'll've": "what will have","what're": "what are", "what've": "what have",
                     "when've": "when have","where'd": "where did", "where've": "where have",
                     "who'll": "who will","who'll've": "who will have","who've": "who have",
                     "why've": "why have","will've": "will have","won't": "will not",
                     "won't've": "will not have", "would've": "would have","wouldn't": "would not",
                     "wouldn't've": "would not have","y'all": "you all", "y'all'd": "you all would",
                     "y'all'd've": "you all would have","y'all're": "you all are",
                     "y'all've": "you all have", "you'd": "you would","you'd've": "you would have",
                     "you'll": "you will","you'll've": "you will have", "you're": "you are",
                     "you've": "you have"}

# Regular expression for finding contractions.
# A regex alternation takes the first alternative that matches, so the keys are
# ordered longest-first; otherwise "he'd" would win over "he'd've" and leave a
# dangling "'ve" behind. Keys are escaped so they are always matched literally.
_contraction_keys = sorted(contractions_dict, key=len, reverse=True)
contractions_re = re.compile('(%s)' % '|'.join(re.escape(key) for key in _contraction_keys))

# Function for expanding contractions
def expand_contractions(text,contractions_dict=contractions_dict):
    """Return `text` with every known contraction replaced by its expansion.

    Parameters
    ----------
    text : str
        Input string. Matching is case-sensitive, exactly as the keys are
        spelled in the dictionary.
    contractions_dict : dict
        Mapping from contraction to expansion used to look up each match.
        The search pattern is the module-level `contractions_re`, which is
        compiled from the default dictionary.

    Returns
    -------
    str
        The text with contractions expanded; text without any contraction is
        returned unchanged.
    """
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

# Expanding Contractions in the title, text
for column in ('title', 'text'):
    df[column] = df[column].apply(expand_contractions)

In [None]:
# Converting text to lowercase

# Same transformation for both free-text columns
for column in ('title', 'text'):
    df[column] = df[column].apply(str.lower)

In [None]:
# Removing digits and words containing digits

# Raw string literal: `\w` and `\d` are regex character classes, not Python
# string escapes, so a non-raw literal triggers an invalid-escape warning.
# The pattern matches any run of word characters that contains a digit.
digit_word_re = re.compile(r'\w*\d\w*')

for column in ('title', 'text'):
    df[column] = df[column].apply(lambda x: digit_word_re.sub('', x))

In [None]:
# Removing punctuations

# Every character that is neither a word character nor whitespace is deleted
non_word_re = re.compile(r'[^\w\s]')

for column in ('title', 'text'):
    df[column] = df[column].apply(lambda x: non_word_re.sub('', x))

In [None]:
# Removing extra spaces

# Each run of one or more space characters is replaced by a single space
space_run_re = re.compile(' +')

for column in ('title', 'text'):
    df[column] = df[column].apply(lambda x: space_run_re.sub(' ', x))

In [None]:
# Applying lemmatization

w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

# Build the stop-word collection once. Calling stopwords.words('english')
# inside the comprehension re-evaluated it for every single token, and a set
# gives constant-time membership tests instead of a list scan.
stop_words = frozenset(stopwords.words('english'))

def lemmatize_text(text):
    """Drop English stop words from `text` and lemmatize the remaining tokens.

    The text is split on whitespace, tokens found in the NLTK English
    stop-word list are discarded, every remaining token is passed through the
    WordNet lemmatizer, and the results are re-joined with single spaces.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The lemmatized, stop-word-free text.
    """
    kept = [lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text) if w not in stop_words]
    return ' '.join(kept)

# Run the lemmatization helper over both free-text columns
for column in ('title', 'text'):
    df[column] = df[column].apply(lemmatize_text)

In [None]:
# Displaying title, text after cleaning

# First three cleaned titles, numbered from 1
print("Title\n")
for position, cleaned_title in enumerate(df['title'][0:3], start=1):
    print('Title %d:\n' % position, cleaned_title)

# First three cleaned article bodies, numbered from 1
print("\nText\n")
for position, cleaned_text in enumerate(df['text'][0:3], start=1):
    print('Text %d:\n' % position, cleaned_text)

In [None]:
# Wordcloud of title, text in True news

# Cleaned dataframe of True labels
df_true = df[df.label == 0]

# One long space-separated string per column, as input for the word clouds
title_true = " ".join(df_true['title'])
text_true = " ".join(df_true['text'])

title_cloud = WordCloud(collocations=False, background_color='black').generate(title_true)
text_cloud = WordCloud(collocations=False, background_color='black').generate(text_true)

# Two panels side by side on one large figure
fig, (title_ax, text_ax) = plt.subplots(1, 2, figsize=(40, 30))

# Title
title_ax.axis("off")
title_ax.set_title("Title", fontsize=40)
title_ax.imshow(title_cloud, interpolation='bilinear')

# Text
text_ax.axis("off")
text_ax.set_title("Text", fontsize=40)
text_ax.imshow(text_cloud, interpolation='bilinear')

<b> Common words in title:- </b>trump, korea, republican, house, russia, say, new, leader, white, senate, etc.


<b> Common words in text:- </b>trump, state, republican, president, said, reuters, party, official, country, people, etc.

In [None]:
# Wordcloud of title, text in Fake news

# Cleaned dataframe of Fake labels
df_fake = df[df.label == 1]

# One long space-separated string per column, as input for the word clouds
title_fake = " ".join(df_fake['title'])
text_fake = " ".join(df_fake['text'])

title_cloud = WordCloud(collocations=False, background_color='black').generate(title_fake)
text_cloud = WordCloud(collocations=False, background_color='black').generate(text_fake)

# Two panels side by side on one large figure
fig, (title_ax, text_ax) = plt.subplots(1, 2, figsize=(40, 30))

# Title
title_ax.axis("off")
title_ax.set_title("Title", fontsize=40)
title_ax.imshow(title_cloud, interpolation='bilinear')

# Text
text_ax.axis("off")
text_ax.set_title("Text", fontsize=40)
text_ax.imshow(text_cloud, interpolation='bilinear')

<b> Common words in title:- </b>trump, video, watch, clinton, obama, tweet, president, woman, muslim, democrat, etc.


<b> Common words in text:- </b>trump, people, said, president, new, obama, state, clinton, time, one, etc.

In [None]:
# Subject-wise distribution of news

# Name the column via `x=` together with `data=`: recent seaborn releases no
# longer accept the plotted variable as a bare positional argument.
sns.countplot(x='subject', data=df)
# Subject names are long, so the tick labels are turned vertical
plt.xticks(rotation=90)

<b> Observations:- </b>

<ul>
    <li>Most of the news in the dataset is politicsNews.</li>
    <li>It is followed by worldNews, News and politics.</li>
    <li>Government News, US_News and Middle-east have fewer than 2,000 records.</li>
</ul>

## Model

In [None]:
# TF-IDF features over 1- to 4-grams, capped at 3000 features
tf = TfidfVectorizer(max_features=3000, ngram_range=(1,4))

# Keep the sparse matrix returned by fit_transform instead of calling
# .toarray(): a dense documents x 3000 float array costs a lot of memory for no
# benefit, and the train/test split and MultinomialNB both accept sparse input.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so test documents influence the vocabulary and IDF
# weights; consider fitting on the training split only.
X = tf.fit_transform(df['text'])
X

In [None]:
# Target vector: the label column (0 was assigned to the true-news frame, 1 to the fake-news frame)
y = df['label']

In [None]:
# Splitting the dataset into train and test

# 20% of the rows are held out; the fixed random_state makes the split repeatable
splits = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = splits

# Report the shape of each part in the order returned by the splitter
for tag, part in zip(("X_train:", "X_test:", "y_train:", "y_test:"), splits):
    print(tag, part.shape)

In [None]:
# Training the model using Naive Bayes classifier

nb = MultinomialNB()
# The value returned by fit() is what the rest of the notebook uses as the model
nb = nb.fit(X_train, y_train)

In [None]:
# Score the fitted classifier on the data it was trained on, then on the held-out data
train_score = nb.score(X_train, y_train)
test_score = nb.score(X_test, y_test)

print("Score of train data:", train_score)
print("Score of test data:", test_score)

<b> The model performs well on train as well as test data. </b>

In [None]:
# Predictions of the fitted classifier for the held-out test features
y_pred = nb.predict(X_test)
y_pred

In [None]:
# F1 score and accuracy

# Store the result under its own name. Assigning it to `f1_score` would
# overwrite the imported scikit-learn function, so re-running this cell (or
# calling f1_score again anywhere later) would fail with a TypeError.
weighted_f1 = f1_score(y_test, y_pred, average='weighted')
accuracy = accuracy_score(y_test, y_pred)

print("F1 Score:", weighted_f1)
print("Accuracy Score:", accuracy)

In [None]:
# Classification Report

# Build the text report first, then print it
report_text = classification_report(y_test, y_pred)
print(report_text)

In [None]:
cm = confusion_matrix(y_test, y_pred)

# Cell captions, listed in the same row-major order as cm.flatten()
group_names = ['True Neg', 'False Pos', 'False Neg', 'True Pos']

# Raw count and share of the grand total for every cell
cell_counts = cm.flatten()
group_counts = list(map("{0:0.0f}".format, cell_counts))
group_percentages = list(map("{0:.2%}".format, cell_counts/np.sum(cm)))

# Three-line annotation per cell: caption, count, percentage
labels = []
for v1, v2, v3 in zip(group_names, group_counts, group_percentages):
    labels.append(f"{v1}\n{v2}\n{v3}")
labels = np.asarray(labels).reshape(2,2)

# fmt='' because the annotations are already formatted strings
sns.heatmap(cm, annot=labels, fmt='', cmap='PuRd')