# Import the libraries

In [None]:
!pip install bs4

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import nltk
from wordcloud import WordCloud, STOPWORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from string import punctuation
import string
import re
from bs4 import BeautifulSoup


import warnings
warnings.filterwarnings('ignore')

## Read the CSV files

In [None]:
fake = pd.read_csv("../input/fake-and-real-news-dataset/Fake.csv")
true = pd.read_csv("../input/fake-and-real-news-dataset/True.csv")

## How many rows and columns do the datasets have?

In [None]:
fake.shape, true.shape

## What do the datasets look like?

In [None]:
fake.head()

In [None]:
true.head()

## Add a target column that labels every row as Fake or True

In [None]:
fake['target'] = 'Fake'
true['target'] = 'True'

In [None]:
fake.head()

In [None]:
true.head()

## Concatenating the fake and true dataframes

In [None]:
merge_fake_real = pd.concat([fake, true], axis = 0, ignore_index = True)

In [None]:
merge_fake_real

In [None]:
sns.countplot(x = 'target', data = merge_fake_real)

## The dataset looks balanced.

# Data Cleaning

## Strip Html

In [None]:
def strip_html(text):
    """Return *text* with its HTML markup removed.

    The input is parsed with BeautifulSoup using the "html.parser" backend
    and only the text content of the parsed document is returned.
    """
    return BeautifulSoup(text, "html.parser").get_text()

## Remove Square Brackets

In [None]:
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from *text*.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each "[" followed by any run of non-"]" characters and
        a closing "]" removed. Surrounding text, including whitespace, is
        left untouched.
    """
    # Raw string: in a plain literal "\[" and "\]" are invalid escape
    # sequences that newer Python versions warn about. The pattern that
    # reaches the regex engine is unchanged.
    return re.sub(r'\[[^]]*\]', '', text)

## Remove Urls

In [None]:
def remove_urls(text):
    """Remove URL-like tokens from *text*.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with every occurrence of the literal "http" followed by one
        or more non-whitespace characters removed. The whitespace around a
        removed token is kept.
    """
    # Raw string: "\S" in a plain literal is an invalid escape sequence that
    # newer Python versions warn about. The pattern itself is unchanged.
    return re.sub(r'http\S+', '', text)

## Remove Stopwords

In [None]:
# Tokens to discard during cleaning: NLTK's English stop words plus every
# character in string.punctuation.
# NOTE: binding `punctuation` here replaces the name brought in earlier by
# `from string import punctuation` with a list of single characters.
punctuation = list(string.punctuation)
stop = set(stopwords.words('english'))
stop |= set(punctuation)

def remove_stopwords(text):
    """Drop stop words and stand-alone punctuation tokens from *text*.

    The text is split on whitespace. A token is kept when its lower-cased
    form is not in the module-level ``stop`` set. Kept tokens retain their
    original casing and are joined back together with single spaces.
    """
    # str.split() with no arguments already yields tokens without
    # surrounding whitespace, so no further stripping is required.
    kept = [token for token in text.split() if token.lower() not in stop]
    return " ".join(kept)

## Data Cleaning all together

In [None]:
def denoise_text(text):
    """Run the complete cleaning pipeline on a single document.

    The steps are applied in this fixed order, each one receiving the output
    of the previous one: HTML stripping, removal of ``[...]`` spans, URL
    removal, stop-word removal.
    """
    cleaning_steps = (
        strip_html,
        remove_between_square_brackets,
        remove_urls,
        remove_stopwords,
    )
    for step in cleaning_steps:
        text = step(text)
    return text

In [None]:
merge_fake_real['text'] = merge_fake_real['text'].apply(denoise_text)

In [None]:
merge_fake_real

In [None]:
sns.set_theme(style = 'darkgrid')
plt.figure(figsize = (12, 6))
sns.countplot(x = "subject", data = merge_fake_real)

In [None]:
# Article counts per subject, split by the Fake/True label.
sns.set_theme(style = 'darkgrid')
plt.figure(figsize = (12, 6))
# Columns are named through x=/hue= together with data=, matching the other
# countplot calls in this notebook. Recent seaborn releases interpret the
# first positional argument as `data`, not as the x variable.
sns.countplot(x = 'subject', hue = 'target', data = merge_fake_real)

In [None]:
# Two stacked panels: overall article counts per subject (top) and the same
# counts split by the Fake/True label (bottom).
fig, ax = plt.subplots(2, 1, figsize = (12, 12))
# Columns are passed by keyword with data=. Recent seaborn releases treat the
# first positional argument as `data`, not as the x variable.
sns.countplot(x = 'subject', data = merge_fake_real, ax = ax[0])
sns.countplot(x = 'subject', hue = 'target', data = merge_fake_real, ax = ax[1])

## Check for null values if any

In [None]:
merge_fake_real['text'].isna().sum()

In [None]:
# Word cloud built from the cleaned text of every article labelled 'True'.
plt.figure(figsize = (20, 20))
wc = WordCloud(max_words = 2000, width = 1600, height = 800, stopwords = stop)
# All matching documents are joined into one string before generation.
wc.generate(" ".join(merge_fake_real.loc[merge_fake_real['target'] == 'True', 'text']))
plt.imshow(wc, interpolation = 'bilinear')

In [None]:
# Word cloud built from the cleaned text of every article labelled 'Fake'.
plt.figure(figsize = (20, 20))
# Same canvas size as the 'True' word cloud (1600 wide, 800 high) so the two
# images are directly comparable. The values were previously swapped here.
wc = WordCloud(max_words = 2000,
               width = 1600,
               height = 800,
               stopwords = stop)
wc.generate(" ".join(merge_fake_real[merge_fake_real['target'] == 'Fake'].text))
plt.imshow(wc, interpolation = 'bilinear')

## Text Length of the Fake and Real News

In [None]:
# Side-by-side histograms of the character length of the cleaned text.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (12, 8))

# Left panel: articles labelled 'True'.
text_len = merge_fake_real.loc[merge_fake_real['target'] == 'True', 'text'].str.len()
ax1.set_title('Original text')
ax1.hist(text_len, color = 'red')

# Right panel: articles labelled 'Fake'.
text_len = merge_fake_real.loc[merge_fake_real['target'] == 'Fake', 'text'].str.len()
ax2.set_title('Fake text')
ax2.hist(text_len, color = 'blue')

## Average word length of Fake and Real News

In [None]:
# Distribution of the mean word length per document, one panel per label.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize = (20, 10))

# Per-document list of word lengths for the articles labelled 'True'.
word_len = merge_fake_real[merge_fake_real['target'] == 'True']['text'].str.split().apply(lambda x: [len(i) for i in x])
# Documents that are empty after cleaning have no words: map them to NaN
# explicitly instead of calling np.mean on an empty list.
# histplot(kde=True, stat='density') replaces the deprecated sns.distplot.
sns.histplot(word_len.map(lambda x: np.mean(x) if x else np.nan),
             kde = True, stat = 'density', ax = ax1, color = 'red')
ax1.set_title('Original text')

# Same computation for the articles labelled 'Fake'.
word_len = merge_fake_real[merge_fake_real['target'] == 'Fake']['text'].str.split().apply(lambda x: [len(i) for i in x])
sns.histplot(word_len.map(lambda x: np.mean(x) if x else np.nan),
             kde = True, stat = 'density', ax = ax2, color = 'green')
ax2.set_title('Fake text')

# Title typo fixed: "World" -> "Word".
fig.suptitle('Average Word Length in Each Text')

In [None]:
def get_corpus(text):
    """Flatten an iterable of documents into a single list of words.

    Each document is split on whitespace and the resulting tokens are
    collected in document order. Duplicates are kept.
    """
    # split() with no arguments already returns tokens without surrounding
    # whitespace, so they can be collected as they are.
    return [word for document in text for word in document.split()]

corpus = get_corpus(merge_fake_real['text'])

In [None]:
from collections import Counter

# Frequency table over the whole corpus; the ten most frequent words are
# kept as a {word: count} mapping.
counter = Counter(corpus)
most_common = dict(counter.most_common(10))

In [None]:
most_common

## Split the data into Independent and Dependent variables

In [None]:
X = merge_fake_real['text']
y = merge_fake_real['target']

## Use dummies for target

In [None]:
y = pd.get_dummies(y, drop_first = True)

In [None]:
X.head()

In [None]:
y

In [None]:
y = y.values.reshape(-1,)

In [None]:
y

## Using TfidfVectorizer

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
# TF-IDF features. Terms that occur in more than half of the documents
# (max_df = 0.5) are ignored.
# `stop` is a set, but scikit-learn's parameter validation accepts only
# 'english', a list, or None for `stop_words`. It is therefore passed as a
# sorted list, which also keeps the argument deterministic between runs.
tfvector = TfidfVectorizer(stop_words = sorted(stop), max_df = 0.5)
# Hold out 20% of the rows for evaluation, with a fixed seed for
# reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

In [None]:
X_train

In [None]:
X_test

In [None]:
X_train = tfvector.fit_transform(X_train)
X_test = tfvector.transform(X_test)

In [None]:
X_train

In [None]:
X_test

## Using Logistic Regression on top of vectors created by TfidfVectorizer

In [None]:
from sklearn.linear_model import LogisticRegression
logit = LogisticRegression()
logit.fit(X_train, y_train)

In [None]:
pred_data = logit.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
acc = accuracy_score(y_test, pred_data)
cr = classification_report(y_test, pred_data)
cm = confusion_matrix(y_test, pred_data)

In [None]:
print('The Accuracy of the model is ', round(acc*100, 2), '%')
print('-------'*12)
print('Classification Report is ', cr)
print('-------'*12)

In [None]:
# Confusion-matrix heatmap for the TF-IDF + logistic-regression model.
ax = plt.subplot()
# fmt = 'd' prints the cell counts as plain integers. The default annotation
# format abbreviates large counts into scientific notation.
sns.heatmap(cm, annot = True, fmt = 'd', ax = ax)
ax.set_xlabel('Predictions')
ax.set_ylabel('y_test')

## Using CountVectorizer

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 101)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words token counts using CountVectorizer's default settings.
# NOTE(review): the TfidfVectorizer earlier in this notebook is built with
# stop_words and max_df = 0.5, while no such options are passed here, so the
# two feature sets differ in more than the weighting scheme. Confirm this is
# intended before comparing the two models' scores.
cv = CountVectorizer()
# Fit the vocabulary on the training split only, then reuse it unchanged to
# encode the test split.
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [None]:
model2 = LogisticRegression()

In [None]:
model2.fit(X_train, y_train)

In [None]:
y_pred = model2.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
acc = accuracy_score(y_test, y_pred)
cr = classification_report(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [None]:
print('The Accuracy of the model is ', round(acc*100, 2), '%')
print('-------'*12)
print('Classification Report is ', cr)
print('-------'*12)

In [None]:
# Confusion-matrix heatmap for the CountVectorizer + logistic-regression
# model.
ax = plt.subplot()
# fmt = 'd' prints the cell counts as plain integers. The default annotation
# format abbreviates large counts into scientific notation.
sns.heatmap(cm, annot = True, fmt = 'd', ax = ax)
ax.set_xlabel('Predictions')
ax.set_ylabel('y_test')

## CountVectorizer seems to perform better than TfidfVectorizer here.