# **Fake News Detector**

## Importing modules

In [None]:
import pandas as pd
import os
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report 

## Loading the Dataset

In [None]:
# Read the two source files: True.csv feeds `real`, Fake.csv feeds `fake`.
real, fake = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/fake-and-real-news-dataset/True.csv",
        "/kaggle/input/fake-and-real-news-dataset/Fake.csv",
    )
)

In [None]:
# Preview the first rows of the real-news frame to check its columns.
real.head()

In [None]:
# Preview the first rows of the fake-news frame to check its columns.
fake.head()

Adding a label that indicates whether the news is fake or real.
**Real news is denoted using 1.
Fake news is denoted using 0.**

In [None]:
# Tag every row with its class so the origin survives merging: 1 = real, 0 = fake.
real['label'] = 1
fake['label'] = 0

Combining the two datasets and shuffling the result.

In [None]:
# Stack both frames, shuffle the rows with a fixed seed so the two classes are
# interleaved reproducibly, and renumber the index from zero.
news = (
    pd.concat([real, fake])
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)
news.head()

In [None]:
# Count the missing values in each column.
news.isna().sum()

We can see that our dataset has no null values.

In [None]:
# Plot how many articles fall in each class (0 = fake, 1 = real).
# The column is named by keyword because recent seaborn releases treat the
# first positional argument as `data`, not as the variable to count.
sns.countplot(x='label', data=news);

From the above plot, we can see that our dataset is balanced.

In [None]:
# Fold the headline into the article body so there is a single text field,
# then discard the columns that are not used as features.
news['text'] = news['title'] + ' ' + news['text']
news.drop(columns=['title', 'subject', 'date'], inplace=True)
news.head()

## Text Preprocessing

Now we remove stop words, punctuation, and digits from the news text, and lemmatize the remaining words.

In [None]:
# Load spaCy's small English pipeline by its full package name; the 'en'
# shortcut was removed in spaCy 3 and no longer loads there.
# The parser and NER components are switched off because clean_text only
# reads lemmas and lexical flags, which do not depend on them, and skipping
# them makes processing the whole corpus considerably faster.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def clean_text(text):
    """Return *text* as a space-joined string of lowercased lemmas.

    Tokens that spaCy flags as punctuation, whitespace, all-digit, or stop
    words are left out.
    """
    lemmas = [
        token.lemma_.lower()
        for token in nlp(text)
        if not (token.is_punct or token.is_space or token.is_digit or token.is_stop)
    ]
    return ' '.join(lemmas)

In [None]:
# Run every article through clean_text, overwriting the raw text column.
news['text'] = news['text'].map(clean_text)
news.head()

## Word Clouds

### ***Word Cloud for the Fake news.***

In [None]:
# Join every fake article (label 0) into one string and build its word cloud.
wordcloud = WordCloud(width=800, height=800, max_font_size=120)
wordcloud = wordcloud.generate(" ".join(news.loc[news['label'] == 0, 'text']))

# Draw the fake-news cloud on a square canvas with the axes hidden.
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout()
plt.show()

### ***Word Cloud for the Real news.***

In [None]:
# Join every real article (label 1) into one string and build its word cloud.
wordcloud = WordCloud(width=800, height=800, max_words=200, max_font_size=120).generate(
    " ".join(news[news['label'] == 1].text)
)

# Plot the word cloud for the real news data.
plt.figure(figsize=(8, 8))
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout()
plt.show()

## Splitting the data for training and testing.

In [None]:
# Hold out 20% of the articles for evaluation. The seed is fixed (the same
# value used for the shuffle) so the split, and therefore the reported scores,
# are reproducible from run to run.
text_train, text_test, label_train, label_test = train_test_split(
    news['text'], news['label'], test_size=0.2, random_state=1
)

## **CountVectorizer and TF-IDF Transformer.**

We fit the CountVectorizer and TF-IDF Transformer on the training dataset only, and then use them to transform both the training and testing datasets.

In [None]:
# Learn the vocabulary from the training texts only and build their
# document-term count matrix in the same pass.
count_vectorizer = CountVectorizer()
freq_term_matrix = count_vectorizer.fit_transform(text_train)

In [None]:
# Fit the TF-IDF weighting on the training counts and re-weight them.
tfidf = TfidfTransformer()
tfidf_matrix = tfidf.fit_transform(freq_term_matrix)
# Read the shape straight off the sparse matrix: densifying it with toarray()
# just to print two numbers can exhaust memory on a large vocabulary.
print(tfidf_matrix.shape)

In [None]:
# Encode the test texts with the vocabulary and IDF weights learned from the
# training data; nothing is re-fitted here.
test_count_vect = count_vectorizer.transform(text_test)
test_tfidf = tfidf.transform(test_count_vect)
# Read the shape straight off the sparse matrix: densifying it with toarray()
# just to print two numbers can exhaust memory on a large vocabulary.
print(test_tfidf.shape)

## **Classification**

### ***Logistic Regression Model***

In [None]:
# Train logistic regression on the TF-IDF features, then label the test set.
logreg = LogisticRegression(C=1e4, max_iter=300).fit(tfidf_matrix, label_train)
predlogreg = logreg.predict(test_tfidf)

# Report confusion matrix, accuracy, per-class metrics and the model's own score.
cm = confusion_matrix(label_test, predlogreg)
print('Confusion Matrix:', cm, sep='\n')
print('Accuracy Score:', accuracy_score(label_test, predlogreg))
print('Report: ', classification_report(label_test, predlogreg), sep='\n')
print(logreg.score(test_tfidf, label_test))

### ***Naive Bayes Model***

In [None]:
# Train multinomial naive Bayes on the TF-IDF features, then label the test set.
nb = MultinomialNB().fit(tfidf_matrix, label_train)
prednb = nb.predict(test_tfidf)

# Report confusion matrix, accuracy, per-class metrics and the model's own score.
cm = confusion_matrix(label_test, prednb)
print('Confusion Matrix:', cm, sep='\n')
print('Accuracy Score:', accuracy_score(label_test, prednb))
print('Report: ', classification_report(label_test, prednb), sep='\n')
print(nb.score(test_tfidf, label_test))

We see that the Logistic Regression model performs better.