### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("darkgrid")
import matplotlib.pyplot as plt
import nltk
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import re,string,unicodedata
from keras.preprocessing import text, sequence
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk import pos_tag
from nltk.corpus import wordnet
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau
import tensorflow as tf

In [None]:
import os
# Print the full path of every file found under the Kaggle input directory,
# so the dataset locations used by the loading cells can be checked.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

### Data Loading

In [None]:
# Read True.csv into `true` (labelled as category 1 further down) and preview the first rows.
true = pd.read_csv("../input/fake-and-real-news-dataset/True.csv")
true.head()

In [None]:
# Read Fake.csv into `fake` (labelled as category 0 further down) and preview the first rows.
fake = pd.read_csv("../input/fake-and-real-news-dataset/Fake.csv")
fake.head()

In [None]:
# (rows, columns) of the real-news frame followed by the fake-news frame.
true.shape,fake.shape

In [None]:
# Report the total number of missing cells in each frame.
# `.isna().sum()` gives the per-column null count and the second `.sum()` totals it.
# The previous `.isna().any().sum()` only counted how many *columns* contain at
# least one null, which does not match the "Null Values" label printed here.
print("Null Values in Real News Data = ",true.isna().sum().sum())
print("Null Values in Fake News Data = ",fake.isna().sum().sum())

In [None]:
# Label each source frame: 1 = real news, 0 = fake news.
true['category'] = 1
fake['category'] = 0

# Stack both frames into one dataset. `ignore_index=True` gives the result a fresh
# 0..n-1 index; without it both frames keep their own 0-based index, so every
# low-numbered label appears twice and label-based lookups become ambiguous.
df = pd.concat([true, fake], ignore_index=True)
df.head()

Final Shape of the data

In [None]:
# (rows, columns) of the combined real + fake frame.
df.shape

In [None]:
# Summary of the object-dtype (string) columns only: count, unique values, top value and its frequency.
df.describe(include="object")

In [None]:
# Class balance: number of rows per label (1 = real, 0 = fake), as text and as a bar chart.
print(df['category'].value_counts())
# Name the column through `x=` / `data=` instead of passing the Series positionally:
# recent seaborn releases treat the first positional argument as `data`, so the
# positional form is deprecated or misinterpreted depending on the installed version.
sns.countplot(x='category', data=df)
plt.show()

In [None]:
# Articles per subject, split by label, to see how subjects are distributed across real and fake news.
sns.countplot(
    data=df,
    x='subject',
    hue='category',
)
# Subject names are long; rotate the tick labels so they stay readable.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Merge title, body and subject into a single text field.
# The columns are joined with a space: plain `+` glued the last word of one column
# to the first word of the next, creating tokens that exist in no article.
# NOTE(review): `subject` is folded into the model input - confirm that it does not
# trivially reveal the label (compare the subject/category plot).
df["text"] = df["title"] + " " + df["text"] + " " + df["subject"]

# Keep only the model input and the target.
df = df[["text", "category"]]

### STOPWORDS

Stopwords are common English words that do not add much meaning to a sentence, so they can safely be ignored without changing what the sentence says — for example, *the*, *he* and *have*. Ready-made lists of such words ship with NLP libraries: below we take the stop-word list from spaCy and the `stopwords` corpus from NLTK (both must already be available in the Python environment) and combine them with the punctuation characters.

In [None]:
import spacy
# Load spaCy's 'en_core_web_sm' pipeline (the model package must be installed)
# and take the stop-word collection exposed on its language defaults.
nlp = spacy.load('en_core_web_sm')
list1 = nlp.Defaults.stop_words

In [None]:
# NLTK's English stop-word list.
list2 = stopwords.words('english')
# Every ASCII punctuation character as its own entry.
punctuation = list(string.punctuation)

# Union of the spaCy list, the NLTK list and the punctuation characters.
Stopwords = set(list1).union(list2, punctuation)
len(Stopwords)

### Data Cleaning

In [None]:
# Single WordNet lemmatizer instance, created once and reused by clean_text for every token.
lemma=WordNetLemmatizer()

#text cleaning function
# (pattern, replacement) pairs applied, in order, to lower-cased text.
# Word-specific forms are anchored on both sides. The generic suffix forms carry a
# trailing \b so that an opening single quote is not mistaken for a contraction:
# without it "'deplorable'" became " wouldeplorable" and "'really'" became " areally".
_CONTRACTIONS = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"\bi'm\b", "i am"),
        (r"\bhe's\b", "he is"),
        (r"\bshe's\b", "she is"),
        (r"\bthat's\b", "that is"),
        (r"\bwhat's\b", "what is"),
        (r"\bwhere's\b", "where is"),
        (r"'ll\b", " will"),
        (r"'ve\b", " have"),
        (r"'re\b", " are"),
        (r"'d\b", " would"),
        (r"\bwon't\b", "will not"),
        (r"\bcan't\b", "cannot"),
    )
)

# Any run of characters other than ASCII letters and digits.
_NON_ALNUM = re.compile(r"[^A-Za-z0-9]+")


def clean_text(text, stop_words=None, lemmatizer=None):
    """
    Normalise one document for bag-of-words modelling.

    Steps: lower-case, expand a fixed set of English contractions, replace every
    run of non-alphanumeric characters with a space, drop stop words, and
    lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (for example the NaN pandas uses for
        a missing cell) is treated as an empty document.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to the notebook-level `Stopwords` set.
    lemmatizer : object with a `lemmatize(word)` method, optional
        Defaults to the notebook-level `lemma` instance.

    Returns
    -------
    str
        The kept tokens, each followed by a single space (so a non-empty result
        ends with a space); "" when nothing is kept.
    """
    if not isinstance(text, str):
        return ""
    if stop_words is None:
        stop_words = Stopwords
    if lemmatizer is None:
        lemmatizer = lemma

    text = text.lower()
    for pattern, replacement in _CONTRACTIONS:
        text = pattern.sub(replacement, text)

    # One pass is enough: every non-alphanumeric run (punctuation, whitespace,
    # accented characters, ...) becomes a single space before tokenising.
    text = _NON_ALNUM.sub(" ", text)

    # Stop words are checked on the raw token, before lemmatization.
    return "".join(
        lemmatizer.lemmatize(word) + " "
        for word in text.split()
        if word not in stop_words
    )

In [None]:
# Run clean_text over every row of the "text" column.
# The column is overwritten in place, so re-running this cell cleans text that was already cleaned.
df["text"]=df["text"].apply(clean_text)

### Word Cloud

In [None]:
from wordcloud import WordCloud

#### Word Cloud for Real News

In [None]:
# Word cloud built from the text of all rows labelled 1 (real news), joined into one string.
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=2000,
    width=1000,
    height=500,
    stopwords=Stopwords,
)
wc = wc.generate(" ".join(df.loc[df["category"] == 1, "text"]))
plt.axis("off")
plt.imshow(wc, interpolation='bilinear')
plt.show()


#### Word Cloud for Fake News

In [None]:
# Word cloud built from the text of all rows labelled 0 (fake news), joined into one string.
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=2000,
    width=1000,
    height=500,
    stopwords=Stopwords,
)
wc = wc.generate(" ".join(df.loc[df["category"] == 0, "text"]))
plt.axis("off")
plt.imshow(wc, interpolation='bilinear')
plt.show()


#### Word Cloud for Whole data

In [None]:
# Word cloud built from the text of every row (both labels), drawn on a white background.
plt.figure(figsize=(20, 20))
wc = WordCloud(
    max_words=2000,
    width=1000,
    height=500,
    stopwords=Stopwords,
    background_color='white',
)
wc = wc.generate(" ".join(df["text"]))
plt.axis("off")
plt.imshow(wc, interpolation='bilinear')
plt.show()

### Classification Model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [None]:
# Separate the model input (cleaned text) from the target (category label).
X = df["text"]
y = df["category"]

# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Pipeline

## Logistic Regression

In [None]:
# Two-step pipeline: TF-IDF vectorisation of the text, then a logistic regression
# classifier fitted on the resulting features.
clf_text = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LogisticRegression()),
    ]
)
clf_text.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with whichever pipeline `clf_text` currently holds,
# then print the per-class precision / recall / F1 report.
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Overall accuracy, then F1 score, of the current `predictions` on the held-out set.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))
print(metrics.f1_score(y_true=y_test, y_pred=predictions))

# Confusion matrix (rows: actual label, columns: predicted label).
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

## Naive Bayes

In [None]:
# Two-step pipeline: TF-IDF vectorisation of the text, then a multinomial
# naive Bayes classifier (smoothing alpha=0.5). Rebinds `clf_text`.
clf_text = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", MultinomialNB(alpha=0.5)),
    ]
)
clf_text.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with whichever pipeline `clf_text` currently holds,
# then print the per-class precision / recall / F1 report.
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Overall accuracy, then F1 score, of the current `predictions` on the held-out set.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))
print(metrics.f1_score(y_true=y_test, y_pred=predictions))

# Confusion matrix (rows: actual label, columns: predicted label).
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

### SVC

In [None]:
# Two-step pipeline: TF-IDF vectorisation of the text, then a linear
# support-vector classifier. Rebinds `clf_text`.
clf_text = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", LinearSVC()),
    ]
)
clf_text.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with whichever pipeline `clf_text` currently holds,
# then print the per-class precision / recall / F1 report.
predictions = clf_text.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=predictions))

In [None]:
# Overall accuracy, then F1 score, of the current `predictions` on the held-out set.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))
print(metrics.f1_score(y_true=y_test, y_pred=predictions))

# Confusion matrix (rows: actual label, columns: predicted label).
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

### Random Forest Classifier

In [None]:
# Two-step pipeline: TF-IDF vectorisation of the text, then a random forest
# classifier with a fixed seed (random_state=0).
clf_rf = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", RandomForestClassifier(random_state=0)),
    ]
)
clf_rf.fit(X_train, y_train)

# Predict the held-out rows and print the per-class precision / recall / F1 report.
predictions = clf_rf.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=predictions))


In [None]:
# Overall accuracy, then F1 score, of the current `predictions` on the held-out set.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))
print(metrics.f1_score(y_true=y_test, y_pred=predictions))

# Confusion matrix (rows: actual label, columns: predicted label).
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

### Decision Tree Classifier

In [None]:
# Two-step pipeline: TF-IDF vectorisation of the text, then a decision tree
# classifier with a fixed seed (random_state=2).
clf_dt = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("clf", DecisionTreeClassifier(random_state=2)),
    ]
)
clf_dt.fit(X_train, y_train)

# Predict the held-out rows and print the per-class precision / recall / F1 report.
predictions = clf_dt.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=predictions))


In [None]:
# Overall accuracy, then F1 score, of the current `predictions` on the held-out set.
print(metrics.accuracy_score(y_true=y_test, y_pred=predictions))
print(metrics.f1_score(y_true=y_test, y_pred=predictions))

# Confusion matrix (rows: actual label, columns: predicted label).
print(metrics.confusion_matrix(y_true=y_test, y_pred=predictions))

Updating......

Please upvote the notebook if you find it useful. Your comments are also requested for improvement of the notebook.