In [None]:
#Importing necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
#Loading the dataset
# Directory that contains Fake.csv and True.csv; change this one constant to
# point the notebook at a different copy of the data.
DATA_DIR = '../input/fake-and-real-news-dataset'

fake = pd.read_csv(f'{DATA_DIR}/Fake.csv')
true = pd.read_csv(f'{DATA_DIR}/True.csv')

In [None]:
# Preview the first 10 rows of the fake-news dataset.
fake.head(10)

In [None]:
# Preview the first 10 rows of the true-news dataset.
true.head(10)

In [None]:
# (rows, columns) of the fake-news dataset.
fake.shape

In [None]:
# (rows, columns) of the true-news dataset.
true.shape

In [None]:
# Number of rows per distinct 'subject' value in the fake-news dataset.
fake['subject'].value_counts()

Here we can observe that there are a total of 6 different subjects, among which "News" has the highest count.

In [None]:
# Number of rows per distinct 'subject' value in the true-news dataset.
true['subject'].value_counts()

In this dataset, we can observe that there are only 2 different subjects, among which "politicsNews" has the highest count.

To analyse the data and build a model, we will combine the two datasets. Before that, we need to add an extra column that lets us distinguish fake news from true news.
For this, we add a feature named 'category', which is 1 for every fake news article and 0 for every true news article.

In [None]:
# Tag every row with its class label: 1 = fake news, 0 = true news
for frame, label in ((fake, 1), (true, 0)):
    frame['category'] = label

Let's now join both datasets.

In [None]:
# Stack the fake rows on top of the true rows and give the result a fresh 0..n-1 index
df = pd.concat([fake, true], ignore_index=True)

In [None]:
# Preview the first rows of the combined dataset (now including the 'category' label).
df.head()

In [None]:
# (rows, columns) of the combined dataset.
df.shape

Let's plot the counts of the 'category' and 'subject' columns.

In [None]:
#Countplot of 'category' attribute
# The column is passed by keyword (x=..., data=...): recent seaborn releases bind
# the first positional argument to `data`, not to the x variable.
# No legend is drawn because the plot has no labelled artists; the axis labels
# explain the encoding instead.
fig, ax = plt.subplots(figsize=(6,7))
sns.countplot(x='category', data=df, ax=ax)
ax.set_title('Number of articles per category')
ax.set_xlabel('category (1 = fake, 0 = true)')
ax.set_ylabel('number of articles')
plt.show()

In [None]:
#Countplot for 'subject' attribute
# The column is passed by keyword (x=..., data=...): recent seaborn releases bind
# the first positional argument to `data`, not to the x variable.
fig, ax = plt.subplots(figsize=(10,7))
sns.countplot(x='subject', data=df, ax=ax)
ax.set_title('Number of articles per subject')
ax.set_xlabel('subject')
ax.set_ylabel('number of articles')
# Subject names are long enough to overlap when drawn horizontally.
ax.tick_params(axis='x', rotation=45)
plt.show()

**Data Cleaning**

In [None]:
# Number of missing (NaN) values in each column of the combined dataset.
df.isna().sum()

In [None]:
#checking if there is empty string in TEXT column
# A row is blank when its text is missing, empty, or whitespace-only.
# The vectorised .str accessor replaces the Series.iteritems() loop (iteritems was
# removed in pandas 2.0) and, unlike str.isspace(), also catches the empty string "".
is_blank = df["text"].fillna("").str.strip().eq("")

# index labels of the blank rows
blanks = df.index[is_blank].tolist()

len(blanks)

Since we have to decide from the text whether a news article is fake or not, we only need two attributes: 'text' and 'category'. We will merge the title into the text and drop the other features.

In [None]:
#Instead of dropping these values we are going to merge title with text
# Join with a single space so the last word of the title and the first word of
# the body do not fuse into one token.
df["text"] = df["title"] + " " + df["text"]

#we only need two columns rest can be ignored
# .copy() makes the result an independent frame, so later column assignments
# do not raise SettingWithCopyWarning.
df = df[["text","category"]].copy()

In [None]:
#Importing libraries for cleaning purpose

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re

**Stopword** 

Words like “a” and “the” appear so frequently that they don’t require tagging as thoroughly as nouns, verbs and modifiers.                       
We call them stop words, and they can be filtered from the text to be processed.

**Lemmatization**

Lemmatization takes the morphological analysis of words into consideration. The output of lemmatization is called a ‘lemma’, which is a root word rather than a root stem.

In [None]:
# Load spaCy's small English pipeline; the following cells read its default
# stop-word list from `nlp.Defaults`.
nlp = spacy.load('en_core_web_sm')

# NLTK WordNet lemmatizer instance used by text_cleaning() to reduce words to their lemma.
lemma = WordNetLemmatizer()

In [None]:
# Build one combined stop-word set from spaCy and NLTK

l1 = nlp.Defaults.stop_words        # spaCy's English stop words
l2 = stopwords.words('english')     # NLTK's English stop words

# Union of both collections; duplicates collapse automatically
Stopwords = set(l1).union(l2)

# Report the size of each source list and of the combined set, in that order
for vocab in (l1, l2, Stopwords):
    print(len(vocab))

In [None]:
#Function for cleaning the text 
# Contraction expansions, applied in this order to lower-cased text.
_CONTRACTION_RULES = (
    (r"i'm", "i am"),
    (r"he's", "he is"),
    (r"she's", "she is"),
    (r"that's", "that is"),
    (r"what's", "what is"),
    (r"where's", "where is"),
    (r"\'ll", " will"),
    (r"\'ve", " have"),
    (r"\'re", " are"),
    (r"\'d", " would"),
    (r"won't", "will not"),
    (r"can't", "cannot"),
)

# Any run of characters other than ASCII letters/digits acts as a token separator.
_NON_ALNUM = re.compile(r"[^A-Za-z0-9]+")


def text_cleaning(text, stopwords=None, lemmatizer=None):
    """Normalise one document for vectorisation.

    Steps: lower-case, expand common English contractions, replace every
    non-alphanumeric run with a space, drop stop words, and lemmatize the
    remaining tokens.

    Parameters
    ----------
    text : str
        Raw document. Any non-string value (e.g. NaN or None coming from a
        missing cell) is treated as an empty document.
    stopwords : collection of str, optional
        Tokens to discard. Defaults to the notebook-level ``Stopwords`` set.
    lemmatizer : object with ``lemmatize(word)``, optional
        Defaults to the notebook-level ``lemma`` instance.

    Returns
    -------
    str
        Cleaned tokens joined by single spaces, with no leading or trailing
        space; ``""`` when nothing survives.
    """
    # Missing values reach this function as float NaN / None; they have no text.
    if not isinstance(text, str):
        return ""

    # Resolve the defaults at call time so the notebook globals are only
    # required when the caller does not supply its own resources.
    if stopwords is None:
        stopwords = Stopwords
    if lemmatizer is None:
        lemmatizer = lemma

    text = text.lower()

    # Typographic apostrophes would otherwise bypass every contraction rule.
    text = text.replace("\u2019", "'").replace("\u2018", "'")

    for pattern, expansion in _CONTRACTION_RULES:
        text = re.sub(pattern, expansion, text)

    # One pass is enough: punctuation and whitespace are all non-alphanumeric,
    # so they end up as token separators either way.
    # NOTE: non-ASCII letters (e.g. accented characters) are separators too.
    text = _NON_ALNUM.sub(" ", text)

    # join() avoids repeated string concatenation and the trailing space it left behind.
    return " ".join(
        lemmatizer.lemmatize(token)
        for token in text.split()
        if token not in stopwords
    )

Let's now clean the 'text' column of our dataframe using the above function.

In [None]:
# Run text_cleaning over every document, overwriting the 'text' column with the cleaned version
df['text'] = df['text'].map(text_cleaning)

We have cleaned our text data; let's visualize the true news (category 0) with a word cloud.

In [None]:
from wordcloud import WordCloud

In [None]:
# Word cloud built from the cleaned text of every row with category == 0
true_corpus = " ".join(df.loc[df['category'] == 0, 'text'])
wc = WordCloud(max_words=500, width=1600, height=800).generate(true_corpus)

plt.figure(figsize=(20,20))
plt.axis('off')
plt.imshow(wc, interpolation='bilinear')

# Model Building

In [None]:
# Feature (cleaned text) and target (1 = fake, 0 = true)
X, y = df['text'], df['category']

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
#Importing libraries
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [None]:
#Tfidf vectorizer
# TF-IDF features feeding a linear SVM. random_state pins the pseudo-random
# shuffling LinearSVC's solver may perform, so repeated runs give the same model.
text_clf = Pipeline([('tfidf',TfidfVectorizer()),('clf',LinearSVC(random_state=42))])

#Fitting the model
text_clf.fit(X_train, y_train)

In [None]:
# Predict the category (1 = fake, 0 = true) of every held-out document.
predictions =  text_clf.predict(X_test)

In [None]:
from sklearn import metrics
# Per-class precision, recall and F1 on the held-out set (class 1 = fake, 0 = true).
print(metrics.classification_report(y_test, predictions))

In [None]:
# Fraction of held-out documents whose predicted category matches the true one.
print(metrics.accuracy_score(y_test,predictions))

In [None]:
# Confusion matrix: rows are the actual categories, columns the predicted ones,
# both in sorted label order (0 = true news, then 1 = fake news).
print(metrics.confusion_matrix(y_test,predictions))