In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Importing data

In [None]:
# Read the fake-news and true-news CSV files into two separate DataFrames.
fake, true = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/fake-and-real-news-dataset/Fake.csv',
        '../input/fake-and-real-news-dataset/True.csv',
    )
)

#### Checking for first 5 rows

In [None]:
fake.head()

#### Checking for value counts

In [None]:
fake['subject'].value_counts()

In [None]:
true['subject'].value_counts()

#### Creating a target variable for classification

In [None]:
# Add the binary target column: 1 marks fake articles, 0 marks true ones.
for frame, label in ((fake, 1), (true, 0)):
    frame['category'] = label

In [None]:
# Successfully created the target variable
true.head()

#### Concatenating the two different dataframes (true and fake) into one

In [None]:
# Stack fake and true articles into one frame with a fresh 0..n-1 index.
df = pd.concat([fake, true], ignore_index=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Class balance of the combined dataset (category 1 = fake, 0 = true).
plt.figure(figsize = (6,7))
# Name the column through `x=` / `data=`: newer seaborn releases no longer
# accept the plotted variable as the first positional argument.
ax = sns.countplot(x = 'category', data = df, palette = "Spectral_r")
# Axis labels replace the former bare `plt.legend()`, which had no labelled
# artists to show and therefore only produced a warning.
ax.set(title = "Articles per class", xlabel = "category (0 = true, 1 = fake)", ylabel = "count")
plt.show()

In [None]:
# Number of articles per subject across the combined dataset.
plt.figure(figsize = (16,7))
# Name the column through `x=` / `data=`: newer seaborn releases no longer
# accept the plotted variable as the first positional argument.
ax = sns.countplot(x = 'subject', data = df, palette = "Spectral_r")
ax.set(title = "Articles per subject", xlabel = "subject", ylabel = "count")
plt.show()

#### Creating the dataframe with the important variables

In [None]:
# Keep only the article body and its label. The explicit copy makes the result
# an independent frame, so the later in-place edits (dropping rows, overwriting
# the `text` column) cannot trigger pandas' SettingWithCopyWarning.
df = df[['text', "category"]].copy()

In [None]:
df.head()

Data Cleaning

In [None]:
# Percentage of missing values in each column.
df.isna().sum().mul(100).div(len(df))

#### Checking for empty spaces in the dataframe

In [None]:
# Collect the index labels of rows whose text is empty or whitespace-only.
# Vectorised string methods are used instead of `Series.iteritems()`, which
# was removed in pandas 2.0. Stripping and comparing with '' also catches
# zero-length strings (for which `str.isspace()` is False), and missing values
# simply compare as "not blank" rather than raising.
is_blank = df['text'].str.strip().eq('')
blank = df.index[is_blank].tolist()

len(blank)

In [None]:
blank

In [None]:
# Drop the rows whose text is blank (no article body available).
# Rebinding `df` instead of dropping in place, together with
# `errors="ignore"`, keeps this cell safe to re-run: labels that were already
# removed are skipped instead of raising KeyError.
print("Before dropping:" ,df.shape)
df = df.drop(index=blank, errors="ignore")
print("After dropping:" ,df.shape)

In [None]:
#  Import NLP necessary libraries
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import spacy
import re
nlp = spacy.load("en_core_web_sm")

In [None]:
lemma = WordNetLemmatizer()

In [None]:
# Combine spaCy's and NLTK's English stop-word lists into one lookup set.
list1 = nlp.Defaults.stop_words
list2 = stopwords.words("english")

Stopwords = set(list1).union(list2)

# Report the size of each source list, then of the merged set.
for collection in (list1, list2, Stopwords):
    print(len(collection))

Data cleaning 

In [None]:
# (pattern, replacement) pairs applied in order to expand common English
# contractions. The text is lower-cased before these run, so both sides are
# lower-case. Word boundaries keep "he's" from firing inside "she's" and keep
# the suffix rules from firing on a quote that merely opens a word.
_CONTRACTIONS = (
    (r"\bwon't\b", "will not"),
    (r"\bcan't\b", "cannot"),
    (r"\bi'm\b", "i am"),
    (r"\bhe's\b", "he is"),
    (r"\bshe's\b", "she is"),
    (r"\bthat's\b", "that is"),
    (r"\bwhat's\b", "what is"),
    (r"\bwhere's\b", "where is"),
    (r"'ll\b", " will"),
    (r"'re\b", " are"),
    (r"'d\b", " would"),
)


def clean_text(text, stop_words=None, lemmatizer=None):
    """Normalise one article into a string of lemmatised, non-stop-word tokens.

    Steps: lower-case, expand the contractions listed in ``_CONTRACTIONS``,
    replace every run of characters other than ``a-z`` / ``0-9`` with a single
    space, drop stop words, and lemmatise what is left.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (for example a NaN from a
        missing cell) yields an empty string.
    stop_words : collection of str, optional
        Tokens to discard. Defaults to the notebook-level ``Stopwords`` set.
    lemmatizer : object with a ``lemmatize(word)`` method, optional
        Defaults to the notebook-level ``lemma`` instance.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ""
    # Resolve the defaults lazily so the function still works when called with
    # a single argument, exactly as `df["text"].apply(clean_text)` does.
    if stop_words is None:
        stop_words = Stopwords
    if lemmatizer is None:
        lemmatizer = lemma

    text = text.lower()
    for pattern, replacement in _CONTRACTIONS:
        text = re.sub(pattern, replacement, text)

    # Everything that is not a lower-case letter or digit (punctuation,
    # hyphens, leftover apostrophes, non-ASCII symbols) becomes a separator.
    text = re.sub(r"[^a-z0-9]+", " ", text)

    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_words
    )

In [None]:
df['text'][10]

In [None]:
clean_text(df['text'][10])

In [None]:
df["text"] = df["text"].apply(clean_text)

In [None]:
df['text']

In [None]:
from wordcloud import WordCloud

## Creating a word cloud for true news with a white background

In [None]:
# Word cloud of the true-news articles (category 0) on a white background.
fig, ax = plt.subplots(figsize=(15, 15))
true_corpus = " ".join(df.loc[df.category == 0, "text"])
wc = WordCloud(
    background_color='white',
    max_words=500,
    width=1600,
    height=800,
).generate(true_corpus)
ax.axis("off")
ax.imshow(wc, interpolation='bilinear')

## Creating a word cloud for fake news with a black background

In [None]:
# Word cloud of the fake-news articles (category 1) with WordCloud's default
# background colour.
fig, ax = plt.subplots(figsize=(15, 15))
fake_corpus = " ".join(df.loc[df.category == 1, "text"])
wc = WordCloud(max_words=500, width=1600, height=800).generate(fake_corpus)
ax.axis("off")
ax.imshow(wc, interpolation='bilinear')

In [None]:
# Creating a more intuitive word cloud

from PIL import Image

In [None]:
# Build the word-cloud mask from the thumbs-up icon: start from an all-white
# RGB canvas of the icon's size and paste the icon onto it, using the icon
# itself as the paste mask (presumably via its alpha channel - confirm the PNG
# has one).
thumb="../input/worlcloudimages/thumbs-up.png"
# The context manager closes the image file once the pixels have been copied.
with Image.open(thumb) as icon:
    mask=Image.new(mode="RGB",size=icon.size, color=(255,255,255))
    # Pass the mask image through `mask=`; `box=` is meant for coordinates.
    mask.paste(icon, mask=icon)
rgb_array=np.array(mask)

In [None]:
# Word cloud of the true-news articles (category 0), shaped by `rgb_array`.
fig, ax = plt.subplots(figsize=(15, 15))
true_corpus = " ".join(df.loc[df.category == 0, "text"])
wc = WordCloud(
    background_color='white',
    mask=rgb_array,
    max_words=1500,
    width=1600,
    height=800,
).generate(true_corpus)
ax.axis("off")
ax.imshow(wc, interpolation='bilinear')

In [None]:
# Build the word-cloud mask from the skull icon: start from an all-white RGB
# canvas of the icon's size and paste the icon onto it, using the icon itself
# as the paste mask (presumably via its alpha channel - confirm the PNG has
# one).
skull="../input/worlcloudimages/skull-icon.png"
# The context manager closes the image file once the pixels have been copied.
with Image.open(skull) as icon:
    mask=Image.new(mode="RGB",size=icon.size, color=(255,255,255))
    # Pass the mask image through `mask=`; `box=` is meant for coordinates.
    mask.paste(icon, mask=icon)
rgb_array1=np.array(mask)

In [None]:
# Word cloud of the fake-news articles (category 1), shaped by `rgb_array1`.
fig, ax = plt.subplots(figsize=(15, 15))
fake_corpus = " ".join(df.loc[df.category == 1, "text"])
wc = WordCloud(
    mask=rgb_array1,
    max_words=1500,
    width=1600,
    height=800,
).generate(fake_corpus)
ax.axis("off")
ax.imshow(wc, interpolation='bilinear')

## Data Modelling

In [None]:
# Splitting the data into train and test
from sklearn.model_selection import train_test_split
# Features are the cleaned article text; the target is the fake/true label.
x, y = df['text'], df['category']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=1,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
x_train_tfidf = vectorizer.fit_transform(x_train)

In [None]:
x_train_tfidf

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest trained on the TF-IDF matrix of the training split.
# `max_features='sqrt'` is what the former 'auto' setting meant for
# classifiers; 'auto' was deprecated and then removed in scikit-learn 1.3,
# where it is rejected as an invalid parameter.
rfc=RandomForestClassifier(random_state=42,max_features='sqrt', n_estimators= 200, max_depth=8, criterion='gini')
rfc.fit(x_train_tfidf, y_train)


In [None]:
from sklearn.pipeline import Pipeline
# Chain TF-IDF vectorisation and a random forest so raw text can be passed
# straight to `fit` / `predict`.
text_rfc = Pipeline(
    steps=[
        ("tfidf", TfidfVectorizer()),
        ("rfc", RandomForestClassifier(random_state=42)),
    ]
)
text_rfc.fit(x_train, y_train)

In [None]:
predictions = text_rfc.predict(x_test)

In [None]:
from sklearn import metrics
print("Accuracy for Random Forest on data: ",metrics.accuracy_score(y_test,predictions))

In [None]:
print(metrics.confusion_matrix(y_test, predictions))

In [None]:
from sklearn import metrics
print(metrics.classification_report(y_test, predictions))