In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import re

import nltk as nlp # Main Library for NLP
from nltk.corpus import stopwords # stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB #MultinomialNB # for the model - also try Guassian

import nltk, re, string, collections
from nltk.util import ngrams
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. EDA and Data Preprocessing

In [2]:
# Load the two CSV files of the dataset into separate frames:
# Fake.csv -> `fake`, True.csv -> `real`
fake = pd.read_csv("../input/fake-and-real-news-dataset/Fake.csv")
real = pd.read_csv("../input/fake-and-real-news-dataset/True.csv")

In [3]:
# Preview the first rows of the frame loaded from Fake.csv
fake.head()

In [4]:
# Preview the first rows of the frame loaded from True.csv
real.head()

In [5]:
# (rows, columns) of each frame
fake.shape,real.shape
# If I concatenate these, the combined frame should have 23481 + 21417 = 44898 rows

In [6]:
# Print the column names, dtypes and non-null counts of both frames
fake.info()
real.info()

In [7]:
# Add the target label as a number: 1 if the article is fake, 0 otherwise
fake['class'] = 1
real['class'] = 0

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of `real` repeat labels already used by `fake`, so the index
# of `all_data` would contain duplicates.
all_data = pd.concat([fake, real], ignore_index=True)
all_data.head()

In [8]:
# (rows, columns) of the combined frame
all_data.shape

In [9]:
# Bar chart of the number of articles per subject in the combined frame
fig, ax = plt.subplots(figsize=(10, 10))
plt.xticks(rotation=45)
ax = sns.countplot(x=all_data["subject"], ax=ax)

In [10]:
# Remove the columns that are not used from here on
unused_columns = ['title', 'date', 'subject']
all_data = all_data.drop(columns=unused_columns)
all_data.head()

In [11]:
# Shuffle the rows. The fixed seed (the same value the train/test split uses)
# makes the shuffled order identical on every run.
all_data = all_data.sample(frac=1, random_state=5)
all_data.head()

Let's process our text!

# 2. Text Preprocessing

In [12]:
# text_process() needs more than the stop-word list: nlp.word_tokenize relies
# on the Punkt tokenizer models and WordNetLemmatizer on the WordNet corpus.
# "punkt_tab" is the resource name that newer NLTK releases look up.
for resource in ("stopwords", "punkt", "punkt_tab", "wordnet"):
    nlp.download(resource)
lemma=nlp.WordNetLemmatizer()

In [13]:
 def text_process(data):
     """Clean, tokenise and lemmatise every document in ``data.text``.

     Each text has all non-letter characters replaced by spaces, is
     lower-cased, split into tokens with ``nlp.word_tokenize``, stripped of
     English stop words and lemmatised with the module-level ``lemma``.

     Parameters
     ----------
     data : object with a ``text`` attribute that iterates over strings
         (the notebook passes pandas DataFrames that have a ``text`` column).

     Returns
     -------
     list of str
         One space-joined string of processed tokens per input text, in the
         same order as the input.
     """
     # Build the stop-word set and compile the pattern once; the stop-word set
     # was previously rebuilt for every single token.
     stop_words = set(stopwords.words("english"))
     non_letters = re.compile("[^a-zA-Z]")

     text_list = []
     for text in data.text:
         cleaned = non_letters.sub(" ", text).lower()  # keep letters only, lowercase
         tokens = nlp.word_tokenize(cleaned)  # split into words
         # Drop stop words, then lemmatise what is left
         lemmas = [lemma.lemmatize(word) for word in tokens if word not in stop_words]
         text_list.append(" ".join(lemmas))

     return text_list

**You will see me take samples of my datasets - this is because my computer is not powerful enough to process the full datasets.**

In [14]:
# Fixed seed so the same articles are drawn (and the same bigram chart is
# produced) on every run
fake_sample = fake.sample(n=1000, random_state=5) #10,000
fake_text = text_process(fake_sample)

In [15]:
# All processed fake articles as one string (each article preceded by a space),
# built in a single pass instead of repeated string concatenation
text_all = "".join(" " + text for text in fake_text)

# Collect bigrams article by article so that the last word of one article is
# never paired with the first word of the next one
bigrm = [pair for text in fake_text for pair in nltk.bigrams(text.split())]
BigramFreq = collections.Counter(bigrm).most_common(10)
print(BigramFreq)

In [16]:
# Split the (bigram, frequency) pairs into bar labels ("word1 word2") and bar heights
bigrams = [" ".join(pair) for pair, freq in BigramFreq]
count = [freq for pair, freq in BigramFreq]

f, ax = plt.subplots(figsize=(10, 10))
plt.xticks(rotation=45)
sns.barplot(x=bigrams, y=count, ax=ax)

Now let's do this for the real articles.

In [17]:
# Fixed seed so the same articles are drawn (and the same bigram chart is
# produced) on every run
real_sample = real.sample(n=1000, random_state=5) #10,000
real_text = text_process(real_sample)

In [18]:
# All processed real articles as one string (each article preceded by a space),
# built in a single pass instead of repeated string concatenation
text_all_2 = "".join(" " + text for text in real_text)

# Collect bigrams article by article so that the last word of one article is
# never paired with the first word of the next one
bigrm2 = [pair for text in real_text for pair in nltk.bigrams(text.split())]
BigramFreq2 = collections.Counter(bigrm2).most_common(10)
print(BigramFreq2)

In [19]:
# Split the (bigram, frequency) pairs into bar labels ("word1 word2") and bar heights
bigrams2 = [" ".join(pair) for pair, freq in BigramFreq2]
count2 = [freq for pair, freq in BigramFreq2]

f, ax = plt.subplots(figsize=(10, 10))
plt.xticks(rotation=45)
sns.barplot(x=bigrams2, y=count2, ax=ax)

# 3. Modelling

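**I am going to be using a multinomial naive Bayes classifier. You could try this with a Gaussian naive Bayes classifier as well - however, multinomial naive Bayes is designed for discrete count features such as the word counts produced by `CountVectorizer`, whereas Gaussian naive Bayes assumes continuous, normally distributed features. That makes multinomial the better fit for classifying our articles into the two classes, Real and Fake news.**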

In [20]:
# Fixed seed so the modelling sample, and therefore the reported accuracy,
# is the same on every run
all_data_sample = all_data.sample(n=1000, random_state=5) #10,000
all_data_processed = text_process(all_data_sample)

In [21]:
# Bag-of-words features limited to max_features terms, English stop words removed
max_features = 1000
count_vectorizer = CountVectorizer(stop_words="english", max_features=max_features)
document_term_counts = count_vectorizer.fit_transform(all_data_processed)
# Despite its name, sparce_matrix holds the dense array produced by .toarray()
sparce_matrix = document_term_counts.toarray()

In [22]:
# Inspect the vectorised feature array
print(sparce_matrix)

In [23]:
# Select the label column by name: picking "the last column" by position would
# silently return the wrong data if the column order ever changed
y = all_data_sample["class"]
x = sparce_matrix

In [24]:
# Inspect the label values selected for the sampled articles
print(y)

In [25]:
# Hold out 10% of the sample for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.1,
    random_state=5,
)

In [26]:
# fit() returns the fitted estimator, so construction and training are chained
nb = MultinomialNB().fit(x_train, y_train)
y_pred = nb.predict(x_test)

In [27]:
# Convert the test labels to a NumPy array so they can be indexed by position.
# np.asarray leaves an existing array unchanged, so re-running this cell is
# safe, whereas calling .to_numpy() a second time would fail on the array.
y_test = np.asarray(y_test)

In [28]:
# Check the container types of the true and predicted labels
print(type(y_test))
print(type(y_pred))

In [29]:
# Print the true test labels followed by the model's predictions
print(y_test)
print(y_pred)

In [30]:
# Number of test articles whose predicted label equals the true label
count = sum(1 for actual, predicted in zip(y_test, y_pred) if actual == predicted)

accuracy_percent = (count/len(y_pred))* 100
print("Accuracy: " + str(accuracy_percent))

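**Accuracy is about 96% for multinomial naive Bayes on this 1,000-article sample (the exact figure depends on the random sample drawn) - this accuracy could likely be increased by choosing a larger sample size.**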