In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_root = '/kaggle/input'
for folder, _, names in os.walk(input_root):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read both CSV files; the first path feeds fake_data, the second true_data.
fake_data, true_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/fake-and-real-news-dataset/Fake.csv",
        "/kaggle/input/fake-and-real-news-dataset/True.csv",
    )
)

In [None]:
# Preview the first five rows read from Fake.csv.
fake_data.head(5)


In [None]:
# Preview the first five rows read from True.csv.
true_data.head(5)

# 1. *Data Preprocessing*


In [None]:
# Add the classification target in place: 0 marks rows from fake_data, 1 marks rows from true_data.
for frame, label in ((fake_data, 0), (true_data, 1)):
    frame["Label"] = label

In [None]:
# Work on a subset to keep the run fast: the first 2000 rows of each source frame
# (not a random sample). The full frames can be used instead at the cost of a longer run.
rows_per_class = 2000
class_samples = [frame.head(rows_per_class) for frame in (fake_data, true_data)]
# ignore_index=True renumbers the combined rows 0..n-1, fake rows first.
data = pd.concat(class_samples, axis=0, ignore_index=True)

In [None]:
# Preview the combined sample; rows taken from fake_data come first, so these carry Label 0.
data.head()

In [None]:
# (rows, columns) of the combined sample — at most 2000 rows from each source frame.
data.shape

In [None]:
# Distinct values of the subject column in the combined sample.
data["subject"].unique()

In [None]:
# Count missing values in each column of the combined sample.
data.isnull().sum()

In [None]:
# Drop the columns that are not used as model input.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this cell
# re-runnable: a second execution no longer raises KeyError for columns already removed.
data = data.drop(columns=["title", "subject", "date"], errors="ignore")

In [None]:
# Preview the frame after title, subject and date have been dropped.
data.head()

In [None]:
# Show the raw text of the row labelled 6 to see what an uncleaned article looks like.
data.loc[6, "text"]

In [None]:
# Separate the target (Label column) from the features (every other column).
y = data["Label"]
X = data.drop(columns=["Label"])

In [None]:
# Preview the feature frame (data with the Label column removed).
X.head()

In [None]:
# Preview the target Series taken from the Label column.
y.head()

In [None]:
# List the column names that remain in the feature frame.
X.columns

# 2. *Stemming and removing stop words*

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [None]:
# Build the cleaned corpus: one space-joined string of stemmed, non-stop-word tokens per article.
ps = PorterStemmer()

# Fetch the stop words once and keep them in a set. The previous version evaluated
# stopwords.words("english") for every single token and scanned the resulting list linearly,
# which dominated the runtime of this cell.
english_stopwords = set(stopwords.words("english"))

# Anything that is not an ASCII letter is replaced by a space before tokenising.
non_letters = re.compile("[^a-zA-Z]")

corpus = []
# Iterate over the column values directly (row order is preserved) instead of
# looking each row up by integer label.
for text in X["text"]:
    tokens = non_letters.sub(" ", text).lower().split()
    stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
    corpus.append(" ".join(stems))

# 3. *Word Vectorization*

In [None]:
# TF-IDF over uni-, bi- and tri-grams, keeping at most 5000 features.
# NOTE(review): the vectorizer is fitted on the whole corpus before the train/test split
# that follows, so the vocabulary and IDF weights are learned from test documents too.
vector = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)
tfidf_matrix = vector.fit_transform(corpus)
# X is rebound here: from the feature DataFrame to a dense NumPy array.
X = tfidf_matrix.toarray()

In [None]:
# Shape of the dense TF-IDF matrix; the column count is capped by max_features=5000.
X.shape

In [None]:
# Length of the label vector; it has to match the number of rows in X for the split that follows.
y.shape

# 4. *Model Building*

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.30, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Shape of the training portion produced by the 70/30 split.
X_train.shape

In [None]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training split.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
pred = classifier.predict(X_test)
score = metrics.accuracy_score(y_test, pred)

# 5. *Model Evaluation*

In [None]:
# Report test-set accuracy as a percentage.
accuracy_percent = score * 100
print("Accuracy of the model: {}%".format(accuracy_percent))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Confusion matrix on the held-out split (rows: true label, columns: predicted label).
# Passing labels explicitly keeps the row/column order aligned with the display labels.
cm = metrics.confusion_matrix(y_test, pred, labels=classifier.classes_)

# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# ConfusionMatrixDisplay is the supported replacement and plots the matrix computed above,
# which was previously calculated but never used.
metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=classifier.classes_).plot()
plt.show()

# *We see that the model has successfully classified the fake and real news with an accuracy of 98%*