## Importing Packages

In [1]:
import numpy as np
import pandas as pd

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import confusion_matrix,accuracy_score


## Loading the Dataset

In [2]:
# Read the messages file; the path is relative, so 'Messages.csv' must sit in the working directory.
data = pd.read_csv('Messages.csv')
# Preview the first rows to see which columns the file contains.
data.head()

Unnamed: 0,Timestamp,Email Address,Message/Text,Real/Fake,Sentiment,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10
0,10/6/2020 22:58:57,sankalp.vyas007@gmail.com,*COVID-19*\n(COVID is NOW a COMMON FLU)\n\n*👉5...,Fake,Opinion,,,,,,
1,10/7/2020 12:55:41,salujarashi333@gmail.com,After announcing 21 days lock down...people hu...,Real,Opinion,,,,Real,Fake,Total
2,10/7/2020 12:56:32,salujarashi333@gmail.com,After 21 days India will beat Corona virus,Fake,Opinion,,,Rashi,5,67,72
3,10/7/2020 13:00:43,salujarashi333@gmail.com,I recommend Aarogya Setu app to fight against ...,Real,Opinion,,,Vaibhavi,532,29,561
4,10/7/2020 17:24:39,salujarashi333@gmail.com,Drinking green tea can cure Covid19,Fake,Opinion,,,Sakshi,15,41,56


In [3]:
# (rows, columns) of the raw frame.
data.shape

(737, 11)

In [4]:
# Keep only the message text and its label.
# Selecting the two wanted columns (instead of listing every column to drop)
# makes this cell safe to re-run and independent of how many stray
# 'Unnamed: N' columns the CSV happens to contain.
# .copy() yields an independent frame, so the later assignment to
# data['Real/Fake'] is unambiguous.
data = data.loc[:, ['Message/Text', 'Real/Fake']].copy()
data.head()

Unnamed: 0,Message/Text,Real/Fake
0,*COVID-19*\n(COVID is NOW a COMMON FLU)\n\n*👉5...,Fake
1,After announcing 21 days lock down...people hu...,Real
2,After 21 days India will beat Corona virus,Fake
3,I recommend Aarogya Setu app to fight against ...,Real
4,Drinking green tea can cure Covid19,Fake


The dataset contains two columns:-
    1. Message/Text : Describes the message entered by the user.
    2. Real/Fake : Describes if the message is real or fake.

## Exploratory Data Analysis

In [5]:
# Summary of the frame: index range, column dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 737 entries, 0 to 736
Data columns (total 2 columns):
Message/Text    737 non-null object
Real/Fake       737 non-null object
dtypes: object(2)
memory usage: 11.6+ KB


In [6]:
# Number of missing values in each column.
data.isna().sum()

Message/Text    0
Real/Fake       0
dtype: int64

In [7]:
# dtype of each column.
data.dtypes

Message/Text    object
Real/Fake       object
dtype: object

In [8]:
from sklearn import preprocessing

# Turn the textual Real/Fake label into integers.
# In the rows displayed below, Fake becomes 0 and Real becomes 1.
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(data['Real/Fake'])
data['Real/Fake'] = label_encoder.transform(data['Real/Fake'])

data.head()

Unnamed: 0,Message/Text,Real/Fake
0,*COVID-19*\n(COVID is NOW a COMMON FLU)\n\n*👉5...,0
1,After announcing 21 days lock down...people hu...,1
2,After 21 days India will beat Corona virus,0
3,I recommend Aarogya Setu app to fight against ...,1
4,Drinking green tea can cure Covid19,0


In [9]:
# Number of messages for each encoded label value.
data.groupby('Real/Fake').size()

Real/Fake
0    165
1    572
dtype: int64

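This shows that the data is imbalanced: 165 messages are labelled Fake (0) versus 572 labelled Real (1), so accuracy on its own can overstate how well a model separates the two classes.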

## NLP

Steps:
    1. Remove punctuation and numbers (every non-letter character is replaced by a space).
    2. Convert the text to lower case.
    3. Remove the stopwords.
    4. Apply stemming.
    5. Build a Bag of Words representation with CountVectorizer.

In [10]:
# Download the NLTK 'stopwords' package; the cleaning loop further down reads its English list.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rashi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Build the cleaned corpus: one normalised, stemmed string per message.
# The stemmer and the stopword set are the same for every message, so they are
# created once here instead of being rebuilt for every word inside the loop.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column itself rather than a hard-coded row count, so the
# loop stays correct if the number of rows in the dataset changes.
for message in data['Message/Text']:

    # Replace every non-letter character (punctuation, digits, ...) with a space.
    review = re.sub('[^a-zA-Z]', ' ', message)

    # Lower-case, then split on whitespace into tokens.
    review = review.lower().split()

    # Drop stopwords, stem the remaining tokens and join them back into one string.
    review = ' '.join(ps.stem(word) for word in review if word not in english_stopwords)

    corpus.append(review)
    
    
    
    

In [12]:
# Converting the list to a DataFrame; the single column gets the default label 0.
corpus_df = pd.DataFrame(corpus)
corpus_df.head()

Unnamed: 0,0
0,covid covid common flu drug day ivermectin mg ...
1,announc day lock peopl hurri get thing
2,day india beat corona viru
3,recommend aarogya setu app fight covid
4,drink green tea cure covid


In [13]:
# Give the single text column a descriptive name.
# rename() replaces the previous "copy the frame into a new column, then drop
# column 0" approach: the result is the same one-column frame, but re-running
# this cell is now a harmless no-op instead of a KeyError on the missing 0 column.
corpus_df = corpus_df.rename(columns={0: 'corpus'})
corpus_df.head()

Unnamed: 0,corpus
0,covid covid common flu drug day ivermectin mg ...
1,announc day lock peopl hurri get thing
2,day india beat corona viru
3,recommend aarogya setu app fight covid
4,drink green tea cure covid


In [14]:
# Bag of Words features, with the vocabulary capped at 1500 terms.
# NOTE(review): the vocabulary is fitted on every message, including the ones
# that later land in the test split - consider fitting on the training part only.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
# Dense array form is what the classifiers below are trained on.
X = term_counts.toarray()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [15]:
# Target vector of encoded labels. The column is selected by name rather than
# by position so a change in column order cannot silently pick the wrong column.
y = data['Real/Fake'].values

## Model Selection

In [16]:
# Splitting the data: 20% of the messages are held out for evaluation.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.20,random_state = 0)

# Candidate models. The tree-based ones are randomised, so they get a fixed
# random_state; without it their reported accuracy changes from run to run.
classifier = {'Logistic Regression':LogisticRegression(),
              'SVM':svm.SVC(kernel='linear'),
              'Naive Bayes':GaussianNB(),
              'KNN':KNeighborsClassifier(),
              'Decision Tree':DecisionTreeClassifier(random_state = 0),
              'Random Forest':RandomForestClassifier(random_state = 0)}

# Fit every candidate on the training split and report its accuracy on the test split.
for name, clf in classifier.items():
    clf.fit(X_train, y_train)
    ypred = clf.predict(X_test)
    print('Accuracy Score w.r.t ' + name + ' :' + str(accuracy_score(y_test,ypred)))

# `model` is what the prediction cells further down use. The Naive Bayes
# instance fitted in the loop is reused instead of training an identical one again.
model = classifier['Naive Bayes']
ypred = model.predict(X_test)
# Rows are the true labels, columns the predicted labels (0 = Fake, 1 = Real).
confusion_matrix(y_test,ypred)



Accuracy Score w.r.t Logistic Regression :0.8986486486486487
Accuracy Score w.r.t SVM :0.9054054054054054
Accuracy Score w.r.t Naive Bayes :0.8716216216216216
Accuracy Score w.r.t KNN :0.6283783783783784
Accuracy Score w.r.t Decision Tree :0.7905405405405406
Accuracy Score w.r.t Random Forest :0.8986486486486487




array([[ 18,  13],
       [  6, 111]], dtype=int64)

In [17]:
# Test accuracy of `model`; ypred was last assigned from model.predict(X_test).
accuracy_score(y_test,ypred)

0.8716216216216216

In [18]:
msg = "herbs can cure covid 19"

# The vectorizer was fitted on cleaned text (letters only, lower-cased,
# stopwords removed, Porter-stemmed). A new message has to go through the same
# steps, otherwise words such as plurals never match the stemmed vocabulary.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', msg).lower().split()
input1 = [' '.join(stemmer.stem(tok) for tok in tokens if tok not in stop_words)]

input_data = cv.transform(input1).toarray()

input_pred = model.predict(input_data)

# Label 1 is reported as Real, anything else as Fake.
if input_pred[0]==1:
    print("Msg is Real")
else:
    print("Msg is Fake")

Msg is Fake


In [19]:
msg = "oxford vaccine will be available by feb"

# The vectorizer was fitted on cleaned text (letters only, lower-cased,
# stopwords removed, Porter-stemmed). A new message has to go through the same
# steps, otherwise its words may never match the stemmed vocabulary.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', msg).lower().split()
input1 = [' '.join(stemmer.stem(tok) for tok in tokens if tok not in stop_words)]

input_data = cv.transform(input1).toarray()

input_pred = model.predict(input_data)

# Label 1 is reported as Real, anything else as Fake.
if input_pred[0]==1:
    print("Msg is Real")
else:
    print("Msg is Fake")

Msg is Real


In [20]:
msg = "In India there are only 2 covid patients"

# The vectorizer was fitted on cleaned text (letters only, lower-cased,
# stopwords removed, Porter-stemmed). A new message has to go through the same
# steps, otherwise its words may never match the stemmed vocabulary.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))
tokens = re.sub('[^a-zA-Z]', ' ', msg).lower().split()
input1 = [' '.join(stemmer.stem(tok) for tok in tokens if tok not in stop_words)]

input_data = cv.transform(input1).toarray()

input_pred = model.predict(input_data)

# Label 1 is reported as Real, anything else as Fake.
if input_pred[0]==1:
    print("Msg is Real")
else:
    print("Msg is Fake")

Msg is Fake


## By : Rashi Saluja