In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix 
from sklearn.metrics import classification_report 

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

In [3]:
data = pd.read_csv('Clean Data.csv')
data.head()

Unnamed: 0,Tweets,target(fake=0)
0,good news covafirst vaccine get approval human...,1
1,country first indigenous corona vaccine covade...,1
2,india first corona vaccine candidate cova set ...,1
3,anildeshmukhncp pypayurved bought untested hom...,1
4,mohap announces new corona case recovery,1


In [4]:
vec = TfidfVectorizer(ngram_range=(1, 2))
x = vec.fit_transform(data.iloc[:,0])
x

<3067x31409 sparse matrix of type '<class 'numpy.float64'>'
	with 77472 stored elements in Compressed Sparse Row format>

In [5]:
x_train , x_test , y_train , y_test = train_test_split(x , data.iloc[:,-1] , test_size=200 , random_state=42 , stratify=data.iloc[:,-1].values)

In [6]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(2867, 31409)
(2867,)
(200, 31409)
(200,)


### Gaussian NB

In [7]:
clf = GaussianNB()
clf.fit(x_train.toarray() , y_train)
pred = clf.predict(x_test.toarray())

In [8]:
confusion_matrix(y_test , pred)

array([[105,  21],
       [ 24,  50]], dtype=int64)

In [9]:
print(classification_report(y_test , pred))

              precision    recall  f1-score   support

           0       0.81      0.83      0.82       126
           1       0.70      0.68      0.69        74

    accuracy                           0.78       200
   macro avg       0.76      0.75      0.76       200
weighted avg       0.77      0.78      0.77       200



### Logistic Regression

In [10]:
clf = LogisticRegression()
clf.fit(x_train , y_train)
pred = clf.predict(x_test)

In [11]:
confusion_matrix(y_test , pred)

array([[122,   4],
       [ 47,  27]], dtype=int64)

In [12]:
print(classification_report(y_test , pred))

              precision    recall  f1-score   support

           0       0.72      0.97      0.83       126
           1       0.87      0.36      0.51        74

    accuracy                           0.74       200
   macro avg       0.80      0.67      0.67       200
weighted avg       0.78      0.74      0.71       200



### Decision Tree

In [13]:
clf = DecisionTreeClassifier()
clf.fit(x_train , y_train)
pred = clf.predict(x_test)

In [14]:
confusion_matrix(y_test , pred)

array([[105,  21],
       [ 31,  43]], dtype=int64)

In [15]:
print(classification_report(y_test , pred))

              precision    recall  f1-score   support

           0       0.77      0.83      0.80       126
           1       0.67      0.58      0.62        74

    accuracy                           0.74       200
   macro avg       0.72      0.71      0.71       200
weighted avg       0.73      0.74      0.74       200



### Voting Ensemble

In [16]:
clf1 = GaussianNB()
clf2 = LogisticRegression()
clf3 = DecisionTreeClassifier()

In [17]:
votes = VotingClassifier(estimators=[('nb', clf1), ('lr', clf2), ('dt', clf3)], voting='hard', n_jobs=-1)
votes.fit(x_train.toarray() , y_train)
pred = votes.predict(x_test.toarray())

In [18]:
confusion_matrix(y_test , pred)

array([[119,   7],
       [ 38,  36]], dtype=int64)

In [19]:
print(classification_report(y_test , pred))

              precision    recall  f1-score   support

           0       0.76      0.94      0.84       126
           1       0.84      0.49      0.62        74

    accuracy                           0.78       200
   macro avg       0.80      0.72      0.73       200
weighted avg       0.79      0.78      0.76       200

