## Necessary libraries

In [2]:
# Libraries
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

import pandas as pd
import time
import pickle
import gensim
import numpy as np
import nltk


from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from scipy import sparse
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_classification

## Handling Raw Corpus

In [58]:
data = pd.read_csv("data.tsv", sep='\t')
data.head()

Unnamed: 0,id,tweet,label
0,1,The CDC currently reports 99031 deaths. In gen...,real
1,2,States reported 1121 deaths a small rise from ...,real
2,3,Politically Correct Woman (Almost) Uses Pandem...,fake
3,4,#IndiaFightsCorona: We have 1524 #COVID testin...,real
4,5,Populous states can generate large case counts...,real


In [59]:
data.tail()

Unnamed: 0,id,tweet,label
6416,6417,???Autopsies prove that COVID-19 is??� a blood...,fake
6417,6418,_A post claims a COVID-19 vaccine has already ...,fake
6418,6419,Aamir Khan Donate 250 Cr. In PM Relief Cares Fund,fake
6419,6420,It has been 93 days since the last case of COV...,real
6420,6421,The House Democratic Caucus holds a moment of ...,real


In [60]:
import string

In [61]:
# def remove_punctuation(text):
#     punctuationfree="".join([i for i in text if i not in string.punctuation])
#     return punctuationfree

# # Removing punctuations
# data['tweet']= data['tweet'].apply(lambda x:remove_punctuation(x))


# # Converting text into lower case
# data['tweet']= data['tweet'].apply(lambda x: x.lower())


# def remove_stopwords(text):
#     text = text.split(" ")
#     output= " ".join([i for i in text if i not in stopwords])
#     return output

# stopwords = nltk.corpus.stopwords.words('english')

# # Removing stopwords
# for i in range(len(data)):
#     data["tweet"][i] = remove_stopwords(data["tweet"][i])
    

# # Removing urls.....
# for i in range(len(data)):
#     temp = data["tweet"][i].split(" ")
#     if temp[-1].startswith("http"):
#         temp.pop()
#     data["tweet"][i] = " ".join(temp)

In [62]:
x = data['tweet']
y = data['label']

In [63]:
from sklearn.model_selection import train_test_split
train_x,test_x,train_y,test_y = train_test_split(x,y,test_size=0.3,random_state=53)

In [64]:
print(train_y.value_counts())

# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = train_y.astype('str')
test_y = test_y.astype('str')
train_y = encoder.fit_transform(train_y)
test_y = encoder.fit_transform(test_y)

classes = list(encoder.classes_)
print(len(classes))

real    2333
fake    2161
Name: label, dtype: int64
2


## Model Traning

In [65]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)

    if is_neural_net:
        predictions = predictions.argmax(axis=-1)

    
    acc = metrics.accuracy_score(predictions, test_y)
    f1 = metrics.f1_score(predictions, test_y, average='weighted')
    #print(classification_report(predictions, test_y, target_names = list(encoder.classes_)))
    return acc, f1, classifier

# Boosting Model

## Count Vectorizer

In [66]:
%%time
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(pd.concat([train_x, test_x], axis=0))

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.fit_transform(train_x)
xvalid_count =  count_vect.transform(test_x)
xtrain_count.shape, xvalid_count.shape

Wall time: 665 ms


((4494, 14833), (1927, 14833))

In [67]:
%%time
# Extereme Gradient Boosting on Count Vectors
accuracy, f1_score, classifier = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
# print("Xgb, Count Vectors: ", accuracy, f1_score)
print("Accuracy Score: {}".format(accuracy))
print("f1 Score: {}".format(f1_score))

# Save the model to output folder
filename = 'count_vec.pkl'
with open(filename, 'wb') as fp:
    pickle.dump(classifier, fp)

Accuracy Score: 0.9231966787752984
f1 Score: 0.9231675321926582
Wall time: 1.62 s


In [68]:
del xtrain_count
del xvalid_count
del count_vect

## Word level TF-IDF

In [69]:
%%time
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern = r'\w{1,}', max_features=5000)
tfidf_vect.fit(pd.concat([train_x, test_x], axis=0))
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(test_x)

Wall time: 709 ms


In [70]:
%%time
# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy, f1_score, classifier = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
# print("Xgb, WordLevel TF-IDF: ", accuracy, f1_score)
print("Accuracy Score: {}".format(accuracy))
print("f1 Score: {}".format(f1_score))

# Save the model to output folder
filename = 'word-tfidf.pkl'
with open(filename, 'wb') as fp:
    pickle.dump(classifier, fp)

Accuracy Score: 0.915412558380903
f1 Score: 0.9154155332432391
Wall time: 3.71 s


In [71]:
del xtrain_tfidf
del xvalid_tfidf
del tfidf_vect

## Ngram word Level TF-iDF

In [72]:
%%time
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', ngram_range=(2,3), max_features=25000)
tfidf_vect_ngram.fit(pd.concat([train_x, test_x], axis=0))
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(test_x)

Wall time: 2.59 s


In [73]:
%%time
# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy, f1_score, classifier = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram.tocsc(), train_y, xvalid_tfidf_ngram.tocsc())
# print("Xgb, Ngram WordLevel Vectors: ", accuracy, f1_score)
print("Accuracy Score: {}".format(accuracy))
print("f1 Score: {}".format(f1_score))

# Save the model to output folder
filename = 'ngram-tfidf.pkl'
with open(filename, 'wb') as fp:
    pickle.dump(classifier, fp)

Accuracy Score: 0.8401660612350804
f1 Score: 0.8401011489825361
Wall time: 2.66 s


In [74]:
del xtrain_tfidf_ngram
del xvalid_tfidf_ngram
del tfidf_vect_ngram

## Char Level TF-IDF

In [75]:
%%time
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', ngram_range=(2,3), max_features=25000)
tfidf_vect_ngram_chars.fit(pd.concat([train_x, test_x], axis=0))
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(test_x) 

Wall time: 4.22 s


In [76]:
%%time
# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy, f1_score, classifier = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
# print("Xgb, CharLevel Vectors: ", accuracy, f1_score)
print("Accuracy Score: {}".format(accuracy))
print("f1 Score: {}".format(f1_score))

# Save the model to output folder
filename = 'char-tfidf.pkl'
with open(filename, 'wb') as fp:
    pickle.dump(classifier, fp)

Accuracy Score: 0.9538142189932538
f1 Score: 0.9538064754702225
Wall time: 32 s


In [77]:
# tweet = ["China president xi jinping visited masjid and request Muslims for dua in present crisis country going through.we need your help."]
tweet = ["The CDC currently reports 99031 deaths. In general the discrepancies in death counts between different sources are small and explicable. The death toll stands at roughly 100000 people today."]

chars =  tfidf_vect_ngram_chars.transform(tweet)
result = classifier.predict(chars)[0]
if result==0:
    print("Fake")
else:
    print("Real")

Real
