# Setup

In [1]:
# import dependencies
import numpy as np
import pandas as pd
#import pickle
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from xgboost import XGBClassifier
import spacy
# Install the medium English spaCy model (pinned to the 3.3.0 release archive) directly from
# the spacy-models GitHub releases; `&> /dev/null` discards pip's console output.
!pip3 install https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0.tar.gz &> /dev/null
# Load the installed pipeline; `nlp` is used further down to turn each tweet into a document vector.
nlp = spacy.load("en_core_web_md") # md: reduced word vector table with 20k unique vectors for ~500k words

# custom functions
from functions import clean_text, print_metrics, plot_confusion_matrix

[nltk_data] Downloading package stopwords to /Users/steve/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/steve/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# pin NumPy's global random state so stochastic steps repeat across runs
SEED = 42
np.random.seed(SEED)

In [3]:
# load & clean data
DATA_URL = 'https://raw.githubusercontent.com/smkerr/COVID-fake-news-detection/main/data/original-data/'
MAX_TWEET_CHARS = 280  # posts longer than this many characters are dropped


def load_split(filename, max_chars=MAX_TWEET_CHARS):
    """Read one CSV split and return ``(raw, clean)`` data frames.

    Parameters
    ----------
    filename : str
        File name below ``DATA_URL`` (e.g. ``'Constraint_Train.csv'``).
    max_chars : int, optional
        Maximum allowed length of the ``tweet`` column; longer posts are removed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The frame exactly as read, and the filtered frame with over-long posts and
        duplicate rows removed.
    """
    raw = pd.read_csv(DATA_URL + filename, header=0)
    # .str.len() yields NaN for missing tweets, so such rows are filtered out
    # instead of crashing the way len(NaN) would
    within_limit = raw["tweet"].str.len() <= max_chars
    # reset the index so row labels stay 0..n-1 and line up with the positional
    # arrays (encoded labels, feature matrices, predictions) built later
    clean = raw[within_limit].drop_duplicates().reset_index(drop=True)
    return raw, clean


train, train_clean = load_split('Constraint_Train.csv')
X_train, y_train = train_clean["tweet"], train_clean["label"]

val, val_clean = load_split('Constraint_Val.csv')
X_val, y_val = val_clean["tweet"], val_clean["label"]

test, test_clean = load_split('Constraint_Test.csv')
X_test, y_test = test_clean["tweet"], test_clean["label"]

# Pre-processing

In [4]:
# run every tweet of each split through the project's clean_text() helper
X_train, X_val, X_test = (
    tweets.map(clean_text) for tweets in (X_train, X_val, X_test)
)

In [5]:
# initialize label encoder
label_encoder = preprocessing.LabelEncoder()

# encode 'fake' as 0 and 'real' as 1 to make target variables machine-readable.
# The mapping is learned from the training labels only and then reused for the
# validation and test labels, so all three splits are guaranteed to share the same
# encoding (re-fitting per split would silently change the mapping if a split were
# missing a class).
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

# Feature Extraction


## Word Embeddings

In [6]:
# generate one document vector per tweet
def embed_tweets(texts):
    """Embed each text with the loaded spaCy pipeline.

    Parameters
    ----------
    texts : iterable of str
        Cleaned tweets of one split.

    Returns
    -------
    pandas.DataFrame
        One row per tweet and one column per vector dimension (the original notes
        state 300 dimensions for this model), in the same order as ``texts``.
    """
    # nlp.pipe streams the texts through the pipeline in batches instead of
    # invoking it once per tweet
    vectors = [doc.vector for doc in nlp.pipe(texts)]
    # stack the 1-D vectors into a single (n_tweets, n_dims) array
    return pd.DataFrame(np.vstack(vectors))


## train set
tweet2vec_train = embed_tweets(X_train)

## validation set
tweet2vec_val = embed_tweets(X_val)

## test set
tweet2vec_test = embed_tweets(X_test)

## Bag-of-Words (BoW)

In [7]:
# bag-of-words counts over unigrams and bigrams
cv = CountVectorizer(ngram_range=(1, 2))

# learn the vocabulary from the training tweets and vectorize them in one step
cv_train = cv.fit_transform(X_train.values)

# validation and test tweets are only transformed with the train vocabulary
cv_val, cv_test = cv.transform(X_val.values), cv.transform(X_test.values)

In [8]:
# give the embedding columns a "word2vec_" prefix so they cannot clash with the
# TF-IDF columns when the feature frames are joined
for embedding_frame in (tweet2vec_train, tweet2vec_val, tweet2vec_test):
    # one name per existing column, numbered by position
    word2vec_col = [f"word2vec_{position}" for position in range(embedding_frame.shape[1])]
    embedding_frame.columns = word2vec_col

## TF-IDF

In [9]:
# TF-IDF
tfidf = TfidfTransformer()

# learn the IDF weights from the training counts only, then weight the training counts
tfidf_train = tfidf.fit_transform(cv_train)

# validation and test counts are re-weighted with the IDF learned on train.
# Re-fitting on each split would give every split its own IDF weights, making the
# features inconsistent with what the classifiers were trained on and leaking
# held-out statistics into the features.
tfidf_val = tfidf.transform(cv_val)
tfidf_test = tfidf.transform(cv_test)

In [10]:
# densify each sparse TF-IDF matrix and wrap it in a data frame (train, validation, test)
tfidf_train, tfidf_val, tfidf_test = (
    pd.DataFrame(sparse_matrix.toarray())
    for sparse_matrix in (tfidf_train, tfidf_val, tfidf_test)
)

In [11]:
# give the TF-IDF columns a "tfidf_" prefix so they cannot clash with the
# embedding columns when the feature frames are joined
for tfidf_frame in (tfidf_train, tfidf_val, tfidf_test):
    # one name per existing column, numbered by position
    tfidf_col = [f"tfidf_{position}" for position in range(tfidf_frame.shape[1])]
    tfidf_frame.columns = tfidf_col

## Combine Features

In [12]:
# build the final feature matrices: TF-IDF columns followed by the embedding columns,
# matched row by row on the index (train, validation, test)
X_train, X_val, X_test = (
    lexical.join(semantic, how="left")
    for lexical, semantic in (
        (tfidf_train, tweet2vec_train),
        (tfidf_val, tweet2vec_val),
        (tfidf_test, tweet2vec_test),
    )
)

# Model Training

In [13]:
# load pickled classifiers from file
#with open("pickle_svm_clf.pkl", 'rb') as file:
#    pickle_svm_clf = pickle.load(file)
#
#with open("pickle_lg_clf.pkl", 'rb') as file:
#    pickle_lr_clf = pickle.load(file)
#    
#with open("pickle_xgb_clf.pkl", 'rb') as file:
#    pickle_xgb_clf = pickle.load(file)
#    
#with open("pickle_ada_clf.pkl", 'rb') as file:
#    pickle_ada_clf = pickle.load(file)

## #1 SVM

In [14]:
# make predictions
#pickle_svm_pred_val = pickle_svm_clf.predict(X_val)
#pickle_svm_pred_test = pickle_svm_clf.predict(X_test)

In [None]:
# linear-kernel SVM with balanced class weights; probability=True turns on
# probability estimates, which the soft-voting ensemble further down asks for
svm_clf = SVC(C=10, kernel='linear', class_weight='balanced', probability=True)

# single-step pipeline: features are already extracted, so only the classifier remains
svm_pipeline = Pipeline(steps=[('c', svm_clf)])

# train on the combined training features
fit = svm_pipeline.fit(X_train, y_train)

# predict the validation and test splits
svm_pred_val, svm_pred_test = (
    svm_pipeline.predict(features) for features in (X_val, X_test)
)

## #2 Logistic Regression

In [None]:
# make predictions
#pickle_lr_pred_val = pickle_lr_clf.predict(X_val)
#pickle_lr_pred_test = pickle_lr_clf.predict(X_test)

In [None]:
# logistic regression fitted with the SAGA solver for up to 1000 iterations.
# NOTE(review): the string 'none' is the spelling older scikit-learn releases use for
# "no regularisation"; recent releases expect penalty=None — confirm against the
# installed version.
lr_clf = LogisticRegression(penalty='none', solver='saga', max_iter=1000)

# single-step pipeline: features are already extracted, so only the classifier remains
lr_pipeline = Pipeline(steps=[('c', lr_clf)])

# train on the combined training features
fit = lr_pipeline.fit(X_train, y_train)

# predict the validation and test splits
lr_pred_val, lr_pred_test = (
    lr_pipeline.predict(features) for features in (X_val, X_test)
)

## #3 Extreme Gradient Boost

In [None]:
# make predictions
#pickle_xgb_pred_val = pickle_xgb_clf.predict(X_val)
#pickle_xgb_pred_test = pickle_xgb_clf.predict(X_test)

In [None]:
# gradient-boosted trees: 550 shallow trees (depth 3), learning rate 0.15,
# each tree grown on an 85% row subsample
xgb_clf = XGBClassifier(
    n_estimators=550,
    eta=0.15,
    max_depth=3,
    min_child_weight=3,
    subsample=0.85,
)

# single-step pipeline: features are already extracted, so only the classifier remains
xgb_pipeline = Pipeline(steps=[('c', xgb_clf)])

# train on the combined training features
fit = xgb_pipeline.fit(X_train, y_train)

# predict the validation and test splits
xgb_pred_val, xgb_pred_test = (
    xgb_pipeline.predict(features) for features in (X_val, X_test)
)

## #4 Adaptive Boost

In [None]:
# make predictions
#pickle_ada_pred_val = pickle_ada_clf.predict(X_val)
#pickle_ada_pred_test = pickle_ada_clf.predict(X_test)

In [None]:
# AdaBoost ensemble of 500 estimators with learning rate 1.0
ada_clf = AdaBoostClassifier(learning_rate=1.0, n_estimators=500)

# single-step pipeline: features are already extracted, so only the classifier remains
ada_pipeline = Pipeline(steps=[('c', ada_clf)])

# train on the combined training features
fit = ada_pipeline.fit(X_train, y_train)

# predict the validation and test splits
ada_pred_val, ada_pred_test = (
    ada_pipeline.predict(features) for features in (X_val, X_test)
)

## #5 Voting Classifier

In [None]:
# make predictions
#pickle_voting_pred_val = pickle_voting_clf.predict(X_val)
#pickle_voting_pred_test = pickle_voting_clf.predict(X_test)

In [None]:
# (label, pipeline) pairs for the individual models that feed the voting ensemble;
# dict insertion order fixes the order of the resulting list
named_estimators = list({
    "SVM": svm_pipeline,
    "Logistic Regression": lr_pipeline,
    "XGBoost": xgb_pipeline,
    "AdaBoost": ada_pipeline,
}.items())

In [None]:
# soft-voting ensemble: the predicted class is the argmax of the summed class
# probabilities of the individual models
voting_clf = VotingClassifier(estimators=named_estimators, voting="soft")

# train the ensemble on the combined training features
voting_clf.fit(X_train, y_train)

# predict the validation and test splits
voting_pred_val, voting_pred_test = (
    voting_clf.predict(features) for features in (X_val, X_test)
)

## Pickled Classifiers

In [None]:
#import pickle
##mount drive to save data
#from google.colab import drive
#drive.mount('drive')

In [None]:
##save SVM
#pkl_filename = "pickle_svm_clf.pkl"
#with open(pkl_filename, 'wb') as file:
#    pickle.dump(svm_clf, file)
#
## Load from file
##with open(pkl_filename, 'rb') as file:
##    pickle_svm_clf = pickle.load(file)
#
##use pickle_svm_clf as model for unseen data

In [None]:
## Load from file
#pkl_filename = "pickle_lg_clf.pkl"
#with open(pkl_filename, 'rb') as file:
#    pickle_lg_clf = pickle.load(file)
#
##use pickle_svm_clf as model for unseen data

In [None]:
##save LG
#pkl_filename = "pickle_lg_clf.pkl"
#with open(pkl_filename, 'wb') as file:
#    pickle.dump(lr_clf, file)
#
## Load from file
##with open(pkl_filename, 'rb') as file:
##    pickle_svm_clf = pickle.load(file)
#
##use pickle_svm_clf as model for unseen data

In [None]:
##save XGB
#pkl_filename = "pickle_xgb_clf.pkl"
#with open(pkl_filename, 'wb') as file:
#    pickle.dump(xgb_clf, file)
#
## Load from file
##with open(pkl_filename, 'rb') as file:
##    pickle_svm_clf = pickle.load(file)
#
##use pickle_svm_clf as model for unseen data

In [None]:
##save svm
#pkl_filename = "pickle_svm_clf.pkl"
#with open(pkl_filename, 'wb') as file:
#    pickle.dump(svm_clf, file)
#
## Load from file
#with open(pkl_filename, 'rb') as file:
#    pickle_svm_clf = pickle.load(file)
#
##use pickle_svm_clf as model for unseen data

# Evaluation

## #1 SVM

In [None]:
# validation set
# display results for the SVM: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(svm_pred_val, y_val)
plot_confusion_matrix(
    confusion_matrix(y_val, svm_pred_val),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of SVM on val data',
)

In [None]:
# test set
# display results for the SVM: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(svm_pred_test, y_test)
plot_confusion_matrix(
    confusion_matrix(y_test, svm_pred_test),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of SVM on test data',
)

## #2 Logistic Regression

In [None]:
# validation set
# display results for logistic regression: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(lr_pred_val, y_val)
plot_confusion_matrix(
    confusion_matrix(y_val, lr_pred_val),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of LR on val data',
)

In [None]:
# test set
# display results for logistic regression: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(lr_pred_test, y_test)
plot_confusion_matrix(
    confusion_matrix(y_test, lr_pred_test),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of LR on test data',
)

## #3 Extreme Gradient Boost

In [None]:
# validation set
# display results for XGBoost: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(xgb_pred_val, y_val)
plot_confusion_matrix(
    confusion_matrix(y_val, xgb_pred_val),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of XGB on val data',
)

In [None]:
# test set
# display results for XGBoost: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(xgb_pred_test, y_test)
plot_confusion_matrix(
    confusion_matrix(y_test, xgb_pred_test),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of XGB on test data',
)

## #4 Adaptive Boost

In [None]:
# validation set
# display results for AdaBoost: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(ada_pred_val, y_val)
plot_confusion_matrix(
    confusion_matrix(y_val, ada_pred_val),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of Ada on val data',
)

In [None]:
# test set
# display results for AdaBoost: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(ada_pred_test, y_test)
plot_confusion_matrix(
    confusion_matrix(y_test, ada_pred_test),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of Ada on test data',
)

## #5 Voting Classifier

In [None]:
# validation set
# display results for the voting ensemble: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(voting_pred_val, y_val)
plot_confusion_matrix(
    confusion_matrix(y_val, voting_pred_val),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of Ensemble on val data',
)

In [None]:
# test set
# display results for the voting ensemble: metrics first, then the confusion matrix
# (rows/columns labelled fake/real, raw counts rather than normalized rates)
print_metrics(voting_pred_test, y_test)
plot_confusion_matrix(
    confusion_matrix(y_test, voting_pred_test),
    target_names=['fake', 'real'],
    normalize=False,
    title='Confusion matrix of Ensemble on test data',
)

# Error Analysis

In [None]:
# flag the validation rows where the SVM prediction disagrees with the true label
svm_val_errors = svm_pred_val != y_val

# keep only those rows.
# NOTE(review): at this point X_val holds the combined TF-IDF + embedding features,
# so the selected rows are feature vectors, not the tweet text — confirm that is intended.
svm_val_misclass_df = X_val.loc[svm_val_errors]

# summarise the resulting frame
svm_val_misclass_df.info()

In [None]:
# error analysis: compare the false classifications of the different models
# on the validation set.
#
# Rows are taken from val_clean (the filtered frame the features and y_val were built
# from), so the boolean masks below have exactly one entry per row. y_val and the
# *_pred_val arrays both hold encoded labels, which makes them directly comparable.
# There is no gradient-boosting model besides XGBoost in this notebook, so no separate
# "gb" frame is produced.
false_pred_svm = val_clean[y_val != svm_pred_val]
false_pred_lr = val_clean[y_val != lr_pred_val]
false_pred_xg = val_clean[y_val != xgb_pred_val]
false_pred_ada = val_clean[y_val != ada_pred_val]
false_pred_ensemble = val_clean[y_val != voting_pred_val]

print(false_pred_ensemble)



In [None]:
import os

# tweets misclassified by BOTH the SVM and AdaBoost (inner join on the tweet text)
common_mistakes = pd.merge(false_pred_svm, false_pred_ada, on=['tweet'], how='inner')
print(common_mistakes)

# Export to Google Drive. Drive has to be mounted before anything can be written below
# /content/drive, so mount it here when it is not available yet (Colab only), and make
# sure the target folder exists.
error_analysis_dir = '/content/drive/MyDrive/Final Project/COVID-fake-news-detection/model/error_analysis'
if not os.path.isdir('/content/drive/MyDrive'):
    from google.colab import drive
    drive.mount('/content/drive')
os.makedirs(error_analysis_dir, exist_ok=True)
common_mistakes.to_csv(os.path.join(error_analysis_dir, 'common_mistakes.csv'), encoding='utf-8')

In [None]:
# Mount Google Drive (Colab only) at the relative path 'drive' so results can be saved there.
# NOTE(review): this is the last cell of the notebook, yet an earlier cell already writes
# below /content/drive — consider moving the mount (and this import) to the setup section.
from google.colab import drive
drive.mount('drive')