In [2]:
import numpy as np
import pandas as pd
import nltk
import re
import itertools
import matplotlib.pyplot as plt

## nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import pos_tag

from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, f1_score, precision_score, recall_score, plot_confusion_matrix

[nltk_data] Downloading package stopwords to /Users/marco/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train = pd.read_csv('https://raw.githubusercontent.com/smkerr/COVID-fake-news-detection/main/data/Constraint_Train.csv')
val = pd.read_csv('https://raw.githubusercontent.com/smkerr/COVID-fake-news-detection/main/data/Constraint_Val.csv')

In [4]:
# define print_metrics() function to print results
def print_metrics(pred,true):
    print(confusion_matrix(true,pred))
    print(classification_report(true,pred,))
    print("Accuracy : ",accuracy_score(pred,true))
    print("Precison : ",precision_score(pred,true, average = 'weighted'))
    print("Recall : ",recall_score(pred,true,  average = 'weighted'))
    print("F1 : ",f1_score(pred,true,  average = 'weighted'))

In [5]:
stop_words = set(stopwords.words("english")) # create list of stop words

def cleantext(string):
    text = string.lower().split() # convert letters to lowercase and split claims ('tweets') into separate words
    text = " ".join(text) # ???
    text = re.sub(r"http(\S)+",' ',text)    
    text = re.sub(r"www(\S)+",' ',text)
    text = re.sub(r"&",' and ',text) # replace '&' with ' and '
    text = text.replace('&amp',' ') # replace '&amp' with empty space
    text = re.sub(r"[^0-9a-zA-Z]+",' ',text) # replace non-letters, non-numbers, & punctuation with empty space
    text = text.split() # ??? 
    #text = [ps.stem(word) for word in text] # set text equal to stemmed words
    text = [w for w in text if not w in stop_words] # remove stop words 
    text = " ".join(text) # joining the seperated pieces of string into one string
    return text

# apply cleantext() function to all tweets 
train['tweet'] = train['tweet'].map(lambda x: cleantext(x))
val['tweet'] = val['tweet'].map(lambda x: cleantext(x))
#test['tweet'] = test['tweet'].map(lambda x: cleantext(x))

In [39]:
train['tweet'].head(10)

0    cdc currently report 99031 death general discr...
1    state reported 1121 death small rise last tues...
2    politically correct woman almost us pandemic e...
3    indiafightscorona 1524 covid testing laborator...
4    populous state generate large case count look ...
5    covid act found average person illinois covid ...
6    tested positive covid19 symptom stay home away...
7    obama call trump coronavirus response chaotic ...
8    clearly obama administration leave kind game p...
9    retraction hydroxychloroquine chloroquine with...
Name: tweet, dtype: object

In [60]:
# XGBoost Classifier

# create pipeline
pipeline = Pipeline([
        ('bow', CountVectorizer(ngram_range=(1, 1))), # count term frequency
        ('tfidf', TfidfTransformer()), # downweight words which appear frequently
        ('c', XGBClassifier(max_depth=20, min_child_weight=2)) # classifier
    ])

fit = pipeline.fit(train['tweet'],train['label']) # train model
print('XGBoost')
print ('val:')
pred = pipeline.predict(val['tweet']) # make predictions

# display results
print_metrics(pred,val['label'])
#plot_confusion_matrix(confusion_matrix(val['label'],pred),target_names=['fake','real'], normalize = False, \
#                      title = 'Confusion matix of DT on val data')



XGBoost
val:
[[ 932   88]
 [ 100 1020]]
              precision    recall  f1-score   support

        fake       0.90      0.91      0.91      1020
        real       0.92      0.91      0.92      1120

    accuracy                           0.91      2140
   macro avg       0.91      0.91      0.91      2140
weighted avg       0.91      0.91      0.91      2140

Accuracy :  0.9121495327102803
Precison :  0.9121664179690567
Recall :  0.9121495327102803
F1 :  0.9121292411495973


In [44]:
#SVM
from sklearn.svm import LinearSVC
# create pipeline
pipeline = Pipeline([
        ('bow', CountVectorizer(ngram_range=(1, 2))), # count term frequency
        ('tfidf', TfidfTransformer()), # downweight words which appear frequently
        ('c', LinearSVC()) # classifier
    ])

fit = pipeline.fit(train['tweet'],train['label']) # train model
print('XGBoost')
print ('val:')
pred = pipeline.predict(val['tweet']) # make predictions

# display results
print_metrics(pred,val['label'])
#plot_confusion_matrix(confusion_matrix(val['label'],pred),target_names=['fake','real'], normalize = False, \
#                      title = 'Confusion matix of DT on val data')

XGBoost
val:
[[ 957   63]
 [  72 1048]]
              precision    recall  f1-score   support

        fake       0.93      0.94      0.93      1020
        real       0.94      0.94      0.94      1120

    accuracy                           0.94      2140
   macro avg       0.94      0.94      0.94      2140
weighted avg       0.94      0.94      0.94      2140

Accuracy :  0.9369158878504673
Precison :  0.9369264902222572
Recall :  0.9369158878504673
F1 :  0.9369045856666006


In [45]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
nltk.download('wordnet')

stop_words = set(stopwords.words("english")) # create list of stop words

def cleantext(string):
    text = string.lower().split() # convert letters to lowercase and split claims ('tweets') into separate words
    text = " ".join(text) # ???
    text = re.sub(r"http(\S)+",' ',text)    
    text = re.sub(r"www(\S)+",' ',text)
    text = re.sub(r"&",' and ',text) # replace '&' with ' and '
    text = text.replace('&amp',' ') # replace '&amp' with empty space
    text = re.sub(r"[^0-9a-zA-Z]+",' ',text) # replace non-letters, non-numbers, & punctuation with empty space
    text = text.split() # ??? 
    #text = [ps.stem(word) for word in text] # set text equal to stemmed words
    text = [w for w in text if not w in stop_words] # Lemmatization
    text = [lemmatizer.lemmatize(w) for w in text] # remove stop words 
    text = " ".join(text) # joining the seperated pieces of string into one string
    return text

# apply cleantext() function to all tweets 
train['tweet'] = train['tweet'].map(lambda x: cleantext(x))
val['tweet'] = val['tweet'].map(lambda x: cleantext(x))
#test['tweet'] = test['tweet'].map(lambda x: cleantext(x))

[nltk_data] Downloading package wordnet to /Users/marco/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [46]:
train['tweet'].head(10)

0    cdc currently report 99031 death general discr...
1    state reported 1121 death small rise last tues...
2    politically correct woman almost u pandemic ex...
3    indiafightscorona 1524 covid testing laborator...
4    populous state generate large case count look ...
5    covid act found average person illinois covid ...
6    tested positive covid19 symptom stay home away...
7    obama call trump coronavirus response chaotic ...
8    clearly obama administration leave kind game p...
9    retraction hydroxychloroquine chloroquine with...
Name: tweet, dtype: object

In [62]:
# XGBoost Classifier

# create pipeline
pipeline = Pipeline([
        ('bow', CountVectorizer(ngram_range=(1, 1))), # count term frequency
        ('tfidf', TfidfTransformer()), # downweight words which appear frequently
        ('c', XGBClassifier(max_depth=20, min_child_weight=2)) # classifier
    ])

fit = pipeline.fit(train['tweet'],train['label']) # train model
print('XGBoost')
print ('val:')
pred = pipeline.predict(val['tweet']) # make predictions

# display results
print_metrics(pred,val['label'])
#plot_confusion_matrix(confusion_matrix(val['label'],pred),target_names=['fake','real'], normalize = False, \
#                      title = 'Confusion matix of DT on val data')



XGBoost
val:
[[ 932   88]
 [ 100 1020]]
              precision    recall  f1-score   support

        fake       0.90      0.91      0.91      1020
        real       0.92      0.91      0.92      1120

    accuracy                           0.91      2140
   macro avg       0.91      0.91      0.91      2140
weighted avg       0.91      0.91      0.91      2140

Accuracy :  0.9121495327102803
Precison :  0.9121664179690567
Recall :  0.9121495327102803
F1 :  0.9121292411495973


In [63]:
# XGBoost Classifier

params = {
          'max_depth': 20,
          'min_child_weight': 2,
         }

# create pipeline
pipeline = Pipeline([
        ('bow', CountVectorizer(ngram_range=(1, 1))), # count term frequency
        ('tfidf', TfidfTransformer()), # downweight words which appear frequently
        ('c', XGBClassifier(**params, objective='binary:logistic')) # classifier
    ])

fit = pipeline.fit(train['tweet'],train['label']) # train model
print('XGBoost')
print ('val:')
pred = pipeline.predict(val['tweet']) # make predictions

# display results
print_metrics(pred,val['label'])
#plot_confusion_matrix(confusion_matrix(val['label'],pred),target_names=['fake','real'], normalize = False, \
#                      title = 'Confusion matix of DT on val data')



XGBoost
val:
[[ 932   88]
 [ 100 1020]]
              precision    recall  f1-score   support

        fake       0.90      0.91      0.91      1020
        real       0.92      0.91      0.92      1120

    accuracy                           0.91      2140
   macro avg       0.91      0.91      0.91      2140
weighted avg       0.91      0.91      0.91      2140

Accuracy :  0.9121495327102803
Precison :  0.9121664179690567
Recall :  0.9121495327102803
F1 :  0.9121292411495973


In [50]:
from pprint import pprint
from time import time
import logging

#from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [67]:
model = XGBClassifier()

pipeline = Pipeline([
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ('model', model),
])

param_grid = {
    'model__max_depth': [10,20,30],
    'model__n_estimators': [100, 200, 300],
    'model__min_child_weight': [1, 2, 5]
}

grid = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1, scoring='accuracy')

grid.fit(train['tweet'],train['label'])
mean_score = grid.cv_results_["mean_test_score"][grid.best_index_]
print(f"Best parameters: {grid.best_params_}")
print(f"Mean CV score: {mean_score: .6f}")
print(f"Standard deviation of CV score: {std_score: .6f}")



Best parameters: {'model__max_depth': 10, 'model__min_child_weight': 1, 'model__n_estimators': 100}
Mean CV score:  0.910125
Standard deviation of CV score:  0.006924
































In [68]:
print(f"Best parameters: {grid.best_params_}")
print(f"Mean CV score: {mean_score: .6f}")
print(f"Standard deviation of CV score: {std_score: .6f}")

Best parameters: {'model__max_depth': 10, 'model__min_child_weight': 1, 'model__n_estimators': 100}
Mean CV score:  0.910125
Standard deviation of CV score:  0.006924


In [None]:
# import machine learning libraries
import xgboost as xgb
from sklearn.metrics import accuracy_score


# import packages for hyperparameters tuning
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

In [None]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 0
    }

In [None]:
def objective(space):
    clf=XGBClassifier(
                    n_estimators =space['n_estimators'], max_depth = int(space['max_depth']), gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),min_child_weight=int(space['min_child_weight']),
                    colsample_bytree=int(space['colsample_bytree']))
    
    evaluation = [( train['tweet'], train['label']), ( val['tweet'], val['label']]
    
    clf.fit(train['tweet'], train['label'],
            eval_set=evaluation, eval_metric="auc",
            early_stopping_rounds=10,verbose=False)
    

    pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, pred>0.5)
    print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }

In [None]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

In [None]:
print("The best hyperparameters are : ","\n")
print(best_hyperparams)