In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import math
import numbers
from dateutil import parser
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score, f1_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.inspection import permutation_importance
import collections
import time
import ast
import re
import nltk
from emoji import UNICODE_EMOJI
import random
from functions_thesis import preprocessing, get_f1_macro, cross_validation_train, best_resampling
from imblearn.pipeline import Pipeline

In [None]:
# Read the pipe-delimited after-invasion train/validation file, discard the
# 'Unnamed: 0' column (presumably a saved index - confirm) and hand the frame
# to the project's preprocessing helper.
after_data = (
    pd.read_csv("after_train_val.csv", sep="|")
    .drop(columns=["Unnamed: 0"])
    .pipe(preprocessing)
)

# Sanity check: print the dimensions, then display the column labels.
print(after_data.shape)
after_data.keys()

**BASELINE MODELS**

We start with some baseline models; from there, the models will be improved through hyperparameter tuning and by dealing with class imbalance. As a very simple baseline, we take a model that always predicts the majority class 'non-viral'. Model performance is reported with the macro-averaged F1 score, which assigns equal weight to the F1 scores of the majority and minority classes. 
We will test the following models: 
- Logistic model 
- Random Forest classifier
- XGBoost classifier
- Neural Network classifier

For the non-tree-based models, the data will be scaled. 

**After invasion models**

In [None]:
# Columns used as model inputs for the after-invasion data.
# NOTE(review): 'public_metrics.retweet_count' is carried along here; the grid
# search cell drops it before fitting and uses it only to pick boundary rows -
# confirm that cross_validation_train / best_resampling treat it the same way.
a_feature_columns = [
    'verified',
    'log_followers',
    'log_following',
    'log_tweetcount',
    'log_listed',
    'account_age_y',
    'sex_generalized',
    'tweet_char_len',
    'hashtag_count',
    'mention_count',
    'urls_count',
    'organization',
    'sentiment',
    'emoji_count',
    'public_metrics.retweet_count',
]
a_X = after_data[a_feature_columns]

# Target label.
a_Y = after_data['viral']

In [None]:
# Show the first five rows of the feature frame as a quick visual check.
a_X.head(n=5)

First we resample the after data so that it has the same number of instances as the before data. This makes both models comparable. 

In [None]:
# Randomly undersample to fixed per-class counts.
# NOTE(review): the counts are presumably the class sizes of the before-invasion
# data (see the markdown note above this cell) - confirm and consider naming them.
sample = {0: 586497, 1: 6376}
resample = RandomUnderSampler(sampling_strategy=sample, random_state=42)

# The feature frame and target are replaced by their undersampled versions.
a_X, a_Y = resample.fit_resample(a_X, a_Y)

# Display the resulting class counts.
Counter(a_Y)

**Majority model**

In [None]:
# Majority-class baseline evaluated on the resampled after-invasion data.
# NOTE(review): `baseline_model` is not among the names imported from
# functions_thesis in the import cell and is not defined in any visible cell -
# confirm where it comes from; as written this raises NameError on a fresh kernel.
metrics_majority = baseline_model(a_X, a_Y)

**Logistic Regression**

In [None]:
# Baseline logistic regression with default hyperparameters.
# LogisticRegression is already imported in the notebook's import cell, so the
# duplicate in-cell import has been removed.
model = LogisticRegression(random_state=42)  # BALANCED MADE PERFORMANCE WORSE
resample = False  # passed as the resample argument; presumably "no resampling" - confirm
scale = True      # non-tree model, so scaling is requested

# Cross-validated evaluation; the helper returns the metrics and importances
# unpacked below, and get_f1_macro summarises the metrics.
metrics_LR_base, importances_LR_base = cross_validation_train(model, a_X, a_Y, resample, scale)
get_f1_macro(metrics_LR_base)

In [None]:
# Logistic regression: compare the candidate resampling strategies.
model = LogisticRegression(random_state=42)  # BALANCED MADE PERFORMANCE WORSE

# Candidate resamplers plus the 'bound' values 0, 5, ..., 60.
# NOTE(review): SMOTE's n_jobs argument is deprecated in newer imbalanced-learn
# releases - confirm it is still accepted by the installed version.
resampling_methods = {
    'RUS': RandomUnderSampler(random_state=42),
    'ROS': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42, n_jobs=3),
    'bound': list(np.arange(0, 65, 5)),
}
scaler = True  # non-tree model, so scaling is requested

best_scores_mean_LR, best_scores_std_LR, best_ratio_LR = best_resampling(
    model, a_X, a_Y, resampling_methods, scaler
)

**Random Forest**

In [None]:
# Baseline random forest: 100 trees, fixed seed, three worker processes.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=3,
)
resample = False  # passed as the resample argument; presumably "no resampling" - confirm
scale = False     # tree-based model, so no scaling is requested

# Cross-validated evaluation, then the macro-F1 summary of the metrics.
metrics_RF_base, importances_RF_base = cross_validation_train(
    model, a_X, a_Y, resample, scale
)
get_f1_macro(metrics_RF_base)

In [None]:
# Random forest: compare the candidate resampling strategies.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=3,
)

# Candidate resamplers plus the 'bound' values 0, 5, ..., 60.
# NOTE(review): SMOTE's n_jobs argument is deprecated in newer imbalanced-learn
# releases - confirm it is still accepted by the installed version.
resampling_methods = {
    'RUS': RandomUnderSampler(random_state=42),
    'ROS': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42, n_jobs=3),
    'bound': list(np.arange(0, 65, 5)),
}
scaler = False  # tree-based model, so no scaling is requested

best_scores_mean_RF, best_scores_std_RF, best_ratio_RF = best_resampling(
    model, a_X, a_Y, resampling_methods, scaler
)

In [None]:
# Random forest evaluated with the boundary resampling value 10.
# RandomForestClassifier is already imported in the notebook's import cell, so
# the duplicate in-cell import has been removed.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=3)
resample = 10   # boundary value handed to cross_validation_train
scale = False   # tree-based model, so no scaling is requested

# Cross-validated evaluation, then the macro-F1 summary of the metrics.
metrics_RF_b_test, importances_RF_b_test = cross_validation_train(model, a_X, a_Y, resample, scale)
get_f1_macro(metrics_RF_b_test)

**XGBoost**

In [None]:
# Baseline XGBoost classifier: 100 boosting rounds, fixed seed.
# XGBClassifier is already imported in the notebook's import cell, so the
# duplicate in-cell import has been removed.
model = XGBClassifier(n_estimators=100, random_state=42, n_jobs=3)
resample = False  # passed as the resample argument; presumably "no resampling" - confirm
scale = False     # tree-based model, so no scaling is requested

# Cross-validated evaluation, then the macro-F1 summary of the metrics.
metrics_XG_base, importances_XG_base = cross_validation_train(model, a_X, a_Y, resample, scale)
get_f1_macro(metrics_XG_base)

In [None]:
# XGBoost: compare the candidate resampling strategies.
model = XGBClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=3,
)

# Candidate resamplers plus the 'bound' values 0, 5, ..., 60.
# NOTE(review): SMOTE's n_jobs argument is deprecated in newer imbalanced-learn
# releases - confirm it is still accepted by the installed version.
resampling_methods = {
    'RUS': RandomUnderSampler(random_state=42),
    'ROS': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42, n_jobs=3),
    'bound': list(np.arange(0, 65, 5)),
}
scaler = False  # tree-based model, so no scaling is requested

best_scores_mean_XGB, best_scores_std_XGB, best_ratio_XGB = best_resampling(
    model, a_X, a_Y, resampling_methods, scaler
)

**Neural Network**

In [None]:
# Baseline neural network (MLP) with default hyperparameters.
# MLPClassifier is already imported in the notebook's import cell, so the
# duplicate in-cell import has been removed.
model = MLPClassifier(random_state=42)
resample = False  # passed as the resample argument; presumably "no resampling" - confirm
scale = True      # non-tree model, so scaling is requested

# Cross-validated evaluation, then the macro-F1 summary of the metrics.
metrics_MLP_base, importances_MLP_base = cross_validation_train(model, a_X, a_Y, resample, scale)
get_f1_macro(metrics_MLP_base)

In [None]:
# Neural network (MLP): compare the candidate resampling strategies.
model = MLPClassifier(random_state=42)

# Candidate resamplers plus the 'bound' values 0, 5, ..., 60.
# NOTE(review): SMOTE's n_jobs argument is deprecated in newer imbalanced-learn
# releases - confirm it is still accepted by the installed version.
resampling_methods = {
    'RUS': RandomUnderSampler(random_state=42),
    'ROS': RandomOverSampler(random_state=42),
    'SMOTE': SMOTE(random_state=42, n_jobs=3),
    'bound': list(np.arange(0, 65, 5)),
}
scaler = True  # non-tree model, so scaling is requested

best_scores_mean_MLP, best_scores_std_MLP, best_ratio_MLP = best_resampling(
    model, a_X, a_Y, resampling_methods, scaler
)

**FINAL MODEL HYPERPARAMETER TUNING**

First, the boundary resampling method is set up so that it can be used within the grid search: the rows inside the boundary are removed from the training part of each cross-validation fold. The ideal boundary is set to 10.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Row POSITIONS (0-based) of tweets with 10 <= retweet count <= 100. These rows
# are removed from every training fold below but stay in the test folds.
# Positions rather than index labels are needed because StratifiedKFold.split
# and GridSearchCV address rows positionally; subtracting a_X.index labels is
# only correct when the index happens to be exactly 0..n-1.
retweet_count = a_X['public_metrics.retweet_count']
ind_bound = np.flatnonzero(((retweet_count >= 10) & (retweet_count <= 100)).to_numpy()).tolist()

skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)

# Custom CV splits: [training positions without boundary rows, unchanged test positions].
# np.setdiff1d returns a sorted integer array, so each fold is deterministic and
# stays a valid integer indexer even if it ends up empty.
cv = [
    [np.setdiff1d(item[0], ind_bound), item[1]]
    for item in skf.split(a_X, a_Y)
]

# The retweet count is only used to select boundary rows; it is not a model input here.
X_ = a_X.drop(columns = ['public_metrics.retweet_count'])

# do grid search TEST

model = RandomForestClassifier(random_state = 42, n_jobs = 2)

# None (the Python object, not the string 'None') means "no depth limit" for
# max_depth and "use all features" for max_features. The string 'None' is not a
# valid value, so those candidates previously failed to fit and were scored NaN
# instead of being evaluated.
# NOTE(review): with these candidates now really evaluated, the best parameters
# may differ from the previously recorded result - re-run and update the notes.
grid = {"n_estimators" : [90, 100, 130],
        "criterion" : ['gini', 'entropy'],
        "max_depth" : [5, 10, 20, 40, None],
        "min_samples_split" : [2, 5, 10],
        "max_features" : ['sqrt', None]}

# refit = False: only the CV scores are needed; no final estimator is fitted here.
grid_search = GridSearchCV(estimator = model, param_grid = grid, n_jobs = 2, cv = cv, scoring = 'f1_macro', refit = False)
grid_result = grid_search.fit(X_, a_Y)

# Build the results table once and read mean / std of the best candidate from it.
best_row = pd.DataFrame(grid_result.cv_results_).iloc[grid_result.best_index_]
mean = best_row['mean_test_score']
std = best_row['std_test_score']

print("mean score: %f +- %f" % (mean, std))
print("best parameters: ", grid_result.best_params_)

**Best parameters are:**

mean score: 0.735954 +- 0.004972

best parameters:  {'criterion': 'gini', 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 130}

**FINAL MODEL**

In [None]:
# Final random forest, configured with the best grid-search parameters recorded
# in the notebook: criterion='gini', max_depth=20, max_features='sqrt',
# min_samples_split=2, n_estimators=130.
# n_estimators was 150 here, which contradicts the recorded best value (130) and
# was not part of the searched grid [90, 100, 130]; it is aligned with the tuned
# value. NOTE(review): if 150 was a deliberate choice, restore it and document why.
model = RandomForestClassifier(
    criterion = 'gini',
    max_depth = 20,
    max_features = 'sqrt',
    min_samples_split = 2,
    n_estimators = 130,
    random_state = 42,
    n_jobs = 3,
)
resample = 10   # boundary value chosen for the final model
scale = False   # tree-based model, so no scaling is requested

# Cross-validated evaluation, then the macro-F1 summary of the metrics.
metrics_RF_final, importances_RF_final = cross_validation_train(model, a_X, a_Y, resample, scale)
get_f1_macro(metrics_RF_final)