In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import StratifiedKFold, KFold

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.preprocessing import StandardScaler

from sklearn.ensemble import RandomForestClassifier

from functions_thesis import preprocessing, get_f1_macro, cross_validation_train, best_resampling
from sklearn.inspection import permutation_importance
import time
import collections
from sklearn.metrics import f1_score
from scipy import stats
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.inspection import PartialDependenceDisplay

def get_F1_distribution(testset, model, n_iterations = 1000, random_state = None):
    """Bootstrap the macro F1-score of `model` on `testset`.

    The test set is resampled with replacement (every resample has as many
    rows as the test set itself) and the macro F1-score is computed for each
    resample.

    Parameters
    ----------
    testset : pandas.DataFrame
        Must contain the 14 feature columns listed below and the label
        column 'viral'.
    model : fitted classifier
        Object exposing `predict(X)`. Predictions are assumed to be made row
        by row, i.e. independent of which other rows are in the batch.
    n_iterations : int, default 1000
        Number of bootstrap resamples.
    random_state : int, numpy.random.Generator or None, default None
        Seed for the resampling. None draws fresh entropy on every call, so
        results then differ between runs.

    Returns
    -------
    list of float
        One macro F1-score per bootstrap resample.

    Raises
    ------
    ValueError
        If `testset` has no rows.
    """
    features = ['verified', 'log_followers',
                'log_following', 'log_tweetcount',
                'log_listed', 'account_age_y',
                'sex_generalized', 'tweet_char_len',
                'hashtag_count',
                'mention_count', 'urls_count', 'organization', 'sentiment', 'emoji_count']

    n = len(testset)
    if n == 0:
        raise ValueError("testset is empty, cannot bootstrap F1-scores")

    # predict once and resample (label, prediction) pairs afterwards: this
    # gives the same bootstrap distribution as predicting every resample,
    # without calling model.predict n_iterations times
    Y_true = testset['viral'].to_numpy()
    Y_pred = np.asarray(model.predict(testset[features]))

    rng = np.random.default_rng(random_state)
    F1_scores = list()
    for _ in range(n_iterations):
        # draw n row positions with replacement
        rows = rng.integers(0, n, size = n)

        # get F1 macro and append to list
        F1_scores.append(f1_score(Y_true[rows], Y_pred[rows], average = 'macro'))

    return F1_scores

# container for every bootstrapped F1 distribution, keyed by a short model/test-set code
all_F1 = {}

**IMPORT AND PREPROCESS TEST SETS**

In [None]:
# read both "|"-separated test files and drop their 'Unnamed: 0' column
test_before, test_after = (
    pd.read_csv(path, sep = "|").drop(columns = ['Unnamed: 0'])
    for path in ("test_before_prep.csv", "test_after_prep.csv")
)

# run the shared preprocessing on each test set
test_before = preprocessing(test_before)
test_after = preprocessing(test_after)

print(test_before.shape, test_after.shape)

In [None]:
# prepare data
# .copy() makes b_X_test an independent frame, so columns can be added to it
# later without writing to a slice of test_before
b_X_test = test_before[['verified', 'log_followers',
       'log_following', 'log_tweetcount',
       'log_listed', 'account_age_y', 
       'sex_generalized', 'tweet_char_len', 
        'hashtag_count',
       'mention_count', 'urls_count', 'organization', 'sentiment', 'emoji_count']].copy()

b_Y_test = test_before['viral']
print(Counter(b_Y_test))

# combine data again
# assign() returns a new frame: b_X_test stays a pure feature matrix instead
# of silently receiving the 'viral' label column through an alias
before_test = b_X_test.assign(viral = b_Y_test)
print(len(before_test))
before_test.head()

We randomly downsample the after test set so that it contains the same number of non-viral and viral tweets as the before test set. This way, the performance metrics are comparable.

In [None]:
# randomly undersample data
# target class counts = class counts of the before test set
sample = Counter(b_Y_test)
resample = RandomUnderSampler(random_state = 42, sampling_strategy = sample)
# prepare data
a_X_test = test_after[['verified', 'log_followers',
       'log_following', 'log_tweetcount',
       'log_listed', 'account_age_y', 
       'sex_generalized', 'tweet_char_len', 
        'hashtag_count',
       'mention_count', 'urls_count', 'organization', 'sentiment', 'emoji_count']]

a_Y_test = test_after['viral']
print(Counter(a_Y_test))
a_X_test, a_Y_test = resample.fit_resample(a_X_test, a_Y_test)
print(Counter(a_Y_test))

# combine data again
# assign() returns a new frame: a_X_test stays a pure feature matrix instead
# of silently receiving the 'viral' label column through an alias
after_test = a_X_test.assign(viral = a_Y_test)
print(len(after_test))
after_test.head()

In [None]:
# model input columns; selecting them explicitly keeps any extra column that
# was added to b_X_test / a_X_test (e.g. the label) out of the feature matrix
test_features = ['verified', 'log_followers',
       'log_following', 'log_tweetcount',
       'log_listed', 'account_age_y', 
       'sex_generalized', 'tweet_char_len', 
        'hashtag_count',
       'mention_count', 'urls_count', 'organization', 'sentiment', 'emoji_count']

# stack the before and after test sets; ignore_index gives a fresh 0..n-1 index
comb_Y_test = pd.concat([b_Y_test, a_Y_test], ignore_index = True)
comb_X_test = pd.concat([b_X_test[test_features], a_X_test[test_features]], ignore_index = True)

# randomly undersample data
# target class counts = class counts of the before test set
sample = Counter(b_Y_test)
resample = RandomUnderSampler(random_state = 42, sampling_strategy = sample)
comb_X_test, comb_Y_test = resample.fit_resample(comb_X_test, comb_Y_test)
print(Counter(comb_Y_test))

# combine data again
# assign() returns a new frame, so comb_X_test itself is not modified
comb_test = comb_X_test.assign(viral = comb_Y_test)
print(len(comb_test))
comb_test.head()

In [None]:
# model input columns; selecting them explicitly keeps any extra column that
# was added to b_X_test / a_X_test (e.g. the label) out of the feature matrix
test_features = ['verified', 'log_followers',
       'log_following', 'log_tweetcount',
       'log_listed', 'account_age_y', 
       'sex_generalized', 'tweet_char_len', 
        'hashtag_count',
       'mention_count', 'urls_count', 'organization', 'sentiment', 'emoji_count']

# add the period indicator (0 = before, 1 = after) on new frames, so
# b_X_test and a_X_test are not modified through an alias
b_x__ = b_X_test[test_features].assign(after = 0)
a_x__ = a_X_test[test_features].assign(after = 1)

# stack both periods; ignore_index gives a fresh 0..n-1 index
COMB_Y_test = pd.concat([b_Y_test, a_Y_test], ignore_index = True)
COMB_X_test = pd.concat([b_x__, a_x__], ignore_index = True)

# randomly undersample data
# target class counts = class counts of the before test set
sample = Counter(b_Y_test)
resample = RandomUnderSampler(random_state = 42, sampling_strategy = sample)
COMB_X_test, COMB_Y_test = resample.fit_resample(COMB_X_test, COMB_Y_test)
print(Counter(COMB_Y_test))

# combine data again
# assign() returns a new frame, so COMB_X_test itself is not modified
COMB_test = COMB_X_test.assign(viral = COMB_Y_test)
print(len(COMB_test))
COMB_test.head()

**BEFORE DATA - FINAL MODEL + TESTING**

In [None]:
# read the "|"-separated before train/validation file, drop its
# 'Unnamed: 0' column and run the shared preprocessing in one go
before_data = preprocessing(
    pd.read_csv("before_train_val.csv", sep = "|").drop(columns = ['Unnamed: 0'])
)
print(before_data.shape)
before_data.keys()

In [None]:
print('number of train data: ', len(before_data))

# bound of 25: keep tweets with fewer than 25 or more than 100 retweets,
# i.e. leave out everything from 25 up to and including 100 retweets
resample_data = before_data.loc[
    lambda frame: (frame['public_metrics.retweet_count'] < 25)
                  | (frame['public_metrics.retweet_count'] > 100)
]
print('after resampling with bound of 25: ', len(resample_data))

# feature matrix and label vector for the before model
b_X = resample_data[[
    'verified', 'log_followers', 'log_following', 'log_tweetcount',
    'log_listed', 'account_age_y', 'sex_generalized', 'tweet_char_len',
    'hashtag_count', 'mention_count', 'urls_count', 'organization',
    'sentiment', 'emoji_count',
]]
b_Y = resample_data['viral']

**Fit final model**

In [None]:
start_time = time.time()

# random forest for the before period
RF = RandomForestClassifier(
    criterion = 'entropy',
    max_depth = 40,
    max_features = 'sqrt',
    min_samples_split = 5,
    n_estimators = 150,
    random_state = 42,
    n_jobs = 3,
)
model_before = RF.fit(b_X, b_Y)
print("done fitting the model")

# permutation importance on the training data, scored with macro F1
imp_before = permutation_importance(
    model_before, b_X, b_Y,
    n_repeats = 30, random_state = 42, scoring = 'f1_macro', n_jobs = 3,
)

print("number of minutes running: ", (time.time() - start_time)/60)

# feature name -> one-element list holding that feature's per-repeat importances
importances_before = collections.defaultdict(list)
features = b_X.keys()
for feature, importance in zip(features, imp_before['importances']):
    importances_before[feature].append(importance)

# macro F1 of the fitted model on its own training data
true_Y = b_Y
pred_Y = model_before.predict(b_X)
f1_b = f1_score(true_Y, pred_Y, average = 'macro')
print("f1 score before = ", f1_b)

# express every importance as a percentage of the train macro F1
importances_before_new = {
    feature_name: repeats[0] / f1_b * 100
    for feature_name, repeats in importances_before.items()
}

pd.DataFrame.from_dict(importances_before_new).to_csv("permutation_train_before.csv")

**Get F1 distribution**

In [None]:
# before model evaluated on the before test set
F1_distr_before = get_F1_distribution(before_test, model_before)
plt.hist(F1_distr_before, bins = 50)
print("number of f-measures: ", len(F1_distr_before))
# mean, standard deviation and standard error of the bootstrapped scores
print(*(summary(F1_distr_before) for summary in (np.mean, np.std, stats.sem)))

In [None]:
# before model evaluated on the after test set
F1_distr_before_CON = get_F1_distribution(after_test, model_before)
plt.hist(F1_distr_before_CON, bins = 50)
print("number of f-measures: ", len(F1_distr_before_CON))
# mean, standard deviation and standard error of the bootstrapped scores
print(*(summary(F1_distr_before_CON) for summary in (np.mean, np.std, stats.sem)))

In [None]:
# store the before model's distributions: 'BB' = before test set, 'BA' = after test set
all_F1.update(BB = F1_distr_before, BA = F1_distr_before_CON)

**AFTER DATA - FINAL MODEL + TESTING**

In [None]:
# read the "|"-separated after train/validation file, drop its
# 'Unnamed: 0' column and run the shared preprocessing in one go
after_data = preprocessing(
    pd.read_csv("after_train_val.csv", sep = "|").drop(columns = ['Unnamed: 0'])
)
print(after_data.shape)
after_data.keys()

In [None]:
print('number of train data: ', len(after_data))
# randomly undersample data to fixed class counts
sample = {0: 586497, 1: 6376}
resample = RandomUnderSampler(random_state = 42, sampling_strategy = sample)
# prepare data (the retweet count is carried along only to apply the bound below)
a_X = after_data[['verified', 'log_followers',
       'log_following', 'log_tweetcount',
       'log_listed', 'account_age_y', 
       'sex_generalized', 'tweet_char_len', 
        'hashtag_count',
       'mention_count', 'urls_count', 'organization', 'sentiment', 'emoji_count', 'public_metrics.retweet_count']]

a_Y = after_data['viral']
print(Counter(a_Y))
a_X, a_Y = resample.fit_resample(a_X, a_Y)
after_data_RUS = a_X.reset_index()
# attach the labels by position: a_X and a_Y come out of the same
# fit_resample call in the same row order, whereas assigning the Series
# directly aligns on index labels and is only right if a_Y happens to carry
# the same 0..n-1 index that reset_index() just created
after_data_RUS['viral'] = np.asarray(a_Y)

print('number of train data after RUS:', len(after_data_RUS))

# resample data based on bound of 10: keep tweets with fewer than 10 or more
# than 100 retweets
resample_data_after = after_data_RUS[(after_data_RUS['public_metrics.retweet_count'] < 10) | (after_data_RUS['public_metrics.retweet_count'] > 100)]
print('after resampling with bound of 10: ', len(resample_data_after))

# feature matrix and label vector for the after model
a_X = resample_data_after[['verified', 'log_followers',
       'log_following', 'log_tweetcount',
       'log_listed', 'account_age_y', 
       'sex_generalized', 'tweet_char_len', 
        'hashtag_count',
       'mention_count', 'urls_count', 'organization', 'sentiment', 'emoji_count']]

a_Y = resample_data_after['viral']

print(a_X.shape, Counter(a_Y))

**Fit final model**

In [None]:
start_time = time.time()

# random forest for the after period
RF = RandomForestClassifier(criterion = 'gini', max_depth = 20, max_features = 'sqrt', min_samples_split = 2, n_estimators = 150, random_state = 42, n_jobs = 3)
model_after = RF.fit(a_X, a_Y)
print("done fitting model")

# permutation importance on the training data, scored with macro F1
imp_after = permutation_importance(model_after, a_X, a_Y, n_repeats = 30, random_state = 42, scoring = 'f1_macro', n_jobs = 4)

# elapsed time is "now minus start"; the reverse order yields a negative duration
print("number of minutes running: ", (time.time() - start_time)/60)

# feature name -> one-element list holding that feature's per-repeat importances
importances_after = collections.defaultdict(list)
features = a_X.keys()
for feature, importance in zip(features, imp_after['importances']):
    importances_after[feature].append(importance)

# macro F1 of the fitted model on its own training data
true_Y = a_Y
pred_Y = model_after.predict(a_X)
f1_a = f1_score(true_Y, pred_Y, average = 'macro')
print("f1 score after = ", f1_a)

# express every importance as a percentage of the train macro F1
importances_after_new = dict()

for item in importances_after:
    importances_after_new[item] = importances_after[item][0] / f1_a * 100

pd.DataFrame.from_dict(importances_after_new).to_csv("permutation_train_after.csv")

importances_after

**Get F1 distribution**

In [None]:
# after model evaluated on the after test set
F1_distr_after = get_F1_distribution(after_test, model_after)
plt.hist(F1_distr_after, bins = 50)
print("number of f-measures: ", len(F1_distr_after))
# mean, standard deviation and standard error of the bootstrapped scores
print(*(summary(F1_distr_after) for summary in (np.mean, np.std, stats.sem)))

In [None]:
# after model evaluated on the before test set
F1_distr_after_CON = get_F1_distribution(before_test, model_after)
plt.hist(F1_distr_after_CON, bins = 50)
print("number of f-measures: ", len(F1_distr_after_CON))
# mean, standard deviation and standard error of the bootstrapped scores
print(*(summary(F1_distr_after_CON) for summary in (np.mean, np.std, stats.sem)))

In [None]:
# store the after model's distributions: 'AA' = after test set, 'AB' = before test set
all_F1.update(AA = F1_distr_after, AB = F1_distr_after_CON)

**COMBINE FEATURE IMPORTANCE**

In [None]:
fig, (ax1, ax2) = plt.subplots(2, 1, figsize = (12,12))

# one horizontal boxplot panel per model: (axis, scaled importances, label source, title)
panels = (
    (ax1, importances_before_new, importances_before, "1. Feature importance before model - train set"),
    (ax2, importances_after_new, importances_after, "2. Feature importance after model - train set"),
)
for axis, scaled, raw, panel_title in panels:
    axis.boxplot(pd.DataFrame.from_dict(scaled), vert = False, labels = raw.keys())
    axis.set_title(panel_title, weight = 'bold', fontsize = 13, loc = 'right')
    axis.set_xlabel("% drop in macro F1", fontsize = 12)

# same x-range on both panels so they can be compared directly
ax1.set_xlim([-1, 50])
ax2.set_xlim([-1, 50])
# fig.savefig("Feature_Importance_CompleNet.jpg")

In [None]:
from matplotlib.patches import Patch

# every boxplot element drawn in a single colour, one style set per model
box_style = {
    colour: dict(
        boxprops = dict(facecolor = colour, color = colour),
        capprops = dict(color = colour),
        whiskerprops = dict(color = colour),
        flierprops = dict(color = colour, markeredgecolor = colour),
        medianprops = dict(color = colour),
    )
    for colour in ('b', 'r')
}

fig, ax1 = plt.subplots(1, 1, figsize = (12,6))

# before model in blue
ax1.boxplot(pd.DataFrame.from_dict(importances_before_new), vert = False,
            labels = importances_before.keys(), patch_artist = True, **box_style['b'])
ax1.set_title("Permutation importance", weight = 'bold', fontsize = 15, loc = 'center')
ax1.set_xlabel("% drop in macro F1", fontsize = 13)
ax1.set_xlim([-1, 50])

# after model in red, drawn on the same axes
ax1.boxplot(pd.DataFrame.from_dict(importances_after_new), vert = False,
            labels = importances_after.keys(), patch_artist = True, **box_style['r'])

legend_elements = [
    Patch(facecolor = colour, edgecolor = colour, label = model_label)
    for colour, model_label in (('b', 'Before model'), ('r', 'After model'))
]

ax1.legend(handles=legend_elements, loc='upper right', fontsize = 13)

# ax1.set_title("2. Feature importance after model - train set", weight = 'bold', fontsize = 13, loc = 'right')
# ax1.set_xlabel("% drop in macro F1", fontsize = 12)
# ax1.set_xlim([-1, 50])
fig.savefig("Feature_Importance_CompleNet.jpg")

In [None]:
# mean and standard error of the scaled permutation importance, before vs
# after model, for a selection of features: (printed label, column name)
reported_features = (
    ("followers", 'log_followers'),
    ("listed", 'log_listed'),
    ("URL", 'urls_count'),
    ("following", 'log_following'),
    ("tweetcount", 'log_tweetcount'),
)

for label, column in reported_features:
    print(label + " before")
    print(np.mean(importances_before_new[column]), stats.sem(importances_before_new[column]))
    print(label + " after")
    print(np.mean(importances_after_new[column]), stats.sem(importances_after_new[column]), '\n')

In [None]:
# Pearson correlation of log_followers with three other account features
# in the after training data
for other_feature in ('log_listed', 'log_following', 'log_tweetcount'):
    print(stats.pearsonr(a_X['log_followers'], a_X[other_feature]))

Visualize absolute differences in feature importance 

In [None]:
# element-wise absolute difference between the before and after importances
diff_perm_imp = (
    pd.DataFrame.from_dict(importances_before_new)
    - pd.DataFrame.from_dict(importances_after_new)
).abs()
diff_perm_imp.mean()

In [None]:
# one horizontal box per feature, labelled with the frame's column names
plt.boxplot(diff_perm_imp, labels = diff_perm_imp.columns, vert = False)

**BEFORE AND AFTER DATA COMBINED**

**Find hyperparameters + optimal bound**

In [None]:
# print(len(after_data_RUS), len(before_data))

# after = after_data_RUS[['verified', 'log_followers', 'log_following', 'log_tweetcount',
#        'log_listed', 'account_age_y', 'sex_generalized', 'tweet_char_len',
#        'hashtag_count', 'mention_count', 'urls_count', 'organization',
#        'sentiment', 'emoji_count', 'public_metrics.retweet_count', 'viral']]

# before = before_data[['verified', 'log_followers', 'log_following', 'log_tweetcount',
#        'log_listed', 'account_age_y', 'sex_generalized', 'tweet_char_len',
#        'hashtag_count', 'mention_count', 'urls_count', 'organization',
#        'sentiment', 'emoji_count', 'public_metrics.retweet_count', 'viral']]

# combined_train = pd.concat([after, before])
# print(combined_train.shape)

# comb_X = combined_train[['verified', 'log_followers', 'log_following', 'log_tweetcount',
#        'log_listed', 'account_age_y', 'sex_generalized', 'tweet_char_len',
#        'hashtag_count', 'mention_count', 'urls_count', 'organization',
#        'sentiment', 'emoji_count', 'public_metrics.retweet_count']]

# comb_Y = combined_train['viral']

# sample = {0: 586497, 1: 6376}
# resample = RandomUnderSampler(random_state = 42, sampling_strategy = sample)
# comb_X, comb_Y = resample.fit_resample(comb_X, comb_Y)
# Counter(comb_Y)

In [None]:
# # set up model 
# model = RandomForestClassifier(n_estimators = 100, random_state = 42, n_jobs = 3)

# resampling_methods = {'bound' : list(np.arange(0, 65, 5))}
# scaler = False

# best_scores_mean_RF_comb, best_scores_std_RF_comb, best_ratio_RF_comb = best_resampling(model, comb_X, comb_Y, resampling_methods, scaler)

In [None]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.model_selection import cross_val_score

# ind_bound = comb_X.index[(comb_X['public_metrics.retweet_count'] >= 25) & (comb_X['public_metrics.retweet_count'] <= 100)].tolist()
# len(ind_bound)

# skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True)
# skf.get_n_splits(comb_X, comb_Y)

# cv = list()

# for item in skf.split(comb_X, comb_Y):
#     cv.append([np.array(list((set(item[0]) - set(ind_bound)))), item[1]])


# X_ = comb_X.drop(columns = ['public_metrics.retweet_count'])
    
# # do grid search TEST

# model = RandomForestClassifier(random_state = 42, n_jobs = 2)

# grid = {"n_estimators" : [90, 100, 130], 
#         "criterion" : ['gini', 'entropy'],
#         "max_depth" : [5, 10, 20, 40, 'None'],
#         "min_samples_split" : [2, 5, 10], 
#         "max_features" : ['sqrt', 'None']}

# grid_search = GridSearchCV(estimator = model, param_grid = grid, n_jobs = 2, cv = cv, scoring = 'f1_macro', refit = False)
# grid_result = grid_search.fit(X_, comb_Y)

# mean = pd.DataFrame(grid_result.cv_results_).iloc[grid_result.best_index_]['mean_test_score']
# std = pd.DataFrame(grid_result.cv_results_).iloc[grid_result.best_index_]['std_test_score']

# print("mean score: %f +- %f" % (mean, std))
# print("best parameters: ", grid_result.best_params_)

# # mean score: 0.735699 +- 0.003984
# # best parameters:  {'criterion': 'entropy', 'max_depth': 40, 'max_features': 'sqrt', 'min_samples_split': 5, 'n_estimators': 130}

**Fit final model**

In [None]:
print(len(after_data_RUS), len(before_data))

# columns shared by both periods: features, retweet count (only used for the
# bound below) and the label
train_columns = ['verified', 'log_followers', 'log_following', 'log_tweetcount',
       'log_listed', 'account_age_y', 'sex_generalized', 'tweet_char_len',
       'hashtag_count', 'mention_count', 'urls_count', 'organization',
       'sentiment', 'emoji_count', 'public_metrics.retweet_count', 'viral']

after = after_data_RUS[train_columns]
before = before_data[train_columns]

combined_train = pd.concat([after, before])
print(combined_train.shape)

# everything except the label, in the same column order
comb_X = combined_train.drop(columns = ['viral'])
comb_Y = combined_train['viral']

# randomly undersample data to fixed class counts
sample = {0: 586497, 1: 6376}
resample = RandomUnderSampler(random_state = 42, sampling_strategy = sample)
comb_X, comb_Y = resample.fit_resample(comb_X, comb_Y)
# a bare expression in the middle of a cell is not displayed, so print it
print(Counter(comb_Y))

combined_data_RUS = comb_X.reset_index()
# attach the labels by position: comb_X and comb_Y come out of the same
# fit_resample call in the same row order, whereas assigning the Series
# directly aligns on index labels and is only right if comb_Y happens to
# carry the same 0..n-1 index that reset_index() just created
combined_data_RUS['viral'] = np.asarray(comb_Y)

print('number of train data after RUS:', len(combined_data_RUS))

# resample data based on bound of 25: keep tweets with fewer than 25 or more
# than 100 retweets
resample_data_comb = combined_data_RUS[(combined_data_RUS['public_metrics.retweet_count'] < 25) | (combined_data_RUS['public_metrics.retweet_count'] > 100)]
print('after resampling with bound of 25: ', len(resample_data_comb))

# feature matrix and label vector for the combined model
comb_X = resample_data_comb[['verified', 'log_followers',
       'log_following', 'log_tweetcount',
       'log_listed', 'account_age_y', 
       'sex_generalized', 'tweet_char_len', 
        'hashtag_count',
       'mention_count', 'urls_count', 'organization', 'sentiment', 'emoji_count']]

comb_Y = resample_data_comb['viral']

print(comb_X.shape, Counter(comb_Y))

In [None]:
start_time = time.time()

# random forest for the combined (before + after) training data
RF = RandomForestClassifier(criterion = 'entropy', max_depth = 40, max_features = 'sqrt', min_samples_split = 5, n_estimators = 150, random_state = 42, n_jobs = 3)
model_comb = RF.fit(comb_X, comb_Y)
print("done fitting model")

# report the fitting time, as the other model-fitting cells do
print("number of minutes running: ", (time.time() - start_time)/60)

**Get F1 distribution**

In [None]:
# combined model evaluated on the combined test set
F1_distr_comb = get_F1_distribution(comb_test, model_comb)
plt.hist(F1_distr_comb, bins = 50)
print("number of f-measures: ", len(F1_distr_comb))
# mean, standard deviation and standard error of the bootstrapped scores
print(*(summary(F1_distr_comb) for summary in (np.mean, np.std, stats.sem)))

**CHECK ALL MODEL PERFORMANCES FIGURES**

In [None]:
# store the combined model's distribution and show all of them side by side
all_F1.update(CC = F1_distr_comb)
pd.DataFrame.from_dict(all_F1)

In [None]:
# pair the bootstrap draws by position and take their differences
n_draws = len(all_F1['BB'])

# before model: before test set minus after test set
F1_diff_B = [all_F1['BB'][k] - all_F1['BA'][k] for k in range(n_draws)]
# after model: after test set minus before test set
F1_diff_A = [all_F1['AA'][k] - all_F1['AB'][k] for k in range(n_draws)]
# each period-specific model minus the combined model
F1_diff_A_C = [all_F1['AA'][k] - all_F1['CC'][k] for k in range(n_draws)]
F1_diff_B_C = [all_F1['BB'][k] - all_F1['CC'][k] for k in range(n_draws)]

# keys 1-4 are the panel numbers used when plotting
F1_differences = {1 : F1_diff_B, 2 : F1_diff_A, 3 : F1_diff_B_C, 4 : F1_diff_A_C}

In [None]:
sns.set()
fig, axs = plt.subplots(2, 2, figsize = (12,8))
title = {1 : "1. BB-BA", 2 : "2. AA-AB", 3 : "3. BB-CC", 4 : "4. AA-CC"}

# axs.flat walks the 2x2 grid row by row, matching panel numbers 1-4
for counter, ax in enumerate(axs.flat, start = 1):
    differences = F1_differences[counter]

    # 1.25th and 98.75th percentiles, i.e. the central 97.5% of the differences
    q1, q2 = np.percentile(differences, [1.25, 98.75])
    print(title[counter])
    print(q1, q2)

    # panels 1-2 in green, panels 3-4 in blue
    color = 'g' if counter < 3 else 'b'

    ax.hist(differences, bins = 50, color = color)
    ax.set_title(title[counter], fontsize = 13, weight = 'bold', loc = 'right')
    ax.set_xlim(-0.04, 0.07)
    ax.set_ylim(0, 70)
    # dashed black line at x = 0 (axvline's default position), red lines at the interval ends
    ax.axvline(color = '0', linestyle = 'dashed')
    ax.axvline(q1, color = 'r')
    ax.axvline(q2, color = 'r')
    # ax.set_ylim(0, 0.3)

# y-labels on the left column, x-labels on the bottom row
for left_ax in (axs[0, 0], axs[1, 0]):
    left_ax.set_ylabel('Count', fontsize = 12)
for bottom_ax in (axs[1, 0], axs[1, 1]):
    bottom_ax.set_xlabel('Macro F1-score difference', fontsize = 12)
fig.tight_layout()
# fig.savefig("difference_CI_performance_CompleNet.jpg")