In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import math
import numbers
from scipy import stats
from dateutil import parser
from scipy.stats import ks_2samp 
from matplotlib.patches import Rectangle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.inspection import permutation_importance
import collections
import time
import ast
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import NeighbourhoodCleaningRule, EditedNearestNeighbours, RandomUnderSampler
from collections import Counter
import re
import nltk
from sklearn.ensemble import GradientBoostingClassifier
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from emoji import UNICODE_EMOJI
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from functions_thesis import preprocessing

In [None]:
# Read the pipe-separated "before" train/validation export, discard the
# leftover 'Unnamed: 0' column and hand the frame to the project's
# preprocessing routine.
before_data = preprocessing(
    pd.read_csv("before_train_val.csv", sep="|").drop(columns=["Unnamed: 0"])
)

# Quick sanity check: dimensions, then the available columns as cell output.
print(before_data.shape)
before_data.keys()

In [None]:
# Read the pipe-separated "after" train/validation export, discard the
# leftover 'Unnamed: 0' column and hand the frame to the project's
# preprocessing routine.
after_data = preprocessing(
    pd.read_csv("after_train_val.csv", sep="|").drop(columns=["Unnamed: 0"])
)

# Quick sanity check: dimensions, then the available columns as cell output.
print(after_data.shape)
after_data.keys()

**FOLLOWERS**

In [None]:
# Bin every tweet by its author's follower count and show, for each retweet
# class, how the tweets of that class are spread over those bins before and
# after the invasion.

# 20 logarithmically spaced edges shared by both periods so the curves are
# directly comparable.
max_followers_before = before_data['public_metrics.followers_count'].max()
max_followers_after = after_data['public_metrics.followers_count'].max()
edges_new = np.logspace(np.log10(1), np.log10(max(max_followers_before, max_followers_after)), 20)
# logspace starts at 10**0 == 1; lower the first edge so accounts with
# 0 followers fall into the first bin instead of outside the range.
edges_new[0] = 0

# Only the (1-based) bin numbers are used below; the per-bin mean retweet
# counts are computed but not plotted in this cell.
bin_means1_retweet, bin_edges, binnumber_b = stats.binned_statistic(
    before_data['public_metrics.followers_count'],
    before_data['public_metrics.retweet_count'],
    statistic='mean',
    bins=edges_new,
)
bin_means2_retweet, bin_edges, binnumber_a = stats.binned_statistic(
    after_data['public_metrics.followers_count'],
    after_data['public_metrics.retweet_count'],
    statistic='mean',
    bins=edges_new,
)

# NOTE: the 'bin' column and `x` are read by the cells that follow, so they
# keep their names; each binning section of the notebook overwrites them.
before_data['bin'] = binnumber_b
after_data['bin'] = binnumber_a

# Arithmetic midpoint of each bin, used as the bin's position on the x-axis.
x = list((edges_new[1:] - edges_new[:-1]) / 2 + edges_new[:-1])
bin_centers = np.array(x)

retweet_class = {1 : 'no retweets', 2 : '1-9 retweets', 3 : '10-99 retweets', 4 : '100-999 retweets', 5 : '1000+ retweets', 6 : 'at least 1 retweet'}
periods = (
    (before_data, 'before invasion', 'b'),
    (after_data, 'after invasion', 'r'),
)

fig, axs = plt.subplots(2, 3, figsize = (17,8))

# One panel per retweet class, filled row by row.
for rt_class, ax in zip(retweet_class, axs.flat):
    for frame, period_label, color in periods:
        # Class 6 is the aggregate panel: every tweet above class 1.
        if rt_class == 6:
            in_class = frame['retweet_class'] > 1
        else:
            in_class = frame['retweet_class'] == rt_class
        class_rows = frame.loc[in_class]

        # Share of this class's tweets per bin. Selecting the column before
        # counting avoids aggregating every column of the frame.
        binned = (
            class_rows.groupby("bin")['public_metrics.retweet_count'].count()
            / class_rows['id'].count()
        )
        # Bin numbers are 1-based, bin_centers is 0-based.
        ax.plot(bin_centers[binned.index.to_numpy() - 1], binned, label = period_label, color = color)

    ax.set_title("UKRAINE prob. distr. - "+ retweet_class[rt_class])
    ax.set_ylabel('fraction of retweets from class')
    ax.set_xlabel('followers increasing (log)')
    ax.set_xscale('symlog')
    ax.set_xlim(1, 200000000)

# A single legend on the first panel is enough; colours are shared.
axs[0,0].legend(loc = 'upper right')

fig.tight_layout()
# fig.savefig("distribution_retweetclasses.png")

In [None]:
# Distribution of viral tweets over the follower bins, per period.
# Relies on `x` (bin centres) and the 'bin' column set by the preceding
# binning cell.
viral_rows_before = before_data.loc[before_data['viral'] == 1]
viral_rows_after = after_data.loc[after_data['viral'] == 1]

# Per-bin count of viral tweets divided by the total number of viral tweets.
viral_binned_before = viral_rows_before.groupby("bin")['viral'].count() / viral_rows_before['viral'].count()
viral_binned_after = viral_rows_after.groupby("bin")['viral'].count() / viral_rows_after['viral'].count()

# Number of bins that contain at least one viral "before" tweet.
print(len(viral_binned_before))

fig, ax = plt.subplots()

centers = np.array(x)
curves = (
    (viral_binned_before, 'before invasion', 'b'),
    (viral_binned_after, 'after invasion', 'r'),
)
for shares, period_label, line_color in curves:
    # Bin numbers are 1-based, the centres array is 0-based.
    ax.plot(centers[np.asarray(shares.index) - 1], shares, label = period_label, color = line_color)

ax.set_xscale('symlog')
ax.set_xlabel('followers count (log)', fontsize = 12)
ax.set_ylabel('fraction of viral tweets', fontsize = 12)
ax.set_title('Fraction of viral tweets for number of followers', fontsize = 13, weight = 'bold')
ax.legend()
fig.tight_layout()
fig.savefig("viral_followers_complenet.png")

In [None]:
# Share of ALL tweets (not only viral ones) that falls into each follower
# bin. The variable names are reused from the viral plot; the count of the
# 'viral' column is only used as a per-bin row count.
# Relies on `x` and the 'bin' column set by the preceding binning cell.
n_before = len(before_data)
n_after = len(after_data)
viral_binned_before = before_data.groupby("bin")['viral'].count() / n_before
viral_binned_after = after_data.groupby("bin")['viral'].count() / n_after

# Number of non-empty bins in the "before" data.
print(len(viral_binned_before))

fig, ax = plt.subplots()

centers = np.array(x)
for shares, period_label, line_color in (
    (viral_binned_before, 'before invasion', 'b'),
    (viral_binned_after, 'after invasion', 'r'),
):
    # Bin numbers are 1-based, the centres array is 0-based.
    ax.plot(centers[np.asarray(shares.index) - 1], shares, label = period_label, color = line_color)

ax.set_xscale('symlog')
ax.set_xlabel('followers increasing (log)', fontsize = 12)
ax.set_ylabel('fraction of total tweets', fontsize = 12)
ax.set_title('Fraction of tweets for number of followers', fontsize = 13, weight = 'bold')
ax.legend()
fig.tight_layout()
# fig.savefig("totalTweets_followers.png")

In [None]:
# Same distribution as the viral-tweet plot, but counting each author only
# once: drop_duplicates keeps the first viral tweet per author_id, so every
# author contributes a single row (and a single bin).
# Relies on `x` and the 'bin' column set by the preceding binning cell.
viral_before_users = before_data[before_data['viral'] == 1].drop_duplicates(subset = 'author_id')
viral_after_users = after_data[after_data['viral'] == 1].drop_duplicates(subset = 'author_id')

# Per-bin number of viral authors divided by the total number of viral authors.
viral_binned_before = viral_before_users.groupby("bin")['viral'].count() / viral_before_users['viral'].count()
viral_binned_after = viral_after_users.groupby("bin")['viral'].count() / viral_after_users['viral'].count()
print(len(viral_binned_before))


fig, ax = plt.subplots()

# Bin numbers are 1-based, the centres array is 0-based.
ax.plot(np.array(x)[[item - 1 for item in viral_binned_before.index]], viral_binned_before, label = 'before invasion', color = 'b')
ax.plot(np.array(x)[[item - 1 for item in viral_binned_after.index]], viral_binned_after, label = 'after invasion', color = 'r')
ax.set_title('UKRAINE prob. distr. viral tweets (>100 retweets)')
ax.set_xlabel('followers increasing (log)')
ax.set_ylabel('fraction of viral tweets')
ax.set_xscale('symlog')
# The lines carry labels but were never identified on the figure; show the
# legend like every other before/after plot in this notebook.
ax.legend()

**SEX**

In [None]:
# Two-column views per period; not used further in this cell.
sex_before = before_data[['sex_generalized', 'viral']]
sex_after = after_data[['sex_generalized', 'viral']]

# Restrict both periods to viral tweets.
viral_before = before_data[before_data.viral == 1]
viral_after = after_data[after_data.viral == 1]


def _sex_shares(viral_frame):
    """Return the fractions of rows whose sex_generalized is 1, -1 and 0.

    The tuple order matches the (men, women, unknown) variables it is
    unpacked into below.
    """
    n_rows = len(viral_frame)
    return tuple(
        len(viral_frame[viral_frame.sex_generalized == code]) / n_rows
        for code in (1, -1, 0)
    )


before_men, before_women, before_un = _sex_shares(viral_before)
after_men, after_women, after_un = _sex_shares(viral_after)

# One row per period, one column per category, for a stacked horizontal bar.
data = pd.DataFrame(
    data = {
        'Before or After Invasion' : ['before', 'after'],
        'men' : [before_men, after_men],
        'women' : [before_women, after_women],
        'unknown' : [before_un, after_un],
    },
    index = [1,2],
)

data.plot(x = 'Before or After Invasion', kind = 'barh', stacked = True, title = 'Sex percentage of viral tweets authors')

**LISTED USERS**

In [None]:
# Bin every tweet by its author's listed count and show, for each retweet
# class, how the tweets of that class are spread over those bins before and
# after the invasion.

# 20 logarithmically spaced edges shared by both periods so the curves are
# directly comparable.
max_listed_before = before_data['public_metrics.listed_count'].max()
max_listed_after = after_data['public_metrics.listed_count'].max()
edges_new = np.logspace(np.log10(1), np.log10(max(max_listed_before, max_listed_after)), 20)
# logspace starts at 10**0 == 1; lower the first edge so accounts with a
# listed count of 0 fall into the first bin instead of outside the range.
edges_new[0] = 0

# Only the (1-based) bin numbers are used below; the per-bin mean retweet
# counts are computed but not plotted in this cell.
bin_means1_retweet, bin_edges, binnumber_b = stats.binned_statistic(
    before_data['public_metrics.listed_count'],
    before_data['public_metrics.retweet_count'],
    statistic='mean',
    bins=edges_new,
)
bin_means2_retweet, bin_edges, binnumber_a = stats.binned_statistic(
    after_data['public_metrics.listed_count'],
    after_data['public_metrics.retweet_count'],
    statistic='mean',
    bins=edges_new,
)

# NOTE: this overwrites the 'bin' column and `x` from any earlier binning
# section; the next cell reads both, so they keep their names.
before_data['bin'] = binnumber_b
after_data['bin'] = binnumber_a

# Arithmetic midpoint of each bin, used as the bin's position on the x-axis.
x = list((edges_new[1:] - edges_new[:-1]) / 2 + edges_new[:-1])
bin_centers = np.array(x)

retweet_class = {1 : 'no retweets', 2 : '1-9 retweets', 3 : '10-99 retweets', 4 : '100-999 retweets', 5 : '1000+ retweets', 6 : 'at least 1 retweet'}
periods = (
    (before_data, 'before invasion', 'b'),
    (after_data, 'after invasion', 'r'),
)

fig, axs = plt.subplots(2, 3, figsize = (17,8))

# One panel per retweet class, filled row by row.
for rt_class, ax in zip(retweet_class, axs.flat):
    for frame, period_label, color in periods:
        # Class 6 is the aggregate panel: every tweet above class 1.
        if rt_class == 6:
            in_class = frame['retweet_class'] > 1
        else:
            in_class = frame['retweet_class'] == rt_class
        class_rows = frame.loc[in_class]

        # Share of this class's tweets per bin. Selecting the column before
        # counting avoids aggregating every column of the frame.
        binned = (
            class_rows.groupby("bin")['public_metrics.retweet_count'].count()
            / class_rows['id'].count()
        )
        # Bin numbers are 1-based, bin_centers is 0-based.
        ax.plot(bin_centers[binned.index.to_numpy() - 1], binned, label = period_label, color = color)

    ax.set_title("UKRAINE prob. distr. - "+ retweet_class[rt_class])
    ax.set_ylabel('fraction of retweets from class')
    ax.set_xlabel('listed count increasing (log)')
    ax.set_xscale('symlog')

# A single legend on the first panel is enough; colours are shared.
axs[0,0].legend(loc = 'upper right')

fig.tight_layout()
# fig.savefig("distribution_retweetclasses.png")

In [None]:
# Distribution of viral tweets over the listed-count bins, per period.
# Relies on `x` (bin centres) and the 'bin' column set by the preceding
# binning cell.
is_viral_before = before_data['viral'] == 1
is_viral_after = after_data['viral'] == 1

# Per-bin count of viral tweets divided by the total number of viral tweets.
viral_binned_before = (
    before_data[is_viral_before].groupby("bin")['viral'].count()
    / before_data.loc[is_viral_before, 'viral'].count()
)
viral_binned_after = (
    after_data[is_viral_after].groupby("bin")['viral'].count()
    / after_data.loc[is_viral_after, 'viral'].count()
)

# Number of bins that contain at least one viral "before" tweet.
print(len(viral_binned_before))

fig, ax = plt.subplots()

centers = np.array(x)
for shares, period_label, line_color in (
    (viral_binned_before, 'before invasion', 'b'),
    (viral_binned_after, 'after invasion', 'r'),
):
    # Bin numbers are 1-based, the centres array is 0-based.
    ax.plot(centers[np.asarray(shares.index) - 1], shares, label = period_label, color = line_color)

ax.set_xscale('symlog')
ax.set_xlabel('list count increasing (log)', fontsize = 12)
ax.set_ylabel('fraction of viral tweets', fontsize = 12)
ax.set_title('Fraction of viral tweets for listed count', fontsize = 13, weight = 'bold')
ax.legend()
fig.tight_layout()
# fig.savefig("viral_list.png")

**TWEET COUNT**

In [None]:
# Bin every tweet by its author's total tweet count and show, for each
# retweet class, how the tweets of that class are spread over those bins
# before and after the invasion.

# 20 logarithmically spaced edges shared by both periods so the curves are
# directly comparable.
max_tweetcount_before = before_data['public_metrics.tweet_count'].max()
max_tweetcount_after = after_data['public_metrics.tweet_count'].max()
edges_new = np.logspace(np.log10(1), np.log10(max(max_tweetcount_before, max_tweetcount_after)), 20)
# logspace starts at 10**0 == 1; lower the first edge so a tweet count of 0
# falls into the first bin instead of outside the range.
edges_new[0] = 0

# Mean retweet count per bin, matching the followers and listed-count
# sections (this cell previously averaged tweet_count over its own bins).
# Only the (1-based) bin numbers are used below, so the plots are unaffected.
bin_means1_retweet, bin_edges, binnumber_b = stats.binned_statistic(
    before_data['public_metrics.tweet_count'],
    before_data['public_metrics.retweet_count'],
    statistic='mean',
    bins=edges_new,
)
bin_means2_retweet, bin_edges, binnumber_a = stats.binned_statistic(
    after_data['public_metrics.tweet_count'],
    after_data['public_metrics.retweet_count'],
    statistic='mean',
    bins=edges_new,
)

# NOTE: this overwrites the 'bin' column and `x` from any earlier binning
# section; the next cell reads both, so they keep their names.
before_data['bin'] = binnumber_b
after_data['bin'] = binnumber_a

# Arithmetic midpoint of each bin, used as the bin's position on the x-axis.
x = list((edges_new[1:] - edges_new[:-1]) / 2 + edges_new[:-1])
bin_centers = np.array(x)

retweet_class = {1 : 'no retweets', 2 : '1-9 retweets', 3 : '10-99 retweets', 4 : '100-999 retweets', 5 : '1000+ retweets', 6 : 'at least 1 retweet'}
periods = (
    (before_data, 'before invasion', 'b'),
    (after_data, 'after invasion', 'r'),
)

fig, axs = plt.subplots(2, 3, figsize = (17,8))

# One panel per retweet class, filled row by row.
for rt_class, ax in zip(retweet_class, axs.flat):
    for frame, period_label, color in periods:
        # Class 6 is the aggregate panel: every tweet above class 1.
        if rt_class == 6:
            in_class = frame['retweet_class'] > 1
        else:
            in_class = frame['retweet_class'] == rt_class
        class_rows = frame.loc[in_class]

        # Share of this class's tweets per bin. Selecting the column before
        # counting avoids aggregating every column of the frame.
        binned = (
            class_rows.groupby("bin")['public_metrics.retweet_count'].count()
            / class_rows['id'].count()
        )
        # Bin numbers are 1-based, bin_centers is 0-based.
        ax.plot(bin_centers[binned.index.to_numpy() - 1], binned, label = period_label, color = color)

    ax.set_title("UKRAINE prob. distr. - "+ retweet_class[rt_class])
    ax.set_ylabel('fraction of retweets from class')
    ax.set_xlabel('tweet count increasing (log)')
    ax.set_xscale('symlog')

# A single legend on the first panel is enough; colours are shared.
axs[0,0].legend(loc = 'upper right')

fig.tight_layout()

In [None]:
# Distribution of viral tweets over the tweet-count bins, per period.
# Relies on `x` (bin centres) and the 'bin' column set by the preceding
# binning cell.
viral_rows_before = before_data.loc[before_data['viral'] == 1]
viral_rows_after = after_data.loc[after_data['viral'] == 1]

# Per-bin count of viral tweets divided by the total number of viral tweets.
viral_binned_before = viral_rows_before.groupby("bin")['viral'].count() / viral_rows_before['viral'].count()
viral_binned_after = viral_rows_after.groupby("bin")['viral'].count() / viral_rows_after['viral'].count()

# Number of bins that contain at least one viral "before" tweet.
print(len(viral_binned_before))

fig, ax = plt.subplots()

centers = np.array(x)
curves = (
    (viral_binned_before, 'before invasion', 'b'),
    (viral_binned_after, 'after invasion', 'r'),
)
for shares, period_label, line_color in curves:
    # Bin numbers are 1-based, the centres array is 0-based.
    ax.plot(centers[np.asarray(shares.index) - 1], shares, label = period_label, color = line_color)

ax.set_xscale('symlog')
ax.set_xlabel('tweet count increasing (log)', fontsize = 12)
ax.set_ylabel('fraction of viral tweets', fontsize = 12)
ax.set_title('Fraction of viral tweets for number of tweets', fontsize = 13, weight = 'bold')
ax.legend()
fig.tight_layout()
# fig.savefig("viral_tweetcount.png")

**URLS COUNT**

In [None]:
# Rows flagged viral (viral == 1) in each period; used by the URL cells below.
viral_before = before_data.loc[before_data['viral'].eq(1)]
viral_after = after_data.loc[after_data['viral'].eq(1)]

In [None]:
# Distinct URL counts that occur among viral "before" tweets (cell output).
viral_before.loc[:, 'urls_count'].unique()

In [None]:
# For every URL-count bucket, the fraction of tweets in that bucket that
# went viral, compared before vs. after the invasion.

# NOTE: sns.set() switches the global matplotlib style, so it also affects
# every figure drawn after this cell.
sns.set()

# Buckets 0..4 are exact URL counts; the last bucket is open-ended (>= 5)
# and is shown as '5/6' on the axis.
url_buckets = [0, 1, 2, 3, 4, 5]
tick_labels = ['0','1','2','3','4','5/6']


def _viral_fraction_per_url_bucket(all_tweets, viral_tweets, buckets):
    """Return, per bucket, (#viral tweets in bucket) / (#all tweets in bucket).

    The last entry of ``buckets`` is treated as open-ended (``>=``), all
    others as exact matches. The same rule is applied to every period so the
    bars are comparable; previously the "before" data used an exact match
    for the last bucket and silently dropped tweets with more URLs.
    Buckets without any tweet yield NaN instead of dividing by zero.
    Prints each bucket together with its total as a quick sanity check.
    """
    last_bucket = buckets[-1]
    fractions = []
    for bucket in buckets:
        if bucket == last_bucket:
            in_bucket_all = all_tweets['urls_count'] >= bucket
            in_bucket_viral = viral_tweets['urls_count'] >= bucket
        else:
            in_bucket_all = all_tweets['urls_count'] == bucket
            in_bucket_viral = viral_tweets['urls_count'] == bucket

        total_url_cat = all_tweets.loc[in_bucket_all, 'urls_count'].count()
        print(bucket, total_url_cat)
        viral_url_cat = viral_tweets.loc[in_bucket_viral, 'urls_count'].count()
        fractions.append(viral_url_cat / total_url_cat if total_url_cat else float('nan'))
    return fractions


before_url = _viral_fraction_per_url_bucket(before_data, viral_before, url_buckets)
after_url = _viral_fraction_per_url_bucket(after_data, viral_after, url_buckets)

# Dedicated names for the bar layout so the bin centres stored in `x` by the
# binning cells are not overwritten.
bar_positions = np.arange(len(url_buckets))  # the label locations
width = 0.35  # the width of the bars

fig, ax = plt.subplots()
rects1 = ax.bar(bar_positions - width/2, before_url, width, label='Before invasion')
rects2 = ax.bar(bar_positions + width/2, after_url, width, label='After invasion', color = 'r')

# Axis labels, title and custom x-axis tick labels.
ax.set_ylabel('Fraction of viral tweets', fontsize = 12)
ax.set_xlabel('Number of URL(s)', fontsize = 12)

ax.set_title('Fraction of viral tweets per URL(s) count', fontsize = 13, weight = 'bold')
ax.set_xticks(bar_positions, tick_labels)
ax.legend()

fig.tight_layout()

plt.show()
# fig.savefig("URLs_count.png")

**ACCOUNT AGE**

In [None]:
# Fraction of viral tweets per account age (in years): sum of the 'viral'
# flag divided by the number of tweets in each age group.
# The column is selected before aggregating so that only 'viral' is reduced;
# summing the whole frame is wasteful and can fail or concatenate strings on
# non-numeric columns in recent pandas versions.
viral_by_age_before = before_data.groupby(by = ['account_age_y'])['viral']
viral_by_age_after = after_data.groupby(by = ['account_age_y'])['viral']

frac_age_before = viral_by_age_before.sum() / viral_by_age_before.count()
frac_age_after = viral_by_age_after.sum() / viral_by_age_after.count()

# Drop the last age group of the "after" data (groupby sorts the keys, so
# this is the highest account_age_y). .iloc makes the positional intent
# explicit regardless of the index dtype.
# NOTE(review): presumably done so both series have the same number of
# entries for the bar chart in the next cell -- confirm.
frac_age_after = frac_age_after.iloc[:-1]
frac_age_after

In [None]:
# Grouped bar chart: viral fraction per account age, before vs. after.
# frac_age_before / frac_age_after come from the previous cell and must each
# hold one value per tick (17 entries).
labels = list(range(17))
x = np.arange(len(labels))  # one tick position per account age
width = 0.35  # bar width; each pair is shifted by half of it around the tick

fig, ax = plt.subplots()
half_width = width / 2
rects1 = ax.bar(x - half_width, frac_age_before, width, label='Before invasion')
rects2 = ax.bar(x + half_width, frac_age_after, width, label='After invasion', color = 'r')

# Title, axis labels and tick labels.
ax.set_title('Fraction of viral tweets', fontsize = 13, weight = 'bold')
ax.set_xlabel('Account age(y)', fontsize = 12)
ax.set_ylabel('Fraction of viral tweets', fontsize = 12)
ax.set_xticks(x, labels)
ax.legend()

fig.tight_layout()