In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from dateutil import parser
import datetime
from scipy import stats
import math
from scipy.stats import ks_2samp 

# calendar year at run time; the account_age_y columns computed further down are relative to it
year = datetime.date.today().year
print("current year: ", year)

In [None]:
# load the tweets collected before covid
before = pd.read_csv("covid_before_tweets.csv", sep = "|").drop(columns = ['Unnamed: 0'])
before = before.dropna(subset = ['public_metrics.retweet_count'])
# rows whose author_id holds the literal 'en' cannot be converted to an integer id below;
# .copy() makes the filtered frame independent so the column assignments do not warn
before = before[before.author_id != 'en'].copy()
# convert column-wise and only once (previously a row-wise apply that was executed twice)
before['author_id'] = before['author_id'].astype('int64')
before['created_at'] = before['created_at'].map(parser.parse)
print("number of tweets before covid: ", len(before))
print("number of unique users based on tweets: ", before['author_id'].nunique())

# load the user records: one row per user id, renamed so they can be joined on author_id
users_before = (
    pd.read_csv("covid_before_users.csv", sep = "|")
    .drop(columns = ['Unnamed: 0', 'withheld.country_codes', 'withheld.scope'])
    .drop_duplicates(subset = ['id'], keep = 'first')
    .rename(columns = {"id" : "author_id", "created_at" : "account_age"})
)
users_before['account_age'] = users_before['account_age'].map(parser.parse)
# account age in whole calendar years relative to `year` (the year in which the notebook is run)
users_before['account_age_y'] = users_before['account_age'].map(lambda created: year - created.year)

# combine tweets and users (left join: tweets without a matching user keep NaN in the user columns)
before_covid_data = before.join(users_before.set_index('author_id'), on = ['author_id'])
before_covid_data

In [None]:
# merge tweet files (two collection files); ignore_index gives the combined frame a unique row index
after1 = pd.read_csv("covid_after_tweets.csv", sep = "|").drop(columns = ['Unnamed: 0'])
after2 = pd.read_csv("covid_afterEXTRA_tweets.csv", sep = "|").drop(columns = ['Unnamed: 0'])
after_tweets = pd.concat([after1, after2], ignore_index = True)
after_tweets = after_tweets.dropna(subset=['public_metrics.retweet_count'])
# rows whose author_id holds the literal 'en' cannot be converted to an integer id below;
# .copy() makes the filtered frame independent so the column assignment does not warn
after_tweets = after_tweets[after_tweets.author_id != 'en'].copy()
# same integer conversion as for the "before" tweets, so that the join key has the
# same type as the user ids it is matched against
after_tweets['author_id'] = after_tweets['author_id'].astype('int64')
# NOTE: created_at is left as read from the CSV here (it is not parsed into datetimes)

print("number of tweets after covid: ", len(after_tweets))
print("number of unique users based on tweets: ", after_tweets['author_id'].nunique())

# merge user files: one row per user id in each file, then once more across both files
users_after1 = (
    pd.read_csv("covid_after_users.csv", sep = "|")
    .drop(columns = ['Unnamed: 0', 'withheld.country_codes', 'withheld.scope'])
    .drop_duplicates(subset = ['id'], keep = 'first')
    .rename(columns = {"id" : "author_id", "created_at" : "account_age"})
)
users_after2 = (
    pd.read_csv("covid_afterEXTRA_users.csv", sep = "|")
    .drop(columns = ['Unnamed: 0', 'withheld.country_codes'])
    .drop_duplicates(subset = ['id'], keep = 'first')
    .rename(columns = {"id" : "author_id", "created_at" : "account_age"})
)
users_after = pd.concat([users_after1, users_after2]).drop_duplicates(subset = ['author_id'], keep = 'first')
users_after['account_age'] = users_after['account_age'].map(parser.parse)
# account age in whole calendar years relative to `year` (the year in which the notebook is run)
users_after['account_age_y'] = users_after['account_age'].map(lambda created: year - created.year)

# combine tweets and users (left join: tweets without a matching user keep NaN in the user columns)
after_covid_data = after_tweets.join(users_after.set_index('author_id'), on = ['author_id'])
after_covid_data

In [None]:
# keep only rows that have a followers count
# (presumably tweets whose author was matched in the user table - verify against the join above)
after_covid_data = after_covid_data.dropna(axis = 'index', how = 'any', subset = ['public_metrics.followers_count'])
after_covid_data

In [None]:
# keep only rows that have a followers count
# (presumably tweets whose author was matched in the user table - verify against the join above)
before_covid_data = before_covid_data.dropna(axis = 'index', how = 'any', subset = ['public_metrics.followers_count'])
before_covid_data

In [None]:
# Cast the count metrics to integers in both data sets.
# A column-wise astype replaces the former row-wise apply(int): same values,
# far fewer Python-level calls, and it also works on an empty frame.
for frame in (before_covid_data, after_covid_data):
    for metric in ('public_metrics.followers_count',
                   'public_metrics.retweet_count',
                   'public_metrics.like_count',
                   'public_metrics.reply_count'):
        frame[metric] = frame[metric].astype('int64')


In [None]:
def retweet_group(data, groups):
    """Return the key of the first interval in ``groups`` that contains ``data``.

    ``groups`` maps a label to a ``[lower, upper]`` pair. The lower bound is
    inclusive, the upper bound exclusive. Intervals are tested in the
    mapping's iteration order; ``None`` is returned when none of them matches.
    """
    for label, bounds in groups.items():
        lower, upper = bounds[0], bounds[1]

        # half-open interval test: lower <= data < upper
        if data >= lower and data < upper:
            return label

    return None
        
# assign tweets to retweet class: label -> [lower, upper) range of retweet counts
groups = {1 : [0, 1], 2 : [1, 10], 3 : [10, 100], 4 : [100, 1000], 5 : [1000, 100000000000]}

for frame in (before_covid_data, after_covid_data):
    retweets = frame['public_metrics.retweet_count']
    # map over the single column instead of applying a lambda to every full row
    frame['retweet_class'] = retweets.map(lambda count: retweet_group(count, groups))
    # 1 if the tweet was retweeted at least once, else 0
    frame['retweet_bool'] = (retweets > 0).astype('int64')

print("count retweet classes BEFORE", before_covid_data.groupby('retweet_class')['id'].count())
print("count retweet classes AFTER", after_covid_data.groupby('retweet_class')['id'].count())


In [None]:
sns.set()

# Bin edges over the follower counts of both periods: 20 log-spaced edges from 1 up to
# the largest follower count, with the lowest edge moved to 0 so that accounts
# without followers fall into the first bin.
# (the max_listed_* names hold follower counts in this cell)
max_listed_before = before_covid_data['public_metrics.followers_count'].max()
max_listed_after = after_covid_data['public_metrics.followers_count'].max()
edges_new = np.logspace(np.log10(1),np.log10(max(max_listed_before, max_listed_after)), 20)
edges_new[0] = 0

# besides the per-bin mean, binned_statistic returns the bin number of every tweet
bin_means1_retweet, bin_edges, binnumber_b = stats.binned_statistic(before_covid_data['public_metrics.followers_count'], before_covid_data['public_metrics.retweet_count'], statistic = 'mean', bins = edges_new)
bin_means2_retweet, bin_edges, binnumber_a = stats.binned_statistic(after_covid_data['public_metrics.followers_count'], after_covid_data['public_metrics.retweet_count'], statistic = 'mean', bins = edges_new)

after_covid_data['bin'] = binnumber_a
before_covid_data['bin'] = binnumber_b

# arithmetic midpoint of each bin, used as its position on the x-axis
x = [(right - left) / 2 + left for left, right in zip(edges_new[:-1], edges_new[1:])]
bin_centres = np.array(x)

retweet_class = {1 : 'no retweets', 2 : '1-9 retweets', 3 : '10-99 retweets', 4 : '100-999 retweets', 5 : '1000+ retweets', 6 : 'at least 1 retweet'}

fig, axs = plt.subplots(2, 3, figsize = (17,8))

# one panel per retweet class, filled row by row
for rt_class, ax in zip(retweet_class, axs.flat):

    # class 6 is not stored in the data: it pools every tweet with at least one retweet
    if rt_class == 6:
        class_before = before_covid_data[before_covid_data['retweet_class'] > 1]
        class_after = after_covid_data[after_covid_data['retweet_class'] > 1]
    else:
        class_before = before_covid_data[before_covid_data['retweet_class'] == rt_class]
        class_after = after_covid_data[after_covid_data['retweet_class'] == rt_class]

    # share of the class' tweets that falls into each bin
    binned_before = class_before.groupby("bin")['public_metrics.retweet_count'].count() / class_before['id'].count()
    binned_after = class_after.groupby("bin")['public_metrics.retweet_count'].count() / class_after['id'].count()

    # binned_statistic numbers the in-range bins from 1, hence the -1 when looking up the bin centre
    ax.plot(bin_centres[[item - 1 for item in binned_before.index]], binned_before, label = 'before covid', color = 'b')
    ax.plot(bin_centres[[item - 1 for item in binned_after.index]], binned_after, label = 'after covid', color = 'r')
    ax.set_title("COVID prob. distr. - "+ retweet_class[rt_class])
    ax.set_ylabel('fraction of retweets from class')
    ax.set_xlabel('followers count increasing (log)')
    ax.set_xscale('symlog')
    ax.set_xlim(1, 200000000)

# a single legend in the first panel is enough, all panels share the same two lines
axs[0,0].legend(loc = 'upper right')

fig.tight_layout()
# fig.savefig("COVID_distribution_retweetclasses.png")

In [None]:
sns.set()

# "viral" tweets are those in retweet class 4 or 5, i.e. with at least 100 retweets.
# NOTE: `x` and the 'bin' column are whatever the most recently executed binning cell
# produced; the title and axis label below assume the followers-count binning.
viral_before = before_covid_data[before_covid_data['retweet_class'] > 3]
viral_after = after_covid_data[after_covid_data['retweet_class'] > 3]

# share of all viral tweets that falls into each bin
viral_binned_before = viral_before.groupby("bin")['id'].count() / viral_before['id'].count()
viral_binned_after = viral_after.groupby("bin")['id'].count() / viral_after['id'].count()

fig, ax = plt.subplots()

# bin numbers are 1-based, the list of bin centres `x` is 0-based
ax.plot(np.array(x)[[item - 1 for item in viral_binned_before.index]], viral_binned_before, label = 'before covid', color = 'b')
ax.plot(np.array(x)[[item - 1 for item in viral_binned_after.index]], viral_binned_after, label = 'after covid', color = 'r')
ax.legend(loc = 'upper left', fontsize = 15)
ax.set_title('COVID - Fraction of viral tweets for number of followers', size = 13, weight = 'bold')
ax.set_xlabel('followers increasing (log)', size = 12)
ax.set_ylabel('fraction of viral tweets', size = 12)
ax.set_xscale('symlog')
fig.tight_layout()
fig.savefig("COVID_viral_followers.png")

In [None]:
# Bin edges over the listed counts of both periods: 20 log-spaced edges from 1 up to
# the largest listed count, with the lowest edge moved to 0 so that accounts
# with a listed count of 0 fall into the first bin.
max_listed_before = before_covid_data['public_metrics.listed_count'].max()
max_listed_after = after_covid_data['public_metrics.listed_count'].max()
edges_new = np.logspace(np.log10(1),np.log10(max(max_listed_before, max_listed_after)), 20)
edges_new[0] = 0

# besides the per-bin mean, binned_statistic returns the bin number of every tweet
bin_means1_retweet, bin_edges, binnumber_b = stats.binned_statistic(before_covid_data['public_metrics.listed_count'], before_covid_data['public_metrics.retweet_count'], statistic = 'mean', bins = edges_new)
bin_means2_retweet, bin_edges, binnumber_a = stats.binned_statistic(after_covid_data['public_metrics.listed_count'], after_covid_data['public_metrics.retweet_count'], statistic = 'mean', bins = edges_new)

# overwrites the 'bin' column of any earlier binning
after_covid_data['bin'] = binnumber_a
before_covid_data['bin'] = binnumber_b

# arithmetic midpoint of each bin, used as its position on the x-axis
x = [(right - left) / 2 + left for left, right in zip(edges_new[:-1], edges_new[1:])]
bin_centres = np.array(x)

retweet_class = {1 : 'no retweets', 2 : '1-9 retweets', 3 : '10-99 retweets', 4 : '100-999 retweets', 5 : '1000+ retweets', 6 : 'at least 1 retweet'}

fig, axs = plt.subplots(2, 3, figsize = (17,8))

# one panel per retweet class, filled row by row
for rt_class, ax in zip(retweet_class, axs.flat):

    # class 6 is not stored in the data: it pools every tweet with at least one retweet
    if rt_class == 6:
        class_before = before_covid_data[before_covid_data['retweet_class'] > 1]
        class_after = after_covid_data[after_covid_data['retweet_class'] > 1]
    else:
        class_before = before_covid_data[before_covid_data['retweet_class'] == rt_class]
        class_after = after_covid_data[after_covid_data['retweet_class'] == rt_class]

    # share of the class' tweets that falls into each bin
    binned_before = class_before.groupby("bin")['public_metrics.retweet_count'].count() / class_before['id'].count()
    binned_after = class_after.groupby("bin")['public_metrics.retweet_count'].count() / class_after['id'].count()

    # binned_statistic numbers the in-range bins from 1, hence the -1 when looking up the bin centre
    ax.plot(bin_centres[[item - 1 for item in binned_before.index]], binned_before, label = 'before covid', color = 'b')
    ax.plot(bin_centres[[item - 1 for item in binned_after.index]], binned_after, label = 'after covid', color = 'r')
    ax.set_title("COVID prob. distr. - "+ retweet_class[rt_class])
    ax.set_ylabel('fraction of retweets from class')
    ax.set_xlabel('listed count increasing (log)')
    ax.set_xscale('symlog')
    # axis limits are left to matplotlib in this figure

# a single legend in the first panel is enough, all panels share the same two lines
axs[0,0].legend(loc = 'upper right')

fig.tight_layout()
# fig.savefig("distribution_retweetclasses.png")

In [None]:
sns.set()

# "viral" tweets are those in retweet class 4 or 5, i.e. with at least 100 retweets.
# NOTE: `x` and the 'bin' column are whatever the most recently executed binning cell
# produced; the title and axis label below assume the listed-count binning.
viral_before = before_covid_data[before_covid_data['retweet_class'] > 3]
viral_after = after_covid_data[after_covid_data['retweet_class'] > 3]

# share of all viral tweets that falls into each bin
viral_binned_before = viral_before.groupby("bin")['id'].count() / viral_before['id'].count()
viral_binned_after = viral_after.groupby("bin")['id'].count() / viral_after['id'].count()

fig, ax = plt.subplots()

# bin numbers are 1-based, the list of bin centres `x` is 0-based
ax.plot(np.array(x)[[item - 1 for item in viral_binned_before.index]], viral_binned_before, label = 'before covid', color = 'b')
ax.plot(np.array(x)[[item - 1 for item in viral_binned_after.index]], viral_binned_after, label = 'after covid', color = 'r')
ax.legend(loc = 'upper left', fontsize = 15)
ax.set_title('COVID - Fraction of viral tweets for number of listed users', size = 13, weight = 'bold')
ax.set_xlabel('listed count increasing (log)', size = 12)
ax.set_ylabel('fraction of viral tweets', size = 12)
ax.set_xscale('symlog')
fig.tight_layout()

In [None]:
# Bin edges over the tweet counts of both periods: 20 log-spaced edges from 1 up to
# the largest tweet count, with the lowest edge moved to 0.
# (the max_listed_* names hold tweet counts in this cell)
max_listed_before = before_covid_data['public_metrics.tweet_count'].max()
max_listed_after = after_covid_data['public_metrics.tweet_count'].max()
edges_new = np.logspace(np.log10(1),np.log10(max(max_listed_before, max_listed_after)), 20)
edges_new[0] = 0

# besides the per-bin mean, binned_statistic returns the bin number of every tweet
bin_means1_retweet, bin_edges, binnumber_b = stats.binned_statistic(before_covid_data['public_metrics.tweet_count'], before_covid_data['public_metrics.retweet_count'], statistic = 'mean', bins = edges_new)
bin_means2_retweet, bin_edges, binnumber_a = stats.binned_statistic(after_covid_data['public_metrics.tweet_count'], after_covid_data['public_metrics.retweet_count'], statistic = 'mean', bins = edges_new)

# overwrites the 'bin' column of any earlier binning
after_covid_data['bin'] = binnumber_a
before_covid_data['bin'] = binnumber_b

# arithmetic midpoint of each bin, used as its position on the x-axis
x = [(right - left) / 2 + left for left, right in zip(edges_new[:-1], edges_new[1:])]
bin_centres = np.array(x)

retweet_class = {1 : 'no retweets', 2 : '1-9 retweets', 3 : '10-99 retweets', 4 : '100-999 retweets', 5 : '1000+ retweets', 6 : 'at least 1 retweet'}

fig, axs = plt.subplots(2, 3, figsize = (17,8))

# one panel per retweet class, filled row by row
for rt_class, ax in zip(retweet_class, axs.flat):

    # class 6 is not stored in the data: it pools every tweet with at least one retweet
    if rt_class == 6:
        class_before = before_covid_data[before_covid_data['retweet_class'] > 1]
        class_after = after_covid_data[after_covid_data['retweet_class'] > 1]
    else:
        class_before = before_covid_data[before_covid_data['retweet_class'] == rt_class]
        class_after = after_covid_data[after_covid_data['retweet_class'] == rt_class]

    # share of the class' tweets that falls into each bin
    binned_before = class_before.groupby("bin")['public_metrics.retweet_count'].count() / class_before['id'].count()
    binned_after = class_after.groupby("bin")['public_metrics.retweet_count'].count() / class_after['id'].count()

    # binned_statistic numbers the in-range bins from 1, hence the -1 when looking up the bin centre
    ax.plot(bin_centres[[item - 1 for item in binned_before.index]], binned_before, label = 'before covid', color = 'b')
    ax.plot(bin_centres[[item - 1 for item in binned_after.index]], binned_after, label = 'after covid', color = 'r')
    ax.set_title("COVID prob. distr. - "+ retweet_class[rt_class])
    ax.set_ylabel('fraction of retweets from class')
    ax.set_xlabel('tweet count increasing (log)')
    ax.set_xscale('symlog')

# a single legend in the first panel is enough, all panels share the same two lines
axs[0,0].legend(loc = 'upper right')

fig.tight_layout()
# fig.savefig("distribution_retweetclasses.png")

In [None]:
sns.set()

# "viral" tweets are those in retweet class 4 or 5, i.e. with at least 100 retweets.
# NOTE: `x` and the 'bin' column are whatever the most recently executed binning cell
# produced; the title and axis label below assume the tweet-count binning.
viral_before = before_covid_data[before_covid_data['retweet_class'] > 3]
viral_after = after_covid_data[after_covid_data['retweet_class'] > 3]

# share of all viral tweets that falls into each bin
viral_binned_before = viral_before.groupby("bin")['id'].count() / viral_before['id'].count()
viral_binned_after = viral_after.groupby("bin")['id'].count() / viral_after['id'].count()

fig, ax = plt.subplots()

# bin numbers are 1-based, the list of bin centres `x` is 0-based
ax.plot(np.array(x)[[item - 1 for item in viral_binned_before.index]], viral_binned_before, label = 'before covid', color = 'b')
ax.plot(np.array(x)[[item - 1 for item in viral_binned_after.index]], viral_binned_after, label = 'after covid', color = 'r')
ax.legend(loc = 'upper left', fontsize = 15)
ax.set_title('COVID - Fraction of viral tweets for tweet count', size = 13, weight = 'bold')
ax.set_xlabel('tweet count increasing (log)', size = 12)
ax.set_ylabel('fraction of viral tweets', size = 12)
ax.set_xscale('symlog')
fig.tight_layout()