In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from dateutil import parser
import time
import datetime
from sklearn.model_selection import train_test_split

# Reference year, read from the local clock when the notebook runs.
# It is used further down to turn account creation dates into an age in years,
# so re-running the notebook in a later year changes that derived feature.
year = datetime.date.today().year
print("current year: ", year)

**ANALYSE TWEETS BEFORE INVASION**

**Combining tweet and user files and splitting them into train, validation, and test sets**

In [None]:
# merge tweet files
# Both exports are pipe-separated; the 'Unnamed: 0' column is dropped on load and
# the 'created_at' strings are parsed column-wise with dateutil (one parse per
# value instead of building a full row object for every tweet).
before1 = pd.read_csv("before_invasion_tweets.csv", sep = "|").drop(columns = ['Unnamed: 0'])
before1['created_at'] = before1['created_at'].map(parser.parse)
before2 = pd.read_csv("before2_invasion_tweets.csv", sep = "|").drop(columns = ['Unnamed: 0'])
before2['created_at'] = before2['created_at'].map(parser.parse)
# ignore_index gives the combined frame a fresh 0..n-1 index. Without it each
# file keeps its own 0-based labels, so the index contains duplicates that are
# carried into every frame derived from it (including the saved splits).
# NOTE(review): tweets are not de-duplicated; if the two exports can overlap,
# the same tweet may land in different splits - confirm.
before_invasion = pd.concat([before1, before2], ignore_index = True)
print("number of tweets before invasion: ", len(before_invasion))
print("number of unique users based on tweets: ", len(before_invasion.author_id.unique()))

# merge user files
def _load_users(path):
    """Read one pipe-separated user export and prepare it for joining with tweets.

    Drops the 'Unnamed: 0' and 'withheld.*' columns, keeps the first row per
    user 'id', and renames 'id' -> 'author_id' and 'created_at' -> 'account_age'.

    Parameters
    ----------
    path : str
        Path of the user CSV file.

    Returns
    -------
    pandas.DataFrame
        One row per user id found in the file.
    """
    return (
        pd.read_csv(path, sep = "|")
        .drop(columns = ['Unnamed: 0', 'withheld.country_codes', 'withheld.scope'])
        .drop_duplicates(subset = ['id'], keep = 'first')
        .rename(columns = {"id" : "author_id", "created_at" : "account_age"})
    )

users_before1 = _load_users("before_invasion_users.csv")
users_before2 = _load_users("before2_invasion_users.csv")
# The same user can be present in both exports, so de-duplicate again after concatenating.
users_before = pd.concat([users_before1, users_before2]).drop_duplicates(subset = ['author_id'], keep = 'first')
# Parse the creation timestamp column-wise (same dateutil parser, no per-row Series construction).
users_before['account_age'] = users_before['account_age'].map(parser.parse)
# Age in calendar years: difference between the reference `year` and the creation year.
users_before['account_age_y'] = year - users_before['account_age'].map(lambda created: created.year)

# combine tweets and users
# DataFrame.join defaults to a left join: every tweet is kept, and the user
# columns are looked up through the tweet's author_id.
data_before = before_invasion.join(users_before.set_index('author_id'), on = ['author_id'])

# make url boolean feature
# URL is 1 when 'entities.urls' holds a value and 0 when it is null.
data_before['URL'] = data_before['entities.urls'].notna().astype('int64')
# retweeted is 1 when the tweet has at least one retweet, otherwise 0
# (a missing count compares as False and therefore maps to 0, as before).
data_before['retweeted'] = (data_before['public_metrics.retweet_count'] > 0).astype('int64')

# split data into train, validation, and test set
# 70 % goes to training; the remaining 30 % is halved into validation and test.
before_train, rem = train_test_split(
    data_before,
    train_size = 0.7,
    random_state = 42,
    shuffle = True,
)
before_validation, before_test = train_test_split(
    rem,
    test_size = 0.5,
    random_state = 32,
    shuffle = True,
)
print(before_train.shape, before_validation.shape, before_test.shape)

# save files to csv (pipe-separated, index included)
for split_frame, split_path in (
    (before_train, "before_train.csv"),
    (before_validation, "before_val.csv"),
    (before_test, "before_test.csv"),
):
    split_frame.to_csv(split_path, sep = "|")

# show train set and continue with trainset for EDA
before_train

In [None]:
# Print the column names, dtypes and non-null counts of the training split.
before_train.info()

In [None]:
# Number of distinct authors in the training split (a missing id counts as one value).
before_train['author_id'].nunique(dropna = False)

In [None]:
# Remove a fixed set of columns from the training split.
# NOTE: this rebinds `data_before` (until now the full tweet/user table) to the
# reduced *training* frame; later cells that read `data_before` see train data only.
dropped_columns = [
    'geo.place_id',
    'entities.mentions',
    'withheld.copyright',
    'withheld.country_codes',
    'entities.cashtags',
    'withheld.scope',
]
data_before = before_train.drop(columns = dropped_columns)

# Timestamp and count columns that go into the summary table.
described_columns = [
    'created_at',
    'public_metrics.retweet_count',
    'public_metrics.reply_count',
    'public_metrics.like_count',
    'public_metrics.quote_count',
    'account_age',
    'public_metrics.followers_count',
    'public_metrics.following_count',
    'public_metrics.tweet_count',
    'public_metrics.listed_count',
    'account_age_y',
]
subset = data_before[described_columns]
# print(subset.describe().to_latex(caption = "Data before invasion: description of numerical values."))
subset.describe()

In [None]:
# Retweet statistics for the pre-invasion training split.
# Everything is read from before_train, so the cell does not depend on
# `data_before` having been rebound to the training split by another cell.
retweets = before_train['public_metrics.retweet_count']
number_of_tweets = len(before_train)

print("percentage of tweets retweeted")
print("never: ", int((retweets == 0).sum()) / number_of_tweets * 100)
# Share of tweets whose retweet count is strictly above each threshold.
for threshold in (0, 1, 5, 10, 50, 100):
    print(f"more than {threshold}: ", int((retweets > threshold).sum()) / number_of_tweets * 100)

# Draw on an explicit new figure so the histogram can never be layered on top of
# a plot left over from earlier code. Only counts inside [5, 200] are binned.
fig, ax = plt.subplots()
ax.hist(retweets, bins = 100, range = [5, 200])
ax.set_xlabel("Number of retweets", fontsize = 13)
ax.set_ylabel("Count", fontsize = 13)
fig.savefig("retweet_count_before.jpg")

retweet_count = retweets.sum()

print("total number of retweets: ", retweet_count)
# Retweets as a share of all tweets when each original and its retweets are counted.
print("percentage of total (original + its retweets): ", (retweet_count/(retweet_count + number_of_tweets)) * 100)

**ANALYSE TWEETS AFTER INVASION**

In [None]:
# load tweet file (a single pipe-separated export; nothing to merge here)
after_invasion = pd.read_csv("after_invasion_tweets.csv", sep = "|").drop(columns = ['Unnamed: 0'])
# Parse the 'created_at' strings column-wise with dateutil.
after_invasion['created_at'] = after_invasion['created_at'].map(parser.parse)
# The label previously read "before invasion" although this frame holds the after-invasion tweets.
print("number of tweets after invasion: ", len(after_invasion))
print("number of unique users based on tweets: ", len(after_invasion.author_id.unique()))

# load user file (single export): drop the 'Unnamed: 0' and 'withheld.*' columns,
# keep the first row per user id, and rename the columns to match the tweet table
users_after = (
    pd.read_csv("after_invasion_users.csv", sep = "|")
    .drop(columns = ['Unnamed: 0', 'withheld.country_codes', 'withheld.scope'])
    .drop_duplicates(subset = ['id'], keep = 'first')
    .rename(columns = {"id" : "author_id", "created_at" : "account_age"})
)
# Parse the creation timestamp column-wise (same dateutil parser, no per-row Series construction).
users_after['account_age'] = users_after['account_age'].map(parser.parse)
# Age in calendar years: difference between the reference `year` and the creation year.
users_after['account_age_y'] = year - users_after['account_age'].map(lambda created: created.year)

# combine tweets and users
# DataFrame.join defaults to a left join: every tweet is kept, and the user
# columns are looked up through the tweet's author_id.
data_after = after_invasion.join(users_after.set_index('author_id'), on = ['author_id'])

# make url boolean feature
# URL is 1 when 'entities.urls' holds a value and 0 when it is null.
data_after['URL'] = data_after['entities.urls'].notna().astype('int64')
# retweeted is 1 when the tweet has at least one retweet, otherwise 0
# (a missing count compares as False and therefore maps to 0, as before).
data_after['retweeted'] = (data_after['public_metrics.retweet_count'] > 0).astype('int64')

# split data into train, validation, and test set
# 70 % goes to training; the remaining 30 % is halved into validation and test.
after_train, rem_after = train_test_split(
    data_after,
    train_size = 0.7,
    random_state = 42,
    shuffle = True,
)
after_validation, after_test = train_test_split(
    rem_after,
    test_size = 0.5,
    random_state = 32,
    shuffle = True,
)
print(after_train.shape, after_validation.shape, after_test.shape)

# save files to csv (pipe-separated, index included)
for split_frame, split_path in (
    (after_train, "after_train.csv"),
    (after_validation, "after_val.csv"),
    (after_test, "after_test.csv"),
):
    split_frame.to_csv(split_path, sep = "|")

# show train set and continue with trainset for EDA
after_train

In [None]:
# Retweet statistics for the post-invasion training split.
retweets = after_train['public_metrics.retweet_count']
number_of_tweets = len(after_train)

print("percentage of tweets retweeted")
print("never: ", int((retweets == 0).sum()) / number_of_tweets * 100)
# Share of tweets whose retweet count is strictly above each threshold.
for threshold in (0, 1, 5, 10, 50, 100):
    print(f"more than {threshold}: ", int((retweets > threshold).sum()) / number_of_tweets * 100)

# Draw on an explicit new figure so the histogram can never be layered on top of
# a plot left over from earlier code. Only counts inside [5, 200] are binned.
fig, ax = plt.subplots()
ax.hist(retweets, bins = 100, range = [5, 200])
ax.set_xlabel("Number of retweets", fontsize = 13)
ax.set_ylabel("Count", fontsize = 13)
fig.savefig("retweet_count_after.jpg")

retweet_count = retweets.sum()

print("total number of retweets: ", retweet_count)
# Retweets as a share of all tweets when each original and its retweets are counted.
print("percentage of total (original + its retweets): ", (retweet_count/(retweet_count + number_of_tweets)) * 100)