### Installing packages

In [None]:
!pip install statsmodels tqdm

### Importing packages

In [1]:
import os, pickle
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
from datetime import date
from tqdm import tqdm

### Global variables

In [2]:
dataset_folder = "full_dataset" # folder with the full dataset
parts = [] # list to store contributions from each file, for merging into one df later

In [7]:
log_dataset_cols = ["log_retweets_per_follower", "topic_log_probability", "Anger_log", "Disgust_log", "Fear_log", "Joy_log",
                    "Sadness_log", "Surprise_log", "Trust_log", "Anticipation_log", "user_verified", "retweet_count", "user_followers_count"]

non_log_dataset_cols = ["retweets_per_follower", "topic_probability", "Anger", "Disgust", "Fear", "Joy", 
                        "Sadness", "Surprise", "Trust", "Anticipation", "user_verified", "retweet_count", "user_followers_count"]

common_cols = ["user_created_at", "media", "topic_id"]

topic_bool_columns = ["topic_is_not_disinformation", "topic_is_trolling", "topic_is_out_of_scope", "topic_is_debatable", 
                 "topic_is_conspiracy_theory"]

new_cols = topic_bool_columns + ["media_bool", "account_age"]
# Columns that need to be processed in runtime and appended to the dataset

In [4]:
def account_age(x):
    y = x.date()
    ref_date = date(2020, 12, 1)

    return (ref_date-y).days / 365

In [27]:
def create_regression_dataset(save_as, type="log"):
    parts = []
    
    if type == "log":
        cols_list = log_dataset_cols
        output_cols = cols_list + new_cols + ["Emotion_prob_log"]
    elif type == "non_log":
        cols_list = non_log_dataset_cols
        output_cols = cols_list + new_cols + ["Emotion_prob"]
    
    for file in tqdm(os.listdir(dataset_folder)):
        df = pd.read_csv(os.path.join(dataset_folder, file))
        df.dropna(subset=['topic_id', "log_retweets_per_follower"], inplace=True)

        part = df[common_cols + cols_list]

        parts.append(part)

    df = pd.concat(parts, ignore_index=True)

    df["media_bool"] = df["media"].notnull()
    df["account_age"] = pd.to_datetime(df["user_created_at"], format='%a %b %d %H:%M:%S %z %Y').apply(account_age)

    for col in topic_bool_columns:
        df[col] = False
    
    if type == "log":
        df["Emotion_prob_log"] = df[["Anger_log", "Disgust_log", "Fear_log", "Joy_log", "Sadness_log", "Surprise_log", "Trust_log", "Anticipation_log"]].max(axis=1)
    elif type == "non_log":
        df["Emotion_prob"] = df[["Anger", "Disgust", "Fear", "Joy", "Sadness", "Surprise", "Trust", "Anticipation"]].max(axis=1)

    topic_id_topic_cat_dict = pd.read_pickle("topic_id_topic_cat_dict")

    for col in topic_bool_columns:
        df["topic_type"] = df["topic_id"].map(topic_id_topic_cat_dict)
        df[col][df["topic_type"] == col] = True

    df[output_cols].to_csv(f"{save_as}.csv", index=False)

### Extract the columns necessary for regression

In [20]:
create_regression_dataset("log_dataset", type="log") # Extract columns for the log linear regression

  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [13:46<00:00,  9.19s/it]


In [28]:
create_regression_dataset("non_log_dataset", type="non_log") # Extract columns for the negative binomial regression

  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
100%|██████████████████████████████████████████████████████████████████████████████████| 90/90 [12:09<00:00,  8.11s/it]


### Running regressions

#### Negative Binomial

In [29]:
df = pd.read_csv("non_log_dataset.csv") # loading in the non log dataset

In [20]:
# Formula for the regression
# To add topic probability, use column `topic_probability`
# To add emotion probability, use column `Emotion_prob`

prominent_emotion_formula = '''retweets_per_follower ~ Anger +
Disgust +
Fear +
Joy +
Sadness +
Surprise + 
Trust + 
Anticipation + 
topic_is_not_disinformation +
topic_is_trolling +
topic_is_out_of_scope +
topic_is_debatable +
topic_is_conspiracy_theory +
media_bool + 
user_verified + 
account_age'''

In [22]:
correlation = df[["retweets_per_follower", "media_bool", "user_verified", "account_age"] + topic_bool_columns].corr()

correlation.to_csv("correlation_non_log.csv")

In [28]:
no_of_regression_samples = 300000 # No of rows to run the reg on. Use None to run on the entire dataset.

In [26]:
# Run regression

nbr = smf.glm(prominent_emotion_formula, data=df.iloc[:no_of_regression_samples, :], family=sm.families.NegativeBinomial()).fit()
print(nbr.summary())

                   Generalized Linear Model Regression Results                   
Dep. Variable:     retweets_per_follower   No. Observations:               300000
Model:                               GLM   Df Residuals:                   299984
Model Family:           NegativeBinomial   Df Model:                           15
Link Function:                       Log   Scale:                          1.0000
Method:                             IRLS   Log-Likelihood:                -27507.
Date:                   Wed, 16 Mar 2022   Deviance:                       29617.
Time:                           19:49:12   Pearson chi2:                 2.10e+08
No. Iterations:                      100   Pseudo R-squ. (CS):            0.07067
Covariance Type:               nonrobust                                         
                                          coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------

#### Log Linear Regression

In [31]:
df = pd.read_csv("log_dataset.csv")

In [22]:
# To add topic probability, use column `topic_log_probability`
# To add emotion probability, use column `Emotion_prob_log`

prominent_emotion_formula_log = '''log_retweets_per_follower ~ Anger_log +
Disgust_log +
Fear_log +
Joy_log +
Sadness_log +
Surprise_log + 
Trust_log + 
Anticipation_log + 
topic_is_not_disinformation +
topic_is_trolling +
topic_is_out_of_scope +
topic_is_debatable +
topic_is_conspiracy_theory +
media_bool + 
user_verified + 
account_age'''

In [23]:
correlation = df[["log_retweets_per_follower", "media_bool", "user_verified", "account_age"] + topic_bool_columns].corr()

correlation.to_csv("correlation_log.csv", index=False)

In [37]:
no_of_regression_samples = None # No of rows to run the reg on. Use None to run on the entire dataset.

In [38]:
linreg = smf.ols(prominent_emotion_formula_log, data=df.iloc[:no_of_regression_samples, :]).fit()
print(linreg.summary())

                                OLS Regression Results                               
Dep. Variable:     log_retweets_per_follower   R-squared:                       0.428
Model:                                   OLS   Adj. R-squared:                  0.428
Method:                        Least Squares   F-statistic:                 2.626e+05
Date:                       Wed, 16 Mar 2022   Prob (F-statistic):               0.00
Time:                               19:51:37   Log-Likelihood:            -6.9558e+06
No. Observations:                    5626408   AIC:                         1.391e+07
Df Residuals:                        5626391   BIC:                         1.391e+07
Df Model:                                 16                                         
Covariance Type:                   nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------