### Installing packages

In [None]:
!pip install statsmodels tqdm

### Importing packages

In [1]:
import os, pickle
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import statsmodels.api as sm
from datetime import date
from tqdm import tqdm

### Global variables

In [2]:
dataset_folder = "new_full_dataset" # folder with the full dataset
parts = [] # list to store contributions from each file, for merging into one df later

In [6]:
log_dataset_cols = ["log_retweets_per_follower", "topic_log_probability", "Anger_log", "Disgust_log", "Fear_log", "Joy_log",
                    "Sadness_log", "Surprise_log", "Trust_log", "Anticipation_log", "user_verified", "suspended_tweet", "retweet_count", "user_followers_count"]

non_log_dataset_cols = ["retweets_per_follower", "topic_probability", "Anger", "Disgust", "Fear", "Joy", 
                        "Sadness", "Surprise", "Trust", "Anticipation", "user_verified", "suspended_tweet", "retweet_count", "user_followers_count"]

logit_cols = ["Anger_log", "Disgust_log", "Fear_log", "Joy_log",
            "Sadness_log", "Surprise_log", "Trust_log", "Anticipation_log", "user_verified", "suspended_tweet", "retweet_count", "user_followers_count"]

common_cols = ["user_created_at", "media", "topic_id"]

topic_bool_columns = ["topic_is_not_disinformation", "topic_is_trolling", "topic_is_out_of_scope", "topic_is_debatable", 
                 "topic_is_conspiracy_theory"]

new_cols = topic_bool_columns + ["media_bool", "account_age"]
# Columns that need to be processed in runtime and appended to the dataset

In [4]:
def account_age(x):
    y = x.date()
    ref_date = date(2020, 12, 1)

    return (ref_date-y).days / 365

In [5]:
def create_regression_dataset(save_as, typ="log"):
    parts = []
    
    if typ == "log":
        cols_list = log_dataset_cols
        output_cols = cols_list + new_cols + ["Emotion_prob_log"]
    elif typ == "non_log":
        cols_list = non_log_dataset_cols
        output_cols = cols_list + new_cols + ["Emotion_prob"]
    elif typ == "logit":
        cols_list = logit_cols
#         logit_cols.remove("retweet_count")
#         output_cols = logit_cols + ["has_been_retweeted", "media_bool", "account_age"]
        output_cols = logit_cols + new_cols + ["retweet_count_gte_one"]
        
    for file in tqdm(os.listdir(dataset_folder)):
        df = pd.read_csv(os.path.join(dataset_folder, file))
        
#         # Correction course:
#         df["retweet_count"] = df["retweet_count"].combine_first(df["retweet_count_before_susp"])
        
#         df["retweets_per_follower"] = df.retweet_count.divide(
#             df.user_followers_count
#         )

#         df.loc[
#             ~np.isfinite(df["retweets_per_follower"]), "retweets_per_follower"
#         ] = np.nan

#         df["log_retweets_per_follower"] = np.log10(df["retweets_per_follower"])

#         df.loc[
#             ~np.isfinite(df["log_retweets_per_follower"]), "log_retweets_per_follower"
#         ] = np.nan
        
#         df.to_csv(f"new_full_dataset/{file}", index=False)
        
        # If no topic_id assigned (i.e. tweet was used to train LDA), drop it
        df.dropna(subset=["topic_id"], inplace=True)
        
        if typ == "log":
            # If the log_retweets_per_follower (derived from retweets_per_follower) is Nan, drop the tweet
            df.dropna(subset=["log_retweets_per_follower"], inplace=True)
        elif typ == "non_log":
            df.dropna(subset=["retweets_per_follower"], inplace=True)
            
#         elif typ == "logit":
#             df.dropna(subset=['topic_id'], inplace=True)

        part = df[common_cols + cols_list]

        parts.append(part)

    print(f'{df["retweet_count"].isnull().sum()} tweets had no retweet count')
    
    df = pd.concat(parts, ignore_index=True)

    df["media_bool"] = df["media"].notnull()
    df["account_age"] = pd.to_datetime(df["user_created_at"], format='%a %b %d %H:%M:%S %z %Y').apply(account_age)

    for col in topic_bool_columns:
        df[col] = False
    
    if typ == "log":
        df["Emotion_prob_log"] = df[["Anger_log", "Disgust_log", "Fear_log", "Joy_log", "Sadness_log", "Surprise_log", "Trust_log", "Anticipation_log"]].max(axis=1)
    elif typ == "non_log":
        df["Emotion_prob"] = df[["Anger", "Disgust", "Fear", "Joy", "Sadness", "Surprise", "Trust", "Anticipation"]].max(axis=1)
    
    if typ == "logit":
        df["retweet_count_gte_one"] = 0
        df["retweet_count_gte_one"][df["retweet_count"] > 0] = 1
        
    topic_id_topic_cat_dict = pd.read_pickle("topic_id_topic_cat_dict")

    for col in topic_bool_columns:
        df["topic_type"] = df["topic_id"].map(topic_id_topic_cat_dict)
        df[col][df["topic_type"] == col] = True

    df[output_cols].to_csv(f"{save_as}.csv", index=False)

### Extract the columns necessary for regression

In [6]:
create_regression_dataset("log_dataset", typ="log") # Extract columns for the log linear regression

  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
100%|███████████████████████████████████████████| 90/90 [04:39<00:00,  3.11s/it]


0 tweets had no retweet count


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value 

In [7]:
create_regression_dataset("non_log_dataset", typ="non_log") # Extract columns for the negative binomial regression

  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
100%|███████████████████████████████████████████| 90/90 [04:49<00:00,  3.22s/it]


0 tweets had no retweet count


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value 

In [8]:
create_regression_dataset("logit_dataset", typ="logit")

  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
  df = pd.read_csv(os.path.join(dataset_folder, file))
100%|███████████████████████████████████████████| 90/90 [04:31<00:00,  3.01s/it]


0 tweets had no retweet count


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["retweet_count_gte_one"][df["retweet_count"] > 0] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col] = True
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][df["topic_type"] == col

### Running regressions

#### Negative Binomial

In [18]:
df = pd.read_csv("non_log_dataset.csv") # loading in the non log dataset
# (6234721, 21)
# 19568181

In [20]:
df.shape

(19568181, 22)

In [20]:
# Formula for the regression
# To add topic probability, use column `topic_probability`
# To add emotion probability, use column `Emotion_prob`

prominent_emotion_formula = '''retweets_per_follower ~ Anger +
Disgust +
Fear +
Joy +
Sadness +
Surprise + 
Trust + 
Anticipation + 
topic_is_not_disinformation +
topic_is_trolling +
topic_is_out_of_scope +
topic_is_debatable +
topic_is_conspiracy_theory +
media_bool + 
user_verified + 
account_age'''

In [22]:
correlation = df[["retweets_per_follower", "media_bool", "user_verified", "account_age"] + topic_bool_columns].corr()

correlation.to_csv("correlation_non_log.csv")

In [28]:
no_of_regression_samples = 300000 # No of rows to run the reg on. Use None to run on the entire dataset.

In [26]:
# Run regression

nbr = smf.glm(prominent_emotion_formula, data=df.iloc[:no_of_regression_samples, :], family=sm.families.NegativeBinomial()).fit()
print(nbr.summary())

                   Generalized Linear Model Regression Results                   
Dep. Variable:     retweets_per_follower   No. Observations:               300000
Model:                               GLM   Df Residuals:                   299984
Model Family:           NegativeBinomial   Df Model:                           15
Link Function:                       Log   Scale:                          1.0000
Method:                             IRLS   Log-Likelihood:                -27507.
Date:                   Wed, 16 Mar 2022   Deviance:                       29617.
Time:                           19:49:12   Pearson chi2:                 2.10e+08
No. Iterations:                      100   Pseudo R-squ. (CS):            0.07067
Covariance Type:               nonrobust                                         
                                          coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------

In [19]:
df[df["retweet_count"] > 0].shape

(6234721, 22)

#### Log Linear Regression

In [2]:
df = pd.read_csv("log_dataset.csv")
# (5626408, 21)

In [13]:
df.shape

(5626408, 21)

In [10]:
df.shape

(6234721, 22)

In [7]:
# To add topic probability, use column `topic_log_probability`
# To add emotion probability, use column `Emotion_prob_log`

prominent_emotion_formula_log = '''log_retweets_per_follower ~ Anger_log +
Disgust_log +
Fear_log +
Joy_log +
Sadness_log +
Surprise_log + 
Trust_log + 
Anticipation_log + 
topic_is_not_disinformation +
topic_is_trolling +
topic_is_out_of_scope +
topic_is_debatable +
topic_is_conspiracy_theory +
media_bool + 
user_verified + 
account_age'''

In [8]:
correlation = df[["log_retweets_per_follower", "media_bool", "user_verified", "account_age"] + topic_bool_columns].corr()

correlation.to_csv("correlation_log.csv", index=False)

In [9]:
no_of_regression_samples = None # No of rows to run the reg on. Use None to run on the entire dataset.

In [11]:
linreg = smf.ols(prominent_emotion_formula_log, data=df.iloc[:no_of_regression_samples, :]).fit()
print(linreg.summary())

                                OLS Regression Results                               
Dep. Variable:     log_retweets_per_follower   R-squared:                       0.426
Model:                                   OLS   Adj. R-squared:                  0.426
Method:                        Least Squares   F-statistic:                 2.894e+05
Date:                       Mon, 04 Apr 2022   Prob (F-statistic):               0.00
Time:                               08:33:24   Log-Likelihood:            -7.7977e+06
No. Observations:                    6234721   AIC:                         1.560e+07
Df Residuals:                        6234704   BIC:                         1.560e+07
Df Model:                                 16                                         
Covariance Type:                   nonrobust                                         
                                          coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

#### Logistic Regression

In [9]:
df = pd.read_csv("logit_dataset.csv")

In [10]:
df.shape

(20014503, 20)

In [42]:
no_of_regression_samples = 5000000 # No of rows to run the reg on. Use None to run on the entire dataset.

In [43]:
# Regression on 5 million tweets
for typ in topic_bool_columns:
    prominent_emotion_formula_log = f'''has_been_retweeted ~  
    Anger_log +
    Disgust_log +
    Fear_log +
    Joy_log +
    Sadness_log +
    Surprise_log + 
    Trust_log + 
    Anticipation_log + 
    {typ} +
    media_bool + 
    user_verified + 
    account_age'''

    log_reg = smf.logit(prominent_emotion_formula_log, data=df.iloc[:no_of_regression_samples, :]).fit()
    print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.587906
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:     has_been_retweeted   No. Observations:              5000000
Model:                          Logit   Df Residuals:                  4999987
Method:                           MLE   Df Model:                           12
Date:                Tue, 29 Mar 2022   Pseudo R-squ.:                 0.08239
Time:                        22:51:52   Log-Likelihood:            -2.9395e+06
converged:                       True   LL-Null:                   -3.2035e+06
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
Intercept                              -1.1871      0.005 

Optimization terminated successfully.
         Current function value: 0.588064
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:     has_been_retweeted   No. Observations:              5000000
Model:                          Logit   Df Residuals:                  4999987
Method:                           MLE   Df Model:                           12
Date:                Tue, 29 Mar 2022   Pseudo R-squ.:                 0.08215
Time:                        22:52:39   Log-Likelihood:            -2.9403e+06
converged:                       True   LL-Null:                   -3.2035e+06
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -1.1521      0.005   -

In [44]:
# Logistic regression on 16m tweets (excludes tweets with 0 rts or suspended tweets)
for typ in topic_bool_columns:
    prominent_emotion_formula_log = f'''has_been_retweeted ~  
    Anger_log +
    Disgust_log +
    Fear_log +
    Joy_log +
    Sadness_log +
    Surprise_log + 
    Trust_log + 
    Anticipation_log + 
    {typ} +
    media_bool + 
    user_verified + 
    account_age'''

    log_reg = smf.logit(prominent_emotion_formula_log, data=df).fit()
    print(log_reg.summary())

Optimization terminated successfully.
         Current function value: 0.588119
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:     has_been_retweeted   No. Observations:             16609000
Model:                          Logit   Df Residuals:                 16608987
Method:                           MLE   Df Model:                           12
Date:                Tue, 29 Mar 2022   Pseudo R-squ.:                 0.08247
Time:                        22:53:26   Log-Likelihood:            -9.7681e+06
converged:                       True   LL-Null:                   -1.0646e+07
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                          coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------
Intercept                              -1.1770      0.003 

Optimization terminated successfully.
         Current function value: 0.588261
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:     has_been_retweeted   No. Observations:             16609000
Model:                          Logit   Df Residuals:                 16608987
Method:                           MLE   Df Model:                           12
Date:                Tue, 29 Mar 2022   Pseudo R-squ.:                 0.08225
Time:                        22:55:57   Log-Likelihood:            -9.7704e+06
converged:                       True   LL-Null:                   -1.0646e+07
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                         coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -1.1437      0.003   -

In [2]:
import pandas as pd
from tqdm import tqdm
import os

In [5]:
counts = []
counts_ = []
for file in tqdm(os.listdir("full_dataset")):
    df = pd.read_csv(os.path.join("full_dataset", file))
    df.dropna(subset=["topic_id"], inplace=True)
    counts_.append(df["retweet_count"].isnull().sum())
    counts.append(df["retweet_count_before_susp"].notnull().sum())

  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
  df = pd.read_csv(os.path.join("full_dataset", file))
100%|███████████████████████████████████████████| 90/90 [04:35<00:00,  3.06s/it]


In [6]:
# 4255950
sum(counts)

3405503

In [7]:
sum(counts_)

3405503