In [1]:
import pandas as pd
import csv
import krippendorff
import numpy as np
import re
from collections import Counter
import statsmodels.api as sm
import statsmodels.formula.api as smf

Intercoder Reliability (ICR)

Retrieving data from 'ICR_Data.csv' file with coded responses of Coders 1 and 2 and creating a function to calculate ICR scores (Krippendorff's alpha)

In [2]:
intercoder_data = pd.read_csv('/Users/salmankhawar/Desktop/Thesis/Content Analysis/ICR_Data.csv')

In [3]:
intercoder_data_cleaned = intercoder_data[2:][['[coder]','[tweet_ID]','[news_outlet]','[tweet_topic] ','[hyperbolic_feature]','[slang_feature]','[listicle_feature]','[emoji_feature]','[hashtag_feature]','[cap_feature]','[punct_feature]','[forw_ref_feature]','[question_feature]','[media_feature]']]

In [4]:
# Coerce every retained column to a numeric dtype; entries that cannot be
# parsed become NaN (errors='coerce').
cols = intercoder_data_cleaned.columns
intercoder_data_cleaned = intercoder_data_cleaned.apply(pd.to_numeric, errors='coerce')

In [5]:
def ICR_score(column, level_of_measurement='interval'):
    """Return Krippendorff's alpha between coder 1 and coder 2 for one variable.

    Parameters
    ----------
    column : str
        Name of the coded variable in ``intercoder_data_cleaned``.
    level_of_measurement : str, optional
        Forwarded to ``krippendorff.alpha``. The default ``'interval'`` is the
        library's own default, i.e. what the call used before this parameter
        existed. NOTE(review): unordered category codes are presumably better
        scored with ``'nominal'`` -- confirm which variables this applies to.

    Returns
    -------
    float
        The alpha coefficient computed by ``krippendorff.alpha``.

    Raises
    ------
    ValueError
        If the two coders did not code the same number of units.
    """
    coder_ids = intercoder_data_cleaned["[coder]"]
    ratings_coder1 = np.asarray(intercoder_data_cleaned.loc[coder_ids == 1, column])
    ratings_coder2 = np.asarray(intercoder_data_cleaned.loc[coder_ids == 2, column])

    # The two vectors are paired by position (the i-th row of coder 1 with the
    # i-th row of coder 2), so they must at least have the same length.
    if len(ratings_coder1) != len(ratings_coder2):
        raise ValueError(
            "Coders 1 and 2 coded a different number of units for %s: %d vs %d"
            % (column, len(ratings_coder1), len(ratings_coder2))
        )

    return krippendorff.alpha(
        reliability_data=[ratings_coder1, ratings_coder2],
        level_of_measurement=level_of_measurement,
    )

Calculating ICR scores for individual variables

In [6]:
ICR_score('[news_outlet]')

0.9513953882218716

In [7]:
ICR_score('[tweet_topic] ')

0.7240888102957068

In [8]:
ICR_score('[hyperbolic_feature]')

0.6927775781530723

In [9]:
ICR_score('[slang_feature]')

0.7437499999999999

In [10]:
ICR_score('[listicle_feature]')

1.0

In [11]:
ICR_score('[emoji_feature]')

1.0

In [12]:
ICR_score('[hashtag_feature]')

0.9487870619946092

In [13]:
ICR_score('[cap_feature]')

1.0

In [14]:
ICR_score('[punct_feature]')

0.9058089924515917

In [15]:
ICR_score('[forw_ref_feature]')

0.7612596553773024

In [16]:
ICR_score('[question_feature]')

0.846441947565543

In [17]:
ICR_score('[media_feature]')

0.9057670920934173

Analyzing Manual Content Analysis Results.

Reading CSV file with manual content analysis results, retrieving engagement metrics from the sample file and joining them with the results dataframe.

In [18]:
CA_results = pd.read_csv('/Users/salmankhawar/Desktop/Thesis/Content Analysis/Results_CA.csv')

In [19]:
CA_results.duplicated().sum()

0

In [20]:
# Drop the leftover index columns. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell safe to re-run once the columns are gone.
CA_results = CA_results.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [21]:
manualCA_sample = pd.read_csv('/Users/salmankhawar/Desktop/Thesis/Data_Collection_Twitter/Data_csv/Data_Collected_Tweepy/ManualCA_sample.csv')

In [22]:
engagement_metrics = manualCA_sample[['favorite_count','retweet_count','is_retweet','twitter_account']]

In [23]:
# Attach the engagement metrics to the coding results. Engagement columns left
# over from an earlier run of this cell are dropped first, so re-running does
# not fail on overlapping column names.
# NOTE(review): join() aligns rows on the DataFrame index, not on tweet ID --
# confirm that both CSV files list the tweets in the same order.
CA_results = CA_results.drop(columns=engagement_metrics.columns, errors='ignore').join(engagement_metrics)

In [24]:
CA_results.head()

Unnamed: 0,[tweet_ID],StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,...,[hashtag_feature],[cap_feature],[punct_feature],[forw_ref_feature],[question_feature],[media_feature],favorite_count,retweet_count,is_retweet,twitter_account
0,1351933244179152904,2021-12-10 11:39:07,2021-12-10 11:42:07,0,84.84.82.135,100,179,1,2021-12-10 11:42:07,R_26o3GTpqSKoGoXO,...,0,0,0,0,0,0,12,3,No,BuzzFeed Politics
1,1309828834691350529,2021-12-10 11:42:39,2021-12-10 11:44:07,0,84.84.82.135,100,87,1,2021-12-10 11:44:07,R_1IDXJeinkI7KvcK,...,0,0,0,0,0,0,0,1,No,L.A. Times Politics
2,1006560210029305856,2021-12-10 11:46:35,2021-12-10 11:48:15,0,84.84.82.135,100,99,1,2021-12-10 11:48:15,R_DAE9jLJPb9q2huV,...,0,0,0,0,1,1,0,1,No,HuffPost Sports
3,1422561643213426692,2021-12-10 11:48:55,2021-12-10 11:52:04,0,84.84.82.135,100,188,1,2021-12-10 11:52:04,R_1fjvJlj04zzulOF,...,0,0,0,0,0,0,1,3,No,USA TODAY Politics
4,1465819596162678787,2021-12-10 11:52:10,2021-12-10 11:57:00,0,84.84.82.135,100,289,1,2021-12-10 11:57:00,R_1OZx8VPHhLFgtU9,...,0,0,0,0,0,1,0,32,Yes,L.A. Times Sports


In [25]:
CA_results.columns

Index(['[tweet_ID]', 'StartDate', 'EndDate', 'Status', 'IPAddress', 'Progress',
       'Duration (in seconds)', 'Finished', 'RecordedDate', 'ResponseId',
       'RecipientLastName', 'RecipientFirstName', 'RecipientEmail',
       'ExternalReference', 'LocationLatitude', 'LocationLongitude',
       'DistributionChannel', 'UserLanguage', '[news_outlet]', '[tweet_topic]',
       '[tweet_topic] _11_TEXT', '[hyperbolic_feature]', '[slang_feature]',
       '[listicle_feature]', '[emoji_feature]', '[hashtag_feature]',
       '[cap_feature]', '[punct_feature]', '[forw_ref_feature]',
       '[question_feature]', '[media_feature]', 'favorite_count',
       'retweet_count', 'is_retweet', 'twitter_account'],
      dtype='object')

Calculating detection rates for each sensationalist feature (1=feature present, 0=feature not present)

In [26]:
CA_results['[hyperbolic_feature]'].value_counts(normalize=True)*100

0    89.930556
1    10.069444
Name: [hyperbolic_feature], dtype: float64

In [27]:
CA_results['[slang_feature]'].value_counts(normalize=True)*100

0    94.166667
1     5.833333
Name: [slang_feature], dtype: float64

In [28]:
CA_results['[listicle_feature]'].value_counts(normalize=True)*100

0    99.097222
1     0.902778
Name: [listicle_feature], dtype: float64

In [29]:
CA_results['[emoji_feature]'].value_counts(normalize=True)*100

0    95.763889
1     4.236111
Name: [emoji_feature], dtype: float64

In [30]:
CA_results['[hashtag_feature]'].value_counts(normalize=True)*100

0    91.527778
1     8.472222
Name: [hashtag_feature], dtype: float64

In [31]:
CA_results['[cap_feature]'].value_counts(normalize=True)*100

0    96.944444
1     3.055556
Name: [cap_feature], dtype: float64

In [32]:
CA_results['[punct_feature]'].value_counts(normalize=True)*100

0    97.430556
1     2.569444
Name: [punct_feature], dtype: float64

In [33]:
CA_results['[forw_ref_feature]'].value_counts(normalize=True)*100

0    95.833333
1     4.166667
Name: [forw_ref_feature], dtype: float64

In [34]:
CA_results['[question_feature]'].value_counts(normalize=True)*100

0    94.444444
1     5.555556
Name: [question_feature], dtype: float64

Media Feature: 0=no media, 1=photo, 2=video, 3=animated gif

In [35]:
CA_results['[media_feature]'].value_counts(normalize=True)*100

0    79.791667
1    13.680556
2     5.208333
3     1.319444
Name: [media_feature], dtype: float64

Creating dummy variable for media (1=media present, 0=media not present)

In [36]:
def media_feature(x):
    """Collapse the media code into a dummy: 0 stays 0, anything else becomes 1."""
    return 0 if x == 0 else 1

In [37]:
CA_results['[media_dummy]'] = CA_results['[media_feature]'].apply(media_feature) 

In [38]:
CA_results['[media_dummy]'].value_counts(normalize=True)*100

0    79.791667
1    20.208333
Name: [media_dummy], dtype: float64

Creating dummy variable for presence of sensationalism in tweets (1=sensationalist feature present, 0=sensationalist feature not present)

In [39]:
CA_features_grid = CA_results[['[hyperbolic_feature]', '[slang_feature]', '[listicle_feature]', '[emoji_feature]', '[question_feature]', '[hashtag_feature]', '[forw_ref_feature]', '[punct_feature]', '[cap_feature]', '[media_dummy]']].apply(pd.to_numeric)

In [40]:
CA_features_grid = np.array(CA_features_grid)

In [41]:
# Convert the feature grid to nested Python lists. Going through np.asarray
# keeps the cell re-runnable: a plain list has no .tolist() method.
CA_features_grid = np.asarray(CA_features_grid).tolist()

In [42]:
CAcombined_features_dummy = [int(any(row)) for row in CA_features_grid]

In [43]:
CAcombined_features_dummy[:20]

[0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0]

In [44]:
CA_results['combined_features_dummy'] = CAcombined_features_dummy

Sensationalist feature present in 44.03% of tweets in sample

In [45]:
CA_results['combined_features_dummy'].value_counts(normalize=True)*100

0    55.972222
1    44.027778
Name: combined_features_dummy, dtype: float64

Creating count variable with number of features in each tweet

In [46]:
count_variable = np.sum(CA_features_grid ,axis=1).tolist()

In [47]:
CA_results['count_variable'] = count_variable

In [48]:
CA_results['count_variable'].value_counts(normalize=True)*100

0    55.972222
1    28.611111
2    10.902778
3     3.611111
4     0.694444
5     0.208333
Name: count_variable, dtype: float64

Creating variable for outlet type (1=tweets published by online-native outlets, 0=tweets published by legacy outlets)

In [49]:
def news_outlet_classifications(number):
    """Classify a news-outlet code as online-native (1) or legacy (0).

    Codes 1 through 6 map to 1; every other value maps to 0.

    The value is compared numerically instead of regex-matching its string
    form. The regex approach let a fractional value such as 10.5 count as a
    match, because the digit next to the decimal point satisfied
    ``\\b[1-6]\\b``.

    Parameters
    ----------
    number : int, float or str
        Outlet code; numeric strings such as ``"3"`` are accepted.

    Returns
    -------
    int
        1 if the code equals one of 1..6, otherwise 0. Missing or
        non-numeric input also returns 0.
    """
    try:
        code = float(number)
    except (TypeError, ValueError):
        # None, free text, etc. cannot be an outlet code.
        return 0
    # NaN compares unequal to every code, so it falls through to 0.
    return 1 if code in (1, 2, 3, 4, 5, 6) else 0

In [50]:
CA_results['outlet_type'] = CA_results['[news_outlet]'].apply(news_outlet_classifications) 

In [51]:
CA_results['outlet_type'].value_counts(normalize=True)*100

0    50.138889
1    49.861111
Name: outlet_type, dtype: float64

Creating variable for channel type (1=tweets published in soft news sub-channels, i.e. sports, entertainment/life; 0=tweets published in hard news sub-channels, i.e. politics)

In [52]:
def channel_classification(number):
    """Classify a news-outlet code by channel: hard news (0) or soft news (1).

    Codes 1, 4, 7 and 10 map to 0; every other value maps to 1.

    The value is compared numerically instead of regex-matching its string
    form. The pattern ``\\b[147][0]?\\b`` also accepted 40 and 70, as well as
    digits on either side of a decimal point.

    Parameters
    ----------
    number : int, float or str
        Outlet code; numeric strings such as ``"4"`` are accepted.

    Returns
    -------
    int
        0 if the code equals 1, 4, 7 or 10, otherwise 1. Missing or
        non-numeric input also returns 1, as before.
    """
    try:
        code = float(number)
    except (TypeError, ValueError):
        return 1
    # NaN compares unequal to every code, so it falls through to 1.
    return 0 if code in (1, 4, 7, 10) else 1

In [53]:
CA_results['channel_type'] = CA_results['[news_outlet]'].apply(channel_classification) 

In [54]:
CA_results['channel_type'].value_counts(normalize=True)*100

1    50.138889
0    49.861111
Name: channel_type, dtype: float64

Creating dummy variable for topic type 

(1=tweets classified as 'entertainment/celebrity news','lifestyle and society ','sports','crime/law and order','human-interest story', 

0=tweets classified as 'politics/government affairs/military','international affairs','economy/business','health/education' and 'science/technology')

In [55]:
def topic_type(x):
    """Return 1 when the topic code is one of 6, 7, 8, 9 or 10, otherwise 0."""
    soft_topic_codes = (6, 7, 8, 9, 10)
    return 1 if x in soft_topic_codes else 0

In [56]:
CA_results['topic_type'] = CA_results['[tweet_topic]'].apply(topic_type) 

In [57]:
CA_results['topic_type'].value_counts(normalize=True)*100

1    54.791667
0    45.208333
Name: topic_type, dtype: float64

Creating dummy variables for individual topic categories

In [58]:
# One indicator column per topic code. dtype=int forces 0/1 integers: newer
# pandas versions return boolean dummies by default, which become object dtype
# once combined with the float constant in a regression design matrix.
topic_dummies = pd.get_dummies(CA_results['[tweet_topic]'], dtype=int)

In [59]:
# Copy each topic indicator into CA_results under a readable column name
# (topic code -> label).
topic_labels = {
    1: 'politics/government affairs/military',
    2: 'international affairs',
    3: 'economy/business',
    4: 'health/education',
    5: 'science/technology',
    6: 'entertainment/celebrity news',
    7: 'lifestyle and society',
    8: 'sports',
    9: 'crime/law and order',
    10: 'human-interest story',
    11: 'other',
}
for topic_code, topic_label in topic_labels.items():
    CA_results[topic_label] = topic_dummies[topic_code]

Creating variable for length of tweets, after removing urls present in tweet texts

In [60]:
def remove_urls(df):
    """Strip URLs from every string in ``df['text']``, modifying ``df`` in place.

    The URL pattern is unchanged; it is now compiled once rather than
    re-parsed for every row. Entries that are not strings (for example
    missing values) are left untouched instead of raising ``TypeError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'text'`` column; the column is overwritten.

    Returns
    -------
    None
    """
    url_pattern = re.compile(
        r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', flags=re.MULTILINE
    )
    df['text'] = [
        url_pattern.sub('', text) if isinstance(text, str) else text
        for text in df['text']
    ]

In [61]:
remove_urls(manualCA_sample)

In [62]:
def length_tweet(x):
    """Return the length of ``x`` as reported by ``len``."""
    n_chars = len(x)
    return n_chars

In [63]:
CA_results['len_tweet'] = manualCA_sample['text'].apply(length_tweet)

In [64]:
CA_results['len_tweet'].head(10)

0    129
1     86
2     66
3    140
4    127
5     65
6     90
7    120
8     60
9    191
Name: len_tweet, dtype: int64

Creating variables for engagement metrics per hundred thousand account followers

In [65]:
# Follower count of each account, expressed in units of 100,000 followers.
BuzzFeedPolitics_followers = 140881 / 100000
BuzzFeedSports_followers = 63060 / 100000
BuzzFeedEnt_followers = 81725 / 100000
HuffPostPolitics_followers = 1439630 / 100000
HuffPostSports_followers = 62587 / 100000
HuffPostLife_followers = 125749 / 100000
USATODAYPolitics_followers = 215937 / 100000
USATODAYSports_followers = 246780 / 100000
USATODAYLife_followers = 85285 / 100000
LATimesPolitics_followers = 12314 / 100000
LATimesSports_followers = 50398 / 100000
LATEnt_followers = 168665 / 100000

In [66]:
# Start from NaN rather than an empty string so the column stays numeric: a row
# whose account matches none of the known names then yields NaN in the
# per-follower ratios instead of a TypeError from dividing a number by ''.
CA_results['followers_per100000'] = np.nan

In [67]:
# Fill followers_per100000 for every row according to its twitter_account.
followers_by_account = {
    'BuzzFeed Politics': BuzzFeedPolitics_followers,
    'BuzzFeed Sports': BuzzFeedSports_followers,
    'BuzzFeed Arts & Entertainment': BuzzFeedEnt_followers,
    'HuffPost Politics': HuffPostPolitics_followers,
    'HuffPost Sports': HuffPostSports_followers,
    'HuffPost Life': HuffPostLife_followers,
    'USA TODAY Politics': USATODAYPolitics_followers,
    'USA TODAY Sports': USATODAYSports_followers,
    'USA TODAY Life': USATODAYLife_followers,
    'L.A. Times Politics': LATimesPolitics_followers,
    'L.A. Times Sports': LATimesSports_followers,
    'LAT Entertainment': LATEnt_followers,
}
for account_name, account_followers in followers_by_account.items():
    is_account = CA_results['twitter_account'] == account_name
    CA_results.loc[is_account, 'followers_per100000'] = account_followers

In [68]:
CA_results['favorites_per100000'] = CA_results['favorite_count']/CA_results['followers_per100000']

In [69]:
CA_results['retweets_per100000'] = CA_results['retweet_count']/CA_results['followers_per100000']

In [70]:
# Element-wise favorites / followers ratio built with map + lambda. It assigns
# the same column that the preceding cell computes with Series division.
CA_results['favorites_per100000'] = list(map(lambda x,y: x/y, CA_results['favorite_count'],CA_results['followers_per100000']))

In [71]:
# Element-wise retweets / followers ratio built with map + lambda. It assigns
# the same column that an earlier cell computes with Series division.
CA_results['retweets_per100000'] = list(map(lambda x,y: x/y, CA_results['retweet_count'],CA_results['followers_per100000']))

In [72]:
CA_results['favorites_per100000'] = pd.to_numeric(CA_results['favorites_per100000'])
CA_results['retweets_per100000'] = pd.to_numeric(CA_results['retweets_per100000'])

Testing H1

Running Logistic Regression using dummy variable for presence of sensationalist feature/s in tweets as DV and outlet type(main variable of interest), channel type and tweet length as IVs (Table C1)

In [73]:
featuresH1 = ['outlet_type', 'len_tweet', 'channel_type']

In [75]:
logit_model = sm.Logit(CA_results['combined_features_dummy'], sm.add_constant(CA_results[featuresH1]))

  return ptp(axis=axis, out=out, **kwargs)


In [76]:
reg_model_H1a = logit_model.fit()
print(reg_model_H1a.summary())

Optimization terminated successfully.
         Current function value: 0.588847
         Iterations 5
                              Logit Regression Results                             
Dep. Variable:     combined_features_dummy   No. Observations:                 1440
Model:                               Logit   Df Residuals:                     1436
Method:                                MLE   Df Model:                            3
Date:                     Wed, 02 Feb 2022   Pseudo R-squ.:                  0.1416
Time:                             15:12:15   Log-Likelihood:                -847.94
converged:                            True   LL-Null:                       -987.84
Covariance Type:                 nonrobust   LLR p-value:                 2.349e-60
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -1.8320      0.187     -9.815      0.000      -2

Running OLS Regression using count variable for number of sensationalist features in tweets as DV and outlet type(main variable of interest), channel type and tweet length as IVs (Table C1)

In [78]:
OLS_model =  sm.OLS(CA_results['count_variable'], sm.add_constant(CA_results[featuresH1]))
reg_model_H1b = OLS_model.fit()
print(reg_model_H1b.summary())

                            OLS Regression Results                            
Dep. Variable:         count_variable   R-squared:                       0.219
Model:                            OLS   Adj. R-squared:                  0.218
Method:                 Least Squares   F-statistic:                     134.5
Date:                Wed, 02 Feb 2022   Prob (F-statistic):           8.45e-77
Time:                        15:12:36   Log-Likelihood:                -1692.5
No. Observations:                1440   AIC:                             3393.
Df Residuals:                    1436   BIC:                             3414.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -0.0545      0.062     -0.879   

Testing H2

Running Logistic Regression using dummy variable for presence of sensationalist feature/s in tweets as DV and topic type (main variable of interest), outlet type and tweet length as IVs (Table C2)

In [79]:
featuresH2 = ['topic_type','outlet_type','len_tweet']

In [80]:
logit_model2a = sm.Logit(CA_results['combined_features_dummy'], sm.add_constant(CA_results[featuresH2]))
reg_modelH2a = logit_model2a.fit()
print(reg_modelH2a.summary())

Optimization terminated successfully.
         Current function value: 0.601310
         Iterations 5
                              Logit Regression Results                             
Dep. Variable:     combined_features_dummy   No. Observations:                 1440
Model:                               Logit   Df Residuals:                     1436
Method:                                MLE   Df Model:                            3
Date:                     Wed, 02 Feb 2022   Pseudo R-squ.:                  0.1235
Time:                             15:12:50   Log-Likelihood:                -865.89
converged:                            True   LL-Null:                       -987.84
Covariance Type:                 nonrobust   LLR p-value:                 1.367e-52
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const          -1.7585      0.184     -9.535      0.000      -2.12

Running OLS Regression using count variable for number of sensationalist features in tweets as DV and topic type (main variable of interest), outlet type and tweet length as IVs (Table C2)

In [81]:
OLS_model2b =  sm.OLS(CA_results['count_variable'], sm.add_constant(CA_results[featuresH2]))
reg_modelH2b = OLS_model2b.fit()
print(reg_modelH2b.summary())

                            OLS Regression Results                            
Dep. Variable:         count_variable   R-squared:                       0.187
Model:                            OLS   Adj. R-squared:                  0.186
Method:                 Least Squares   F-statistic:                     110.4
Date:                Wed, 02 Feb 2022   Prob (F-statistic):           2.40e-64
Time:                        15:12:57   Log-Likelihood:                -1721.3
No. Observations:                1440   AIC:                             3451.
Df Residuals:                    1436   BIC:                             3472.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const          -0.0300      0.063     -0.473      

Running Logistic Regression using dummy variable for presence of sensationalist feature/s in tweets as DV and individual soft topics and 'other' category as IVs. The reference group is tweets pertaining to hard topic categories.(Table C3)

In [82]:
soft_topics = ['entertainment/celebrity news','lifestyle and society','sports','human-interest story', 'crime/law and order','other']

In [83]:
logit_model2c = sm.Logit(CA_results['combined_features_dummy'], sm.add_constant(CA_results[soft_topics]))
reg_modelH2c = logit_model2c.fit()
print(reg_modelH2c.summary())

Optimization terminated successfully.
         Current function value: 0.592995
         Iterations 19
                              Logit Regression Results                             
Dep. Variable:     combined_features_dummy   No. Observations:                 1440
Model:                               Logit   Df Residuals:                     1433
Method:                                MLE   Df Model:                            6
Date:                     Wed, 02 Feb 2022   Pseudo R-squ.:                  0.1356
Time:                             15:13:10   Log-Likelihood:                -853.91
converged:                            True   LL-Null:                       -987.84
Covariance Type:                 nonrobust   LLR p-value:                 6.268e-55
                                   coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const                          

  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)


In [84]:
# OLS (not logit) regression of the feature count on the individual soft-topic
# dummies and the 'other' category.
OLS_model2d = sm.OLS(CA_results['count_variable'], sm.add_constant(CA_results[soft_topics]))
logit_model2d = OLS_model2d  # backward-compatible alias for the original, misleading name
reg_modelH2d = OLS_model2d.fit()
print(reg_modelH2d.summary())

                            OLS Regression Results                            
Dep. Variable:         count_variable   R-squared:                       0.212
Model:                            OLS   Adj. R-squared:                  0.209
Method:                 Least Squares   F-statistic:                     64.21
Date:                Wed, 02 Feb 2022   Prob (F-statistic):           9.41e-71
Time:                        15:13:20   Log-Likelihood:                -1699.4
No. Observations:                1440   AIC:                             3413.
Df Residuals:                    1433   BIC:                             3450.
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                                   coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------------------
const           

  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)
  return ptp(axis=axis, out=out, **kwargs)


Testing H3

Running OLS Regression using favorite count (per hundred thousand followers) as DV and dummy variable for presence of sensationalist feature/s (main variable of interest) outlet type, channel type and tweet length as IVs (Table C4)

In [85]:
featuresh3a = ['combined_features_dummy','channel_type','outlet_type','len_tweet']

In [86]:
OLS_model3a =  sm.OLS(CA_results['favorites_per100000'], sm.add_constant(CA_results[featuresh3a]))
reg_modelh3a = OLS_model3a.fit()
print(reg_modelh3a.summary())

                             OLS Regression Results                            
Dep. Variable:     favorites_per100000   R-squared:                       0.008
Model:                             OLS   Adj. R-squared:                  0.005
Method:                  Least Squares   F-statistic:                     2.989
Date:                 Wed, 02 Feb 2022   Prob (F-statistic):             0.0180
Time:                         15:13:39   Log-Likelihood:                -8285.0
No. Observations:                 1440   AIC:                         1.658e+04
Df Residuals:                     1435   BIC:                         1.661e+04
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const           

Running OLS Regression using retweet count (per hundred thousand followers) as DV and dummy variable for presence of sensationalist feature/s (main variable of interest) outlet type, channel type and tweet length as IVs (Table C4)

In [87]:
OLS_model3b =  sm.OLS(CA_results['retweets_per100000'], sm.add_constant(CA_results[featuresh3a]))
reg_modelh3b = OLS_model3b.fit()
print(reg_modelh3b.summary())

                            OLS Regression Results                            
Dep. Variable:     retweets_per100000   R-squared:                       0.019
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     7.084
Date:                Wed, 02 Feb 2022   Prob (F-statistic):           1.20e-05
Time:                        15:13:51   Log-Likelihood:                -9573.1
No. Observations:                1440   AIC:                         1.916e+04
Df Residuals:                    1435   BIC:                         1.918e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

Running OLS Regression using favorite count (per hundred thousand followers) as DV and count variable for number of sensationalist feature/s (main variable of interest), outlet type, channel type and tweet length as IVs (Table C5)

In [88]:
featuresh3b = ['count_variable','channel_type','outlet_type','len_tweet']

In [89]:
OLS_model3c =  sm.OLS(CA_results['favorites_per100000'], sm.add_constant(CA_results[featuresh3b]))
reg_modelh3c = OLS_model3c.fit()
print(reg_modelh3c.summary())

                             OLS Regression Results                            
Dep. Variable:     favorites_per100000   R-squared:                       0.012
Model:                             OLS   Adj. R-squared:                  0.009
Method:                  Least Squares   F-statistic:                     4.276
Date:                 Wed, 02 Feb 2022   Prob (F-statistic):            0.00192
Time:                         15:14:05   Log-Likelihood:                -8282.5
No. Observations:                 1440   AIC:                         1.657e+04
Df Residuals:                     1435   BIC:                         1.660e+04
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              5.5934      6.0

Running OLS Regression using retweet count (per hundred thousand followers) as DV and count variable for number of sensationalist feature/s (main variable of interest), outlet type, channel type and tweet length as IVs (Table C5)

In [90]:
OLS_model3d =  sm.OLS(CA_results['retweets_per100000'], sm.add_constant(CA_results[featuresh3b]))
reg_modelh3d = OLS_model3d.fit()
print(reg_modelh3d.summary())

                            OLS Regression Results                            
Dep. Variable:     retweets_per100000   R-squared:                       0.019
Model:                            OLS   Adj. R-squared:                  0.017
Method:                 Least Squares   F-statistic:                     7.107
Date:                Wed, 02 Feb 2022   Prob (F-statistic):           1.15e-05
Time:                        15:14:13   Log-Likelihood:                -9573.1
No. Observations:                1440   AIC:                         1.916e+04
Df Residuals:                    1435   BIC:                         1.918e+04
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             14.2724     14.760      0.

Running OLS Regression using favorite count (per hundred thousand followers) as DV and individual sensationalist features as IVs (Table C6)

In [91]:
ca_features_indiv = ['[hyperbolic_feature]', '[slang_feature]', '[listicle_feature]', '[emoji_feature]', '[question_feature]', '[hashtag_feature]', '[forw_ref_feature]', '[punct_feature]', '[cap_feature]', '[media_dummy]']

In [92]:
# Table C6 (favorites): OLS of favorites per 100,000 followers on the individual
# feature dummies. Distinct names are used so this fit does not overwrite
# reg_modelh3d, which holds the Table C5 retweet model.
OLS_model3e = sm.OLS(CA_results['favorites_per100000'], sm.add_constant(CA_results[ca_features_indiv]))
reg_modelh3e = OLS_model3e.fit()
print(reg_modelh3e.summary())

                             OLS Regression Results                            
Dep. Variable:     favorites_per100000   R-squared:                       0.023
Model:                             OLS   Adj. R-squared:                  0.016
Method:                  Least Squares   F-statistic:                     3.374
Date:                 Wed, 02 Feb 2022   Prob (F-statistic):           0.000230
Time:                         15:14:24   Log-Likelihood:                -8274.2
No. Observations:                 1440   AIC:                         1.657e+04
Df Residuals:                     1429   BIC:                         1.663e+04
Df Model:                           10                                         
Covariance Type:             nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                 

Running OLS Regression using retweet count (per hundred thousand followers) as DV and individual sensationalist features as IVs (Table C6)

In [93]:
# Table C6 (retweets): OLS of retweets per 100,000 followers on the individual
# feature dummies.
# NOTE(review): reg_modelh3d is also assigned by an earlier H3 cell; once this
# cell has run, the name refers to this retweet model only.
OLS_modelh3d =  sm.OLS(CA_results['retweets_per100000'], sm.add_constant(CA_results[ca_features_indiv]))
reg_modelh3d = OLS_modelh3d.fit()
print(reg_modelh3d.summary())

                            OLS Regression Results                            
Dep. Variable:     retweets_per100000   R-squared:                       0.012
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     1.739
Date:                Wed, 02 Feb 2022   Prob (F-statistic):             0.0674
Time:                        15:14:30   Log-Likelihood:                -9578.5
No. Observations:                1440   AIC:                         1.918e+04
Df Residuals:                    1429   BIC:                         1.924e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   14.9622 