In [1]:
import pandas as pd
import re
import csv
import numpy as np
from collections import Counter
import nltk
from nltk.corpus import stopwords
import emoji
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.formula.api import poisson
import matplotlib.pyplot as plt
import math
import scipy.stats as stats
import krippendorff
import array as arr
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

In [2]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

Reading file 'AutomatedCA_sample.csv' containing sample for automated content analysis

In [3]:
AutomatedCA_sample = pd.read_csv('/Users/salmankhawar/Desktop/UvA Courses/Thesis/Final_data_files/samples_data_files/AutomatedCA_sample.csv')

Detecting Sensationalist Features


1) Hyperbolic feature


getting list of hyperbolic words/phrases from 'Hyperbolic Words.txt' file

In [4]:
hyperbolic_words = open("/Users/salmankhawar/Desktop/UvA Courses/Thesis/Content Analysis/Hyperbolic Words.txt", "r")
hyperbolic_words_list = hyperbolic_words.readlines()

hyperbolic_list=[]
for words in hyperbolic_words_list:
    hyperbolic_list.append(words.strip())
    
print(hyperbolic_list)  

['A Single', 'Absolutely', 'Amazing', 'Awesome', 'Best', 'Breathtaking', 'But what happened next', 'Can change your life', "Can't Even Handle", "Can't Handle", 'Cannot Even Handle', "Doesn't want you to see", 'Epic', 'Everything You Need To Know', 'Gasp-Worthy', 'Go Viral', 'Greatest', 'Incredible', 'Infuriate', 'Literally', 'Mind Blowing', 'Mind BLOWN', 'Mind Blown', 'Need To Visit Before You Die', 'Nothing Could Prepare Me For', 'Of All Time', 'OMG', 'One Weird Trick', 'Perfection', 'Priceless', 'Prove', 'Right Now', 'Scientific Reasons', 'Shocked', 'Shocking', 'Simple Lessons', "Stop What You're Doing", 'TERRIFYING', 'Terrifying', 'That Will Make You Rethink', "The World's Best", 'This Is What Happens', 'Totally blew my mind', 'Unbelievable', 'Unimaginable', 'WHAT?', 'Whoa', 'WHOA', 'Whoah', 'Will Blow Your Mind', 'Will Change Your Life Forever', 'Won the Internet', 'Wonderful', 'Worst', 'Wow', 'WOW', "You Didn't Know Exist", "You Didn't Know Existed", "You Won't Believe", 'Have To 

Detetcting presence of the hyperbolic words/phrases in the sample

In [5]:
def hyperbolic_detection1(tweet, query):
    tweet = str(tweet).lower()
    newquery = []
    for word in query:
        newquery.append(str(word).lower())
    tokens = re.findall(r"[\w']+|[.,!?;$@#]", tweet)
    
    for word in newquery:
        if word in tokens:
            return 1
    return 0

In [6]:
AutomatedCA_sample['hyperbolic_feature_1'] = AutomatedCA_sample['text'].apply(hyperbolic_detection1,
                                                      args=(hyperbolic_list,))

Detecting tweets with superlatives - POS tags: RBS (superlative adverbs) and JJS (superlative adjectives)

In [7]:
def hyperbole_detections2(text):
    newquery = []
    text = str(text).lower()
    wordslist = nltk.word_tokenize(text)
    tagged = nltk.pos_tag(wordslist)

    for tupl in tagged:
        if tupl[1] == "RBS" or tupl[1] == "JJS":
                        newquery.append(tupl[0])
    
    if len(newquery) == 0:
            return 0
    return 1

In [8]:
AutomatedCA_sample['hyperbolic_feature_2'] = AutomatedCA_sample['text'].apply(hyperbole_detections2)

Consolidating results for hyperbole feature

In [9]:
hyperbole_grid = AutomatedCA_sample[['hyperbolic_feature_1', 'hyperbolic_feature_2']]

In [10]:
hyperbole_grid = np.array(hyperbole_grid)
hyperbole_grid = hyperbole_grid.tolist()
any_row_1 = [int(any(row)) for row in hyperbole_grid]

In [11]:
AutomatedCA_sample['hyperbolic_feature'] = any_row_1

Hyperbolic words/phrases were present in 7.05% of the sample

In [12]:
AutomatedCA_sample['hyperbolic_feature'].value_counts(normalize=True)*100

0    92.945312
1    7.054687 
Name: hyperbolic_feature, dtype: float64

In [13]:
AutomatedCA_sample[AutomatedCA_sample['hyperbolic_feature'] == 1]['text'].head(20)

0      Justice Amy Coney Barrett's decision marks the first time the nation's highest court has weighed in on the legality of such requirements.                                                                                                                       
20     31 "Sex And The City" Guest Stars You Probably Totally Forgot About                                                                                                                                                                                             
21     The German military says a firefight at one of the gates of Kabul‚Äôs international airport killed at least one Afghan soldier.                                                                                                                                   
25     Phone-Banking Is Democrats‚Äô Best Hope For Organizing In A Pandemic. It's Also Helping Volunteers Feel Less Lonely.                                                                                   

2) Forward referencing feature



Creating a list of demonstrative pronouns, personal pronouns and adverbs, and a fucntion to detect tweets that start with such words

In [14]:
forwardref_list = ['This','That','These','Those','He','She','They','Him','Her','Them','You','Here','There','Meet','Read']

In [15]:
def forwardref_detection(text):
    text = str(text)
    if (text.startswith(tuple(forwardref_list))):
            return 1
    return 0

In [16]:
AutomatedCA_sample['forwardref_feature'] = AutomatedCA_sample['text'].apply(forwardref_detection)

The forward referencing feature is detected in 3.84% of the sample

In [17]:
AutomatedCA_sample['forwardref_feature'].value_counts(normalize=True)*100

0    96.15625
1    3.84375 
Name: forwardref_feature, dtype: float64

In [18]:
AutomatedCA_sample[AutomatedCA_sample['forwardref_feature'] == 1]['text'].head(20)

60     Here's What Everyone Wore To This Year's BET Awards                                                                                                 
80     This Is What Actual Teenagers Think About Claudia Conway                                                                                            
82     This $9 Trillion Proposal Is The Most Ambitious Climate Change Policy Yet From A Democratic Presidential Candidate                                  
91     This Is What People Promised They Would Do If Biden Won                                                                                             
96     There are new Harry Styles pics on the timeline and fans *Ariel* excited üßú‚Äç‚ôÄÔ∏è #TheBuzz                                                              
108    There is a petition calling for the @Browns to start Johnny Manziel.                                                                                
121    This is the legacy of 'movie monster' Jean-Paul 

3) Listicle feature

Function to detect tweets starting with a number

In [19]:
def listicle_detection(text):
    text = str(text)
    if re.match(r"\d", text):
            return 1
    return 0

In [20]:
AutomatedCA_sample['listicle_feature'] = AutomatedCA_sample['text'].apply(listicle_detection)

Listicles were detected in .95% of the sample

In [21]:
AutomatedCA_sample['listicle_feature'].value_counts(normalize=True)*100

0    99.054688
1    0.945312 
Name: listicle_feature, dtype: float64

In [22]:
AutomatedCA_sample[AutomatedCA_sample['listicle_feature'] == 1]['text'].head(20)

20      31 "Sex And The City" Guest Stars You Probably Totally Forgot About                                                                                              
27      86% of NBA GM's say they would sign Anthony Davis first if they were starting a franchise today.                                                                 
57      21 Absolutely Fucking Perfect Moments In Sporting Caption History\n\n                                                                                            
252     21 Quizzes Marvel Fans Should Take Right Now                                                                                                                     
271     140 women have accused former USA Gymnastics doctor Larry Nassar of abuse. His victims think we don't care.                                                      
457     72 straight pats for miami kicker just snapped. 72! Fsu up 1                                                                                  

4) Question feature


creating function to detect interrogative structure in tweets, i.e presence of a question mark

In [23]:
def question_detection(text):
    text = str(text)
    if re.search('\w[?]\s', text):
        return 1
    return 0

In [24]:
AutomatedCA_sample['question_feature'] = AutomatedCA_sample['text'].apply(question_detection)

The question feature is detected in 4.59% of the sample

In [25]:
AutomatedCA_sample['question_feature'].value_counts(normalize=True)*100

0    95.410156
1    4.589844 
Name: question_feature, dtype: float64

In [26]:
AutomatedCA_sample[AutomatedCA_sample['question_feature'] == 1]['text'].head(20)

37     #What2Watch hosts @scottygb and @DionneGrant face some blowback over their Should You Watch It? Board choices üòÖ                                                                                                                                                 
49     What makes an elite team? @CoachLindsayG\nhas a powerpoint for that. \n\nThe @USCWBB HC makes her Trojan debut tonight as USC tries to climb back to glory  \n                                                                                                  
63     Review: Is Christmas the season of Nintendo or the opiate of the masses? Two new movies chime in                                                                                                                                                                
90     hello! have you had trouble finding a good therapist? any tips that worked for you? (or if you're a therapist yourself, what advice do you have?) would love to hear from you for a story. DMs open, o

5) All caps words


Using part of speech tagging to identify capitalized proper nouns/acronyms (POS tag: NNP) in the sample, and excluding them in detection of the all caps feature

In [27]:
tag_list = []
def pos_tagging(txt):
        txt = str(txt)
        wordslist = nltk.word_tokenize(txt)
        tagged = nltk.pos_tag(wordslist)
            
        for tupl in tagged:
                if tupl[1] == "NNP":
                        tag_list.append(tupl[0])
                  
                else:
                    pass

In [28]:
AutomatedCA_sample['text'].apply(pos_tagging)
tag_list[0:20]

['Justice',
 'Amy',
 'Coney',
 'Barrett',
 'Russell',
 'Wilson',
 'Seahawks',
 'Manhattan',
 '‚Äô',
 'Trump',
 '‚Äô',
 'Jon',
 'Stewart',
 'SportsCenter',
 'Melania',
 'Trump',
 'Secretary',
 'State',
 'Pompeo',
 'RNC']

In [29]:
tag_list_upper = [w for w in tag_list if w.isupper()]
tag_list_upper = list(set(tag_list_upper))
tag_list_upper[0:20]

['ZURICH',
 'LF',
 'TCU',
 'DHS',
 'CARES',
 'BIABIA',
 'R-N.Y.',
 'XX',
 'OLE',
 'BREAKING',
 'CIA',
 'FAN',
 'CUP',
 'NAJ_TV',
 'ACL',
 'COP26',
 'HONOR',
 'MINUTES',
 'SPAC',
 'CB']

In [30]:
def capitalized_detection(text):
    text = str(text)
    
    m = re.findall(r"\b[A-Z]{2,}\b", text)
    if m:
        if any(x in m for x in tag_list_upper):
                       return 0
             
        else: 
                       return 1
            
    else:
        return 0

In [31]:
AutomatedCA_sample['capitalized_feature'] = AutomatedCA_sample['text'].apply(capitalized_detection) 

In [32]:
AutomatedCA_sample['capitalized_feature'].value_counts(normalize=True)*100

0    99.746094
1    0.253906 
Name: capitalized_feature, dtype: float64

In [33]:
AutomatedCA_sample[AutomatedCA_sample['capitalized_feature'] == 1]['text'].head(20)

177     RILEY!!!                                                                                                                                                                                                                                                                  
476     @doliver8 @Josh1Rivera @heyitsanika Hi Dav- I mean Olivia, we would love to get an exclusive on your red carpet look when the time comes. XOXO                                                                                                                            
606     Donald Trump's son last week endorsed sending an AR-15 rifle to the acquitted shooter.                                                                                                                                                                                    
1095    Under its first woman president, the AFL-CIO is pursuing big legislative goals in Washington while trying to turn around the decline in union membership.              

6) Emoji feature


extracting list of emojis from the emoji library and creating function for detection

In [34]:
emoji_list = list(emoji.UNICODE_EMOJI['en'].keys())
len(emoji_list)

4702

In [35]:
def emoji_detection(text):
    text = str(text)
    for word in text:
        if word in emoji_list:
            return 1
    return 0

In [36]:
AutomatedCA_sample['emoji_feature'] = AutomatedCA_sample['text'].apply(emoji_detection)

The emoji feature is detected in 2.89% of the sample

In [37]:
AutomatedCA_sample['emoji_feature'].value_counts(normalize=True)*100

0    97.109375
1    2.890625 
Name: emoji_feature, dtype: float64

In [38]:
AutomatedCA_sample[AutomatedCA_sample['emoji_feature'] == 1]['text'].head(20)

11     Atlanta ties it at 2-2 on Dansby Swanson‚Äôs solo home run! #WorldSeries \n\nüé•: @MLB\n                                                                                                                                          
37     #What2Watch hosts @scottygb and @DionneGrant face some blowback over their Should You Watch It? Board choices üòÖ                                                                                                               
56     .@robcorddry says he joined #TheUnicorn because of his pal Walton Goggins: "First of all, just do whatever Walton says." ü¶Ñ                                                                                                    
88     The on-screen couples we can't get enough of, and the ones we're dreaming of üòç #TheBuzz                                                                                                                                       
96     There are new Harry Styles pics on the timeline and fans *A

7) Hashtags

In [39]:
def hashtag_detection(text):
    text = str(text)
    if re.search('#\w', text):
        return 1
    return 0

In [40]:
AutomatedCA_sample['hashtag_feature'] = AutomatedCA_sample['text'].apply(hashtag_detection)

Hashtags are detected in 8.90% of the sample

In [41]:
AutomatedCA_sample['hashtag_feature'].value_counts(normalize=True)*100

0    91.097656
1    8.902344 
Name: hashtag_feature, dtype: float64

In [42]:
AutomatedCA_sample[AutomatedCA_sample['hashtag_feature'] == 1]['text'].head(20)

1      Now that's the Russell Wilson we are used to right there #Seahawks #mnf                                                                                                                                                          
10     Full interview: @BradleyWhitford discusses his new series #PerfectHarmony, whether "The West Wing" could be made today, what the Emmys are really like, and more                                                                 
11     Atlanta ties it at 2-2 on Dansby Swanson‚Äôs solo home run! #WorldSeries \n\nüé•: @MLB\n                                                                                                                                             
17     Bottom of the ninth. Tie game. Mood. #ComeTogether                                                                                                                                                                               
37     #What2Watch hosts @scottygb and @DionneGrant face some b

8) Informal punctuation feature


Using regular expressions to detect informal punctuation patterns

In [43]:
def informalpunct_detection(text):
    text = str(text)
    pattern = '[!?*.]{2,}|\*\w+\*'
    if re.search(pattern, text):
            return 1
    else:
            return 0

In [44]:
AutomatedCA_sample['informalpunct_feature'] = AutomatedCA_sample['text'].apply(informalpunct_detection) 

Informal punctuation feature is present in 1.76% of the sample

In [45]:
AutomatedCA_sample['informalpunct_feature'].value_counts(normalize=True)*100

0    98.242188
1    1.757812 
Name: informalpunct_feature, dtype: float64

In [46]:
AutomatedCA_sample[AutomatedCA_sample['informalpunct_feature'] == 1]['text'].head(20)

77      Yep...                                                                                                                                                                                                                                                             
90      hello! have you had trouble finding a good therapist? any tips that worked for you? (or if you're a therapist yourself, what advice do you have?) would love to hear from you for a story. DMs open, or you can email doliver@usatoday.com. thanks much!!          
96      There are new Harry Styles pics on the timeline and fans *Ariel* excited üßú‚Äç‚ôÄÔ∏è #TheBuzz                                                                                                                                                                             
177     RILEY!!!                                                                                                                                                                           

9) Slang feature

Two dictionaries for slang words and abbrevations commonly appearing in tweets were used to detect this feature.


The first list was taken from https://www.kaggle.com/code/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing :

In [47]:
dict1 = {
    "$" : " dollar ",
    "‚Ç¨" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [48]:
resultList = list(dict1.items())

In [49]:
df_dict1 = pd.DataFrame(resultList, columns=['slang', 'formal_translation'])

The second list was taken from: https://www.kaggle.com/datasets/gogylogy/twitterslang

In [50]:
dict2 = pd.read_csv('/Users/salmankhawar/Desktop/Thesis Publication/Paper Revisions/twitterSlang.csv')

In [51]:
df_slang = pd.concat([df_dict1, dict2])
slang_list = df_slang['slang'].tolist()

In [52]:
slang_list
len(slang_list)

331

In [53]:
def slang_detection(txt, query):
    newquery = []
    txt = str(txt).lower()

    for word in query:
        newquery.append(str(word).lower())
    tokens = re.findall(r"[\w']+", txt)
    
    for word in newquery:
        if word in tokens:
            return 1
    return 0

In [54]:
AutomatedCA_sample['slang_feature'] = AutomatedCA_sample['text'].apply(slang_detection,
                                                      args=(slang_list,))

Slang feature was detected in 5.12% of the sample

In [55]:
AutomatedCA_sample['slang_feature'].value_counts(normalize=True)*100

0    94.882812
1    5.117188 
Name: slang_feature, dtype: float64

In [56]:
AutomatedCA_sample[AutomatedCA_sample['slang_feature'] == 1]['text'].head(20)

18     Supreme Court casts doubt on Trump's bid to exclude from census immigrants in U.S. illegally                                                                                                                                                                                 
36     Prince Harry and Duchess Meghan said they asked the queen in advance to pass her childhood nickname, Lilibet, down to their new baby daughter after a BBC report said they didn't.                                                                                           
78     Judge Royce Lamberth denied release from custody for Phoenix QAnon believer Jake Angeli, who was caught on video storming the U.S. Capitol on Jan. 6.\n\n¬†¬†¬†¬†¬†¬†                                                                                                              
86     Afghan women living in the U.S. share the recipes and the memories that keep them going.                                                                    

10) Media Feature


extracting information on media accompanying tweets using metadata

In [57]:
AutomatedCA_sample['media_type'].value_counts(normalize=True)*100

no media        81.898438
photo           13.746094
video           2.734375 
animated_gif    1.621094 
Name: media_type, dtype: float64

In [58]:
def media_feature(x):
        if x == 'no media':
            return 0
        else:
            return 1

In [59]:
AutomatedCA_sample['media_dummy'] = AutomatedCA_sample['media_type'].apply(media_feature) 

The media feature is present in 18.10% of the sample

In [60]:
AutomatedCA_sample['media_dummy'].value_counts(normalize=True)*100

0    81.898438
1    18.101562
Name: media_dummy, dtype: float64

Validating automated methods against the manually coded sample (n=1440) - Table 3

In [61]:
Validation_sample = pd.read_csv('/Users/salmankhawar/Desktop/UvA Courses/Thesis/Data_Collection_Twitter/Data_csv/Data_Collected_Tweepy/ManualCA_sample.csv')

In [62]:
CA_results = pd.read_csv('/Users/salmankhawar/Desktop/UvA Courses/Thesis/Content Analysis/Results_CA.csv')

1) Hyperbole feature

In [63]:
Validation_sample['hyperbolic_feature_aca_1'] = Validation_sample['text'].apply(hyperbolic_detection1, args=(hyperbolic_list,))

In [64]:
Validation_sample['hyperbolic_feature_aca_2'] = Validation_sample['text'].apply(hyperbole_detections2)

In [65]:
hyperbole_grid_val = Validation_sample[['hyperbolic_feature_aca_1', 'hyperbolic_feature_aca_2']]

In [66]:
hyperbole_grid_val = np.array(hyperbole_grid_val)
hyperbole_grid_val = hyperbole_grid_val.tolist()
any_row_2 = [int(any(row)) for row in hyperbole_grid_val]
Validation_sample['hyperbolic_feature_aca'] = any_row_2

In [67]:
y_test_hyp = CA_results["[hyperbolic_feature]"]
y_pred_hyp = Validation_sample["hyperbolic_feature_aca"]
confusion_matrix(y_test_hyp, y_pred_hyp)

array([[1231,   64],
       [  93,   52]])

In [72]:
print("F1-score:", f1_score(y_test_hyp, y_pred_hyp, average="macro"))
print("Precision:", precision_score(y_test_hyp, y_pred_hyp, average="macro"))
print("Recall:", recall_score(y_test_hyp, y_pred_hyp, average="macro"))

F1-score: 0.6692604442337823
Precision: 0.6890170851130326
Recall: 0.6545999201171615


In [69]:
print(classification_report(y_test_hyp, y_pred_hyp))

              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1295
           1       0.45      0.36      0.40       145

    accuracy                           0.89      1440
   macro avg       0.69      0.65      0.67      1440
weighted avg       0.88      0.89      0.89      1440



In [70]:
rho, p_value = stats.spearmanr(CA_results["[hyperbolic_feature]"], Validation_sample["hyperbolic_feature_aca"])
 
print("Spearman rank correlation:", rho)

Spearman rank correlation: 0.3418890244465509


In [73]:
array_hyperbole = [CA_results["[hyperbolic_feature]"], Validation_sample["hyperbolic_feature_aca"]]

In [75]:
print("Krippendorff's alpha:", krippendorff.alpha(array_hyperbole))

Krippendorff's alpha: 0.3387505687146245


2) Forward referencing feature

In [76]:
Validation_sample['forwardref_feature_aca'] = Validation_sample['text'].apply(forwardref_detection)

In [77]:
y_test_fr = CA_results["[forw_ref_feature]"]
y_pred_fr = Validation_sample["forwardref_feature_aca"]
confusion_matrix(y_test_fr, y_pred_fr)

array([[1347,   33],
       [  35,   25]])

In [78]:
print("F1-score:", f1_score(y_test_fr, y_pred_fr, average="macro"))
print("Precision:", precision_score(y_test_fr, y_pred_fr, average="macro"))
print("Recall:", recall_score(y_test_fr, y_pred_fr, average="macro"))

F1-score: 0.6995544864320868
Precision: 0.7028544338539847
Recall: 0.696376811594203


In [79]:
print(classification_report(y_test_fr, y_pred_fr))

              precision    recall  f1-score   support

           0       0.97      0.98      0.98      1380
           1       0.43      0.42      0.42        60

    accuracy                           0.95      1440
   macro avg       0.70      0.70      0.70      1440
weighted avg       0.95      0.95      0.95      1440



In [80]:
rho, p_value = stats.spearmanr(CA_results["[forw_ref_feature]"], Validation_sample["forwardref_feature_aca"])
 
print("Spearman rank correlation:", rho)

Spearman rank correlation: 0.3991786915054091


In [93]:
array_forwref = [CA_results["[forw_ref_feature]"], Validation_sample["forwardref_feature_aca"]]

In [94]:
print("Krippendorff's alpha:", krippendorff.alpha(array_forwref))

Krippendorff's alpha: 0.399317615581929


3) Listicle feature

In [81]:
Validation_sample['listicle_feature_aca'] = Validation_sample['text'].apply(listicle_detection)

In [82]:
y_test_lst = CA_results["[listicle_feature]"]
y_pred_lst = Validation_sample["listicle_feature_aca"]
confusion_matrix(y_test_lst, y_pred_lst)

array([[1424,    3],
       [   4,    9]])

In [83]:
print("F1-score:", f1_score(y_test_lst, y_pred_lst, average="macro"))
print("Precision:", precision_score(y_test_lst, y_pred_lst, average="macro"))
print("Recall:", recall_score(y_test_lst, y_pred_lst, average="macro"))

F1-score: 0.8587740805604203
Precision: 0.8735994397759104
Recall: 0.845102689881947


In [84]:
print(classification_report(y_test_lst, y_pred_lst))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1427
           1       0.75      0.69      0.72        13

    accuracy                           1.00      1440
   macro avg       0.87      0.85      0.86      1440
weighted avg       0.99      1.00      1.00      1440



In [85]:
rho, p_value = stats.spearmanr(CA_results["[listicle_feature]"], Validation_sample["listicle_feature_aca"])
 
print("Spearman rank correlation:", rho)

Spearman rank correlation: 0.7181369551974195


In [95]:
array_listicle = [CA_results["[listicle_feature]"], Validation_sample["listicle_feature_aca"]]

In [96]:
 print("Krippendorff's alpha:", krippendorff.alpha(array_listicle))

Krippendorff's alpha: 0.7176462346760071


4) Question feature

In [86]:
Validation_sample['question_feature_aca'] = Validation_sample['text'].apply(question_detection)

In [87]:
y_test_q = CA_results["[question_feature]"]
y_pred_q = Validation_sample["question_feature_aca"]
confusion_matrix(y_test_q, y_pred_q)

array([[1358,    2],
       [  11,   69]])

In [88]:
print("F1-score:", f1_score(y_test_q, y_pred_q, average="macro"))
print("Precision:", precision_score(y_test_q, y_pred_q, average="macro"))
print("Recall:", recall_score(y_test_q, y_pred_q, average="macro"))

F1-score: 0.9545718175398406
Precision: 0.9818979619131885
Recall: 0.930514705882353


In [89]:
print(classification_report(y_test_q, y_pred_q))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      1360
           1       0.97      0.86      0.91        80

    accuracy                           0.99      1440
   macro avg       0.98      0.93      0.95      1440
weighted avg       0.99      0.99      0.99      1440



In [90]:
rho, p_value = stats.spearmanr(CA_results["[question_feature]"], Validation_sample["question_feature_aca"])
 
print("Spearman rank correlation:", rho)

Spearman rank correlation: 0.9109646740425484


In [91]:
array_question = [CA_results["[question_feature]"], Validation_sample["question_feature_aca"]]

In [97]:
print("Krippendorff's alpha:", krippendorff.alpha(array_question))  

Krippendorff's alpha: 0.909175182428612


5) All caps feature

In [98]:
Validation_sample['capitalized_feature_aca'] = Validation_sample['text'].apply(capitalized_detection) 

In [99]:
y_test_cap = CA_results["[cap_feature]"]
y_pred_cap = Validation_sample["capitalized_feature_aca"]
confusion_matrix(y_test_cap, y_pred_cap)

array([[1391,    5],
       [  39,    5]])

In [100]:
print("F1-score:", f1_score(y_test_cap, y_pred_cap, average="macro"))
print("Precision:", precision_score(y_test_cap, y_pred_cap, average="macro"))
print("Recall:", recall_score(y_test_cap, y_pred_cap, average="macro"))

F1-score: 0.5848077376739798
Precision: 0.7363636363636363
Recall: 0.555027350872623


In [101]:
print(classification_report(y_test_cap, y_pred_cap))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      1396
           1       0.50      0.11      0.19        44

    accuracy                           0.97      1440
   macro avg       0.74      0.56      0.58      1440
weighted avg       0.96      0.97      0.96      1440



In [103]:
array_caps = [CA_results["[cap_feature]"], Validation_sample["capitalized_feature_aca"]]

In [104]:
print("Krippendorff's alpha:", krippendorff.alpha(array_caps))

Krippendorff's alpha: 0.169903803307908


6) Emoji feature

In [105]:
Validation_sample['emoji_feature_aca'] = Validation_sample['text'].apply(emoji_detection) 

In [106]:
y_test_emoji = CA_results["[emoji_feature]"]
y_pred_emoji = Validation_sample["emoji_feature_aca"]
confusion_matrix(y_test_emoji, y_pred_emoji)

array([[1378,    1],
       [  16,   45]])

In [107]:
print("F1-score:", f1_score(y_test_emoji, y_pred_emoji, average="macro"))
print("Precision:", precision_score(y_test_emoji, y_pred_emoji, average="macro"))
print("Recall:", recall_score(y_test_emoji, y_pred_emoji, average="macro"))

F1-score: 0.91749547539525
Precision: 0.9833915538643878
Recall: 0.8684898774355378


In [108]:
print(classification_report(y_test_emoji, y_pred_emoji))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1379
           1       0.98      0.74      0.84        61

    accuracy                           0.99      1440
   macro avg       0.98      0.87      0.92      1440
weighted avg       0.99      0.99      0.99      1440



In [109]:
rho, p_value = stats.spearmanr(CA_results["[emoji_feature]"], Validation_sample["emoji_feature_aca"])
 
print("Spearman rank correlation:", rho)

Spearman rank correlation: 0.8440969006858452


In [110]:
array_emoji = [CA_results["[emoji_feature]"], Validation_sample["emoji_feature_aca"]]

In [111]:
print("Krippendorff's alpha:", krippendorff.alpha(array_emoji))

Krippendorff's alpha: 0.8350482455992532


7) Hashtag feature

In [112]:
Validation_sample['hashtag_feature_aca'] = Validation_sample['text'].apply(hashtag_detection)

In [113]:
y_test_hash = CA_results["[hashtag_feature]"]
y_pred_hash = Validation_sample["hashtag_feature_aca"]
confusion_matrix(y_test_hash, y_pred_hash)

array([[1315,    3],
       [   0,  122]])

In [114]:
print("F1-score:", f1_score(y_test_hash, y_pred_hash, average="macro"))
print("Precision:", precision_score(y_test_hash, y_pred_hash, average="macro"))
print("Recall:", recall_score(y_test_hash, y_pred_hash, average="macro"))

F1-score: 0.9933574331399506
Precision: 0.988
Recall: 0.9988619119878603


In [115]:
print(classification_report(y_test_hash, y_pred_hash))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1318
           1       0.98      1.00      0.99       122

    accuracy                           1.00      1440
   macro avg       0.99      1.00      0.99      1440
weighted avg       1.00      1.00      1.00      1440



In [116]:
rho, p_value = stats.spearmanr(CA_results["[hashtag_feature]"], Validation_sample["hashtag_feature_aca"])
 
print("Spearman rank correlation:", rho)

Spearman rank correlation: 0.9868021342702413


In [117]:
array_hashtag = [CA_results["[hashtag_feature]"], Validation_sample["hashtag_feature_aca"]]

In [118]:
print("Krippendorff's alpha:", krippendorff.alpha(array_hashtag))

Krippendorff's alpha: 0.986719479173554


8) Informal punctuation feature

In [119]:
Validation_sample['informalpunct_feature_aca'] = Validation_sample['text'].apply(informalpunct_detection)

In [120]:
y_test_punct = CA_results["[punct_feature]"]
y_pred_punct = Validation_sample["informalpunct_feature_aca"]
confusion_matrix(y_test_punct, y_pred_punct)

array([[1402,    1],
       [  22,   15]])

In [121]:
print("F1-score:", f1_score(y_test_punct, y_pred_punct, average="macro"))
print("Precision:", precision_score(y_test_punct, y_pred_punct, average="macro"))
print("Recall:", recall_score(y_test_punct, y_pred_punct, average="macro"))

F1-score: 0.77895095140525
Precision: 0.9610252808988764
Recall: 0.7023463235152473


In [122]:
print(classification_report(y_test_punct, y_pred_punct))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1403
           1       0.94      0.41      0.57        37

    accuracy                           0.98      1440
   macro avg       0.96      0.70      0.78      1440
weighted avg       0.98      0.98      0.98      1440



In [123]:
rho, p_value = stats.spearmanr(CA_results["[punct_feature]"], Validation_sample["informalpunct_feature_aca"])
 
print("Spearman rank correlation:", rho)

Spearman rank correlation: 0.6108576614481374


In [124]:
array_punct = [CA_results["[punct_feature]"], Validation_sample["informalpunct_feature_aca"]]

In [125]:
print("Krippendorff's alpha:", krippendorff.alpha(array_punct))

Krippendorff's alpha: 0.5580554090942462


9) Slang feature

In [126]:
Validation_sample['slang_feature_aca'] = Validation_sample['text'].apply(slang_detection,args=(slang_list,))

In [127]:
y_test_slang = CA_results["[slang_feature]"]
y_pred_slang = Validation_sample["slang_feature_aca"]
confusion_matrix(y_test_slang, y_pred_slang)

array([[1291,   65],
       [  68,   16]])

In [128]:
print("F1-score:", f1_score(y_test_slang, y_pred_slang, average="macro"))
print("Precision:", precision_score(y_test_slang, y_pred_slang, average="macro"))
print("Recall:", recall_score(y_test_slang, y_pred_slang, average="macro"))

F1-score: 0.5724761426418885
Precision: 0.5737470362194423
Recall: 0.5712705436156764


In [129]:
print(classification_report(y_test_slang, y_pred_slang))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1356
           1       0.20      0.19      0.19        84

    accuracy                           0.91      1440
   macro avg       0.57      0.57      0.57      1440
weighted avg       0.91      0.91      0.91      1440



In [130]:
rho, p_value = stats.spearmanr(CA_results["[slang_feature]"], Validation_sample["slang_feature_aca"])
 
print("Spearman rank correlation:", rho)

Spearman rank correlation: 0.14499643252721264


In [131]:
array_slang = [CA_results["[slang_feature]"], Validation_sample["slang_feature_aca"]]

In [132]:
print("Krippendorff's alpha:", krippendorff.alpha(array_slang))

Krippendorff's alpha: 0.14524917685138683


Hypothesis Testing


Creating dummy variable for outlet type (1=tweets published by online-native outlets, 0=tweets published by legacy outlets)

In [133]:
legacy_outlets = ['USA TODAY Politics', 'USA TODAY Sports', 'USA TODAY Life', 'L.A. Times Politics', 'L.A. Times Sports', 'LAT Entertainment']
regex_legacy_outlets = re.compile("(?=(" + "|".join(map(re.escape, legacy_outlets)) + "))")

In [134]:
def news_outlet_classification(noc):
    noc = str(noc)
    m=re.search(regex_legacy_outlets, noc)
    if m:
        return 0
    else:
        return 1

In [135]:
AutomatedCA_sample['outlet_type'] = AutomatedCA_sample['twitter_account'].apply(news_outlet_classification) 

In [136]:
AutomatedCA_sample['outlet_type'].value_counts()

1    12800
0    12800
Name: outlet_type, dtype: int64

Creating dummy variable for presence of sensationalism in tweets (1=sensationalist feature present, 0=sensationalist feature not present)

In [137]:
features_grid = AutomatedCA_sample[['hyperbolic_feature', 'slang_feature', 'listicle_feature', 'emoji_feature', 'question_feature', 'hashtag_feature', 'forwardref_feature', 'informalpunct_feature', 'capitalized_feature', 'media_dummy']]

In [138]:
features_grid = np.array(features_grid)
features_grid = features_grid.tolist()
any_row = [int(any(row)) for row in features_grid]
AutomatedCA_sample['combined_features_dummy'] = any_row

Sensationalist feature/s present in 40.14% of tweets in sample

In [139]:
AutomatedCA_sample['combined_features_dummy'].value_counts(normalize=True)*100

0    59.855469
1    40.144531
Name: combined_features_dummy, dtype: float64

Creating count variable with number of features in each tweet

In [140]:
count_variable = np.sum(features_grid ,axis=1).tolist()

In [141]:
AutomatedCA_sample['count_variable'] = count_variable
AutomatedCA_sample['count_variable'].value_counts(normalize=True)*100

0    59.855469
1    29.488281
2    8.406250 
3    1.902344 
4    0.292969 
5    0.050781 
6    0.003906 
Name: count_variable, dtype: float64

Creating variable for channel type (1=tweets published in entertainment-oriented news sub-channels i.e sports,entertainment/life, 0=tweets published in politics-oriented news sub-channels)

In [142]:
def pol_ent_classification(x):
    x = str(x)
    if re.search('(Politics)', x):
        return 0
    else:
        return 1

In [143]:
AutomatedCA_sample['channel_type'] = AutomatedCA_sample['twitter_account'].apply(pol_ent_classification) 

In [144]:
AutomatedCA_sample['channel_type'].value_counts()

1    12800
0    12800
Name: channel_type, dtype: int64

Creating dummy variable for tweet length

In [145]:
def length_tweet(x):
    x = str(x)
    return len(x)

In [146]:
AutomatedCA_sample['len_tweet'] = AutomatedCA_sample['text'].apply(length_tweet) 

Testing H1: Running Logistic Regression using dummy variable for presence of sensationalist feature/s in tweets as DV and outlet type (main variable of interest), channel type and tweet length as IVs (Table B1)

In [147]:
features_h1 = ['outlet_type','channel_type','len_tweet']

In [148]:
logit_model = sm.Logit(AutomatedCA_sample['combined_features_dummy'], sm.add_constant(AutomatedCA_sample[features_h1]))
ACAreg_modelH1a = logit_model.fit()
print(ACAreg_modelH1a.summary())

Optimization terminated successfully.
         Current function value: 0.618043
         Iterations 5
                              Logit Regression Results                             
Dep. Variable:     combined_features_dummy   No. Observations:                25600
Model:                               Logit   Df Residuals:                    25596
Method:                                MLE   Df Model:                            3
Date:                     Tue, 19 Dec 2023   Pseudo R-squ.:                 0.08247
Time:                             02:18:18   Log-Likelihood:                -15822.
converged:                            True   LL-Null:                       -17244.
Covariance Type:                 nonrobust   LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -1.6387      0.043    -38.323      0.000      -1

  return ptp(axis=axis, out=out, **kwargs)


Poisson regression using count variable (aggregate of sensationalist features present in each tweet) (Table B1)

In [149]:
Poisson_Regression_H1 = sm.Poisson(AutomatedCA_sample['count_variable'], sm.add_constant(AutomatedCA_sample[features_h1]))
ACApoiss_modelH1 = Poisson_Regression_H1.fit()
print(ACApoiss_modelH1.summary())

Optimization terminated successfully.
         Current function value: 0.904451
         Iterations 6
                          Poisson Regression Results                          
Dep. Variable:         count_variable   No. Observations:                25600
Model:                        Poisson   Df Residuals:                    25596
Method:                           MLE   Df Model:                            3
Date:                Tue, 19 Dec 2023   Pseudo R-squ.:                 0.07114
Time:                        02:18:45   Log-Likelihood:                -23154.
converged:                       True   LL-Null:                       -24927.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -1.6358      0.029    -56.404      0.000      -1.693      -1.579
outlet_type      0.4883

H2: Sensationalism and User Engagement


Running Poisson Regression using favorite count as DV and the number of sensationalist features (count variable - main variable of interest), outlet type, channel type and tweet length as IVs (Table C1)

In [150]:
features_h2a = ['count_variable','outlet_type','channel_type','len_tweet']

In [152]:
Poisson_Regression_H2a = sm.Poisson(AutomatedCA_sample['favorite_count'], sm.add_constant(AutomatedCA_sample[features_h2a]))
ACApoiss_modelH2a = Poisson_Regression_H2a.fit()
print (ACApoiss_modelH2a.summary())

Optimization terminated successfully.
         Current function value: 31.761551
         Iterations 6
                          Poisson Regression Results                          
Dep. Variable:         favorite_count   No. Observations:                25600
Model:                        Poisson   Df Residuals:                    25595
Method:                           MLE   Df Model:                            4
Date:                Tue, 19 Dec 2023   Pseudo R-squ.:                 0.05044
Time:                        02:22:53   Log-Likelihood:            -8.1310e+05
converged:                       True   LL-Null:                   -8.5628e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const              2.0859      0.006    350.020      0.000       2.074       2.098
count_variable  

Running Poisson Regression using retweet count as DV and the number of sensationalist features (count variable - main variable of interest), outlet type, channel type and tweet length as IVs (Table C1)

In [153]:
Poisson_Regression_H2b = sm.Poisson(AutomatedCA_sample['retweet_count'], sm.add_constant(AutomatedCA_sample[features_h2a]))
ACApoiss_modelH2b = Poisson_Regression_H2b.fit()
print (ACApoiss_modelH2b.summary())

Optimization terminated successfully.
         Current function value: 110.559087
         Iterations 7
                          Poisson Regression Results                          
Dep. Variable:          retweet_count   No. Observations:                25600
Model:                        Poisson   Df Residuals:                    25595
Method:                           MLE   Df Model:                            4
Date:                Tue, 19 Dec 2023   Pseudo R-squ.:                  0.1761
Time:                        02:26:14   Log-Likelihood:            -2.8303e+06
converged:                       True   LL-Null:                   -3.4351e+06
Covariance Type:            nonrobust   LLR p-value:                     0.000
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
const              1.9103      0.005    364.268      0.000       1.900       1.921
count_variable 

Running Poisson Regression using favorite count as DV and dummy variable for presence of sensationalist feature/s (main variable of interest), outlet type, channel type and tweet length as IVs (Table C2)

In [155]:
features_h2b = ['combined_features_dummy','outlet_type','channel_type','len_tweet']

In [156]:
Poisson_Regression_H2c = sm.Poisson(AutomatedCA_sample['favorite_count'], sm.add_constant(AutomatedCA_sample[features_h2b]))
ACApoiss_modelH2c = Poisson_Regression_H2c.fit()
print(ACApoiss_modelH2c.summary())

Optimization terminated successfully.
         Current function value: 31.831094
         Iterations 6
                          Poisson Regression Results                          
Dep. Variable:         favorite_count   No. Observations:                25600
Model:                        Poisson   Df Residuals:                    25595
Method:                           MLE   Df Model:                            4
Date:                Tue, 19 Dec 2023   Pseudo R-squ.:                 0.04836
Time:                        02:32:55   Log-Likelihood:            -8.1488e+05
converged:                       True   LL-Null:                   -8.5628e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                       2.0884      0.006    348.217      0.000       2.077  

Running Poisson Regression using retweet count as DV and dummy variable for presence of sensationalist feature/s (main variable of interest), outlet type, channel type and tweet length as IVs (Table C2)

In [158]:
Poisson_Regression_H2d = sm.Poisson(AutomatedCA_sample['retweet_count'], sm.add_constant(AutomatedCA_sample[features_h2b]))
ACApoiss_modelH2d = Poisson_Regression_H2d.fit()
print(ACApoiss_modelH2d.summary())

Optimization terminated successfully.
         Current function value: 110.597840
         Iterations 7
                          Poisson Regression Results                          
Dep. Variable:          retweet_count   No. Observations:                25600
Model:                        Poisson   Df Residuals:                    25595
Method:                           MLE   Df Model:                            4
Date:                Tue, 19 Dec 2023   Pseudo R-squ.:                  0.1758
Time:                        02:34:35   Log-Likelihood:            -2.8313e+06
converged:                       True   LL-Null:                   -3.4351e+06
Covariance Type:            nonrobust   LLR p-value:                     0.000
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                       1.8353      0.005    348.088      0.000       1.825 

Running Poisson Regression using favorite count as DV and individual sensationalist features as IVs controlling for outlet type, channel type, and tweet length (Table C3)

In [159]:
aca_features_indiv = ['hyperbolic_feature','slang_feature','listicle_feature','emoji_feature','question_feature','hashtag_feature','informalpunct_feature','forwardref_feature','media_dummy','capitalized_feature', 'channel_type','outlet_type','len_tweet']

In [160]:
Poisson_Regression_H2e = sm.Poisson(AutomatedCA_sample['favorite_count'], sm.add_constant(AutomatedCA_sample[aca_features_indiv]))
ACApoiss_modelH2e = Poisson_Regression_H2e.fit()
print (ACApoiss_modelH2e.summary())

Optimization terminated successfully.
         Current function value: 31.326948
         Iterations 6
                          Poisson Regression Results                          
Dep. Variable:         favorite_count   No. Observations:                25600
Model:                        Poisson   Df Residuals:                    25586
Method:                           MLE   Df Model:                           13
Date:                Tue, 19 Dec 2023   Pseudo R-squ.:                 0.06343
Time:                        02:37:09   Log-Likelihood:            -8.0197e+05
converged:                       True   LL-Null:                   -8.5628e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     2.0539      0.006    333.445      0.000       2.042       2

Running Poisson Regression using retweet count as DV and individual sensationalist features as IVs controlling for outlet type, channel type, and tweet length (Table C3)

In [161]:
Poisson_Regression_H2f = sm.Poisson(AutomatedCA_sample['retweet_count'], sm.add_constant(AutomatedCA_sample[aca_features_indiv]))
ACApoiss_modelH2f = Poisson_Regression_H2f.fit()
print (ACApoiss_modelH2f.summary())

Optimization terminated successfully.
         Current function value: 108.068670
         Iterations 8
                          Poisson Regression Results                          
Dep. Variable:          retweet_count   No. Observations:                25600
Model:                        Poisson   Df Residuals:                    25586
Method:                           MLE   Df Model:                           13
Date:                Tue, 19 Dec 2023   Pseudo R-squ.:                  0.1946
Time:                        02:40:36   Log-Likelihood:            -2.7666e+06
converged:                       True   LL-Null:                   -3.4351e+06
Covariance Type:            nonrobust   LLR p-value:                     0.000
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                     1.8523      0.005    349.588      0.000       1.842       