In [1]:
import pandas as pd
import re
import csv
import numpy as np
from collections import Counter
import nltk
from nltk.corpus import stopwords
import emoji
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
pd.options.display.max_rows
pd.set_option('display.max_colwidth', -1)

Reading file 'AutomatedCA_sample.csv' containing sample for automated content analysis 

In [3]:
AutomatedCA_sample = pd.read_csv('/Users/salmankhawar/Desktop/Thesis/Content Analysis/AutomatedCA_sample.csv')

Detecting Sensationalist Features

Hyperbolic feature: 

getting list of hyperbolic words/phrases from 'Hyperbolic Words.txt' file

In [4]:
hyperbolic_words = open("/Users/salmankhawar/Desktop/Thesis/Content Analysis/Hyperbolic Words.txt", "r")
hyperbolic_words_list = hyperbolic_words.readlines()

hyperbolic_list=[]
for words in hyperbolic_words_list:
    hyperbolic_list.append(words.strip())
    
print(hyperbolic_list)   

['A Single', 'Absolutely', 'Amazing', 'Awesome', 'Best', 'Breathtaking', 'But what happened next', 'Can change your life', "Can't Even Handle", "Can't Handle", 'Cannot Even Handle', "Doesn't want you to see", 'Epic', 'Everything You Need To Know', 'Gasp-Worthy', 'Go Viral', 'Greatest', 'Incredible', 'Infuriate', 'Literally', 'Mind Blowing', 'Mind BLOWN', 'Mind Blown', 'Need To Visit Before You Die', 'Nothing Could Prepare Me For', 'Of All Time', 'OMG', 'One Weird Trick', 'Perfection', 'Priceless', 'Prove', 'Right Now', 'Scientific Reasons', 'Shocked', 'Shocking', 'Simple Lessons', "Stop What You're Doing", 'TERRIFYING', 'Terrifying', 'That Will Make You Rethink', "The World's Best", 'This Is What Happens', 'Totally blew my mind', 'Unbelievable', 'Unimaginable', 'WHAT?', 'Whoa', 'WHOA', 'Whoah', 'Will Blow Your Mind', 'Will Change Your Life Forever', 'Won the Internet', 'Wonderful', 'Worst', 'Wow', 'WOW', "You Didn't Know Exist", "You Didn't Know Existed", "You Won't Believe", 'Have To 

In [5]:
def hyperbolic_detection(text, query):
    text = str(text).lower()
    newquery = []
    for word in query:
        newquery.append(str(word).lower())
    tokens = re.findall(r"[\w']+|[.,!?;$@#]", text)
    
    for word in newquery:
        if word in tokens:
            return 1
    return 0

In [6]:
AutomatedCA_sample['hyperbolic_feature'] = AutomatedCA_sample['text'].apply(hyperbolic_detection,
                                                      args=(hyperbolic_list,))

The hyperbolic feature is found in 2.56% of the sample

In [7]:
AutomatedCA_sample['hyperbolic_feature'].value_counts(normalize=True)*100

0    97.4375
1    2.5625 
Name: hyperbolic_feature, dtype: float64

In [8]:
AutomatedCA_sample[AutomatedCA_sample['hyperbolic_feature'] == 1]['text'].head(10)

25     Phone-Banking Is Democrats’ Best Hope For Organizing In A Pandemic. It's Also Helping Volunteers Feel Less Lonely.                                          
45     Proud middle children share the reasons they’re absolutely sure they’re the best (even if their diplomatic parents won’t openly admit it).                  
57     21 Absolutely Fucking Perfect Moments In Sporting Caption History\n\n                                                                                       
92     Whoa! Megan Fox showed up in a nearly invisible dress for the #VMAs.                                                                                        
131    This is the greatest thing on the internet right now.                                                                                                       
201    Best Actor in a Motion Picture Drama: Rami Malek for #BohemianRhapsody (Will he thank Bryan Singer???)                                                      
213    The CEO o

Slang feature: 

extracting list of slang words from 'Slang.txt' file

In [9]:
slang_words = open("/Users/salmankhawar/Desktop/Thesis/Content Analysis/Slang.txt", "r")
slang_words_list = slang_words.readlines()

slang_list=[]
for words in slang_words_list:
    slang_list.append(words.strip().split('=')[0])
       
print(slang_list[:100])        
len(slang_list)

['?', '?', '?4U', ';S', '^^', '<3', '<3', '<33', '@TEOTD', '.02', '1TG, 2TG', '1UP', '121', '1337', '143', '1432', '14AA41', '182', '19', '10M', '10X', '10Q', '1CE', '1DR', '1NAM', '2', '20', '2B', '2EZ', '2G2BT', '2M2H', '2MI', '2MOR', '2MORO', '2M2H', '2N8', '2NTE', '4', '411', '404', '411', '420', '420', '459', '4AO', '4COL', '4EAE', '4EVA', '4NR', '^5', '511', '555', '55555', '55555', '6Y', '7K', '81', '831', '86', '88', '88', '9', '*s*', '*w*', 'A3', 'AA', 'AA', 'AA', 'AAF', 'AAF', 'AAK', 'AAK', 'AAMOF', 'AAMOI', 'AAP', 'AAR', 'AAS', 'AASHTA', 'AATK', 'AAYF', 'ABBR', 'ABC', 'ABD', 'ABT', 'ABT2', 'ABTA', 'ABU', 'AC', 'ACC', 'ACD', 'ACDNT', 'ACE', 'ACK', 'ACPT', 'ACQSTN', 'ADAD', 'ADBB', 'ADD', 'ADDY', 'ADIH']


1506

Extracting 200 most commonly appearing words, and filtering out the ones that appear in the slang list

In [10]:
texts = AutomatedCA_sample['text'].tolist()

In [11]:
total_words = Counter()
for text in texts:
    text = str(text).lower()
    text = text.split(' ')
    newtext = []
    for item in text:
            newtext.append(item)
            
    newtext = ' '.join(newtext)
        
    tokens = re.findall(r"[\w']{2,}", newtext)
    for token in tokens:
        total_words[token] += 1
        


In [12]:
most_common_list = total_words.most_common(200)

words_without_count = []

for a_tuple in most_common_list:
       words_without_count.append(a_tuple[0].upper())
print(words_without_count)


['THE', 'TO', 'OF', 'AND', 'IN', 'FOR', 'ON', 'IS', 'WITH', 'THAT', 'HIS', 'TRUMP', 'AT', 'IT', 'ARE', 'AS', 'FROM', 'WAS', 'HAS', 'ABOUT', 'HE', 'NEW', 'THIS', 'BE', 'BIDEN', 'WILL', 'AFTER', 'HAVE', 'YOU', 'BUT', 'SAID', 'AN', 'BY', 'PRESIDENT', 'NOT', 'HER', 'WHO', 'WHAT', 'OUT', 'FIRST', 'THEIR', 'OVER', 'ONE', 'MORE', 'HOW', 'UP', 'HOUSE', 'THEY', 'SAYS', 'ALL', 'NOW', 'JUST', 'WE', 'COVID', 'FORMER', 'COURT', 'PEOPLE', 'BEEN', 'CAN', 'TIME', 'OR', 'NO', '19', 'YEAR', 'JOE', 'GET', 'SHE', 'AGAINST', 'GAME', 'STATE', 'IF', 'DEMOCRATS', 'SOME', 'BACK', 'LIKE', 'SO', 'WHEN', 'HAD', 'TWO', "IT'S", 'INTO', 'WERE', 'COULD', 'ITS', 'ELECTION', 'WOULD', 'HIM', 'LAST', 'YOUR', 'WHITE', 'MY', 'THAN', 'US', 'SAY', 'WEEK', 'SEASON', 'DO', 'SUPREME', "HERE'S", 'WIN', 'DAY', 'SENATE', 'MOST', 'HERE', 'WORLD', 'OUR', 'REPUBLICAN', 'DOWN', 'BEFORE', 'BILL', 'SHOW', 'OFF', 'DURING', 'KNOW', 'FEDERAL', 'VIA', 'MAKE', 'YEARS', 'TEAM', 'STILL', 'WHY', 'OTHER', 'CAPITOL', "TRUMP'S", 'DONALD', 'THERE',

In [13]:
slang_list = [e for e in slang_list if e not in words_without_count]

In [14]:
slang_list = [s for s in slang_list if len(s) > 1]

In [15]:
def slang_detection(text, query):
    newquery = []
    text = str(text).lower()

    for word in query:
        newquery.append(str(word).lower())
    tokens = re.findall(r"[\w']+", text)
    
    for word in newquery:
        if word in tokens:
            return 1
    return 0

In [16]:
AutomatedCA_sample['slang_feature'] = AutomatedCA_sample['text'].apply(slang_detection,
                                                      args=(slang_list,))

The slang feature is detected in 5.68% of the sample

In [17]:
AutomatedCA_sample['slang_feature'].value_counts(normalize=True)*100

0    94.324219
1    5.675781 
Name: slang_feature, dtype: float64

In [18]:
AutomatedCA_sample[AutomatedCA_sample['slang_feature'] == 1]['text'].head(10)

6      "My wife looked over at me and said she wished we had graduation hats to throw into the air."                                                                                                                             
13     LeBron James has 39 points in 43 minutes to lead Lakers to 124-116 OT win over Pacers.                                                                                                                                    
27     86% of NBA GM's say they would sign Anthony Davis first if they were starting a franchise today.                                                                                                                          
36     Prince Harry and Duchess Meghan said they asked the queen in advance to pass her childhood nickname, Lilibet, down to their new baby daughter after a BBC report said they didn't.                                        
66     'Sadly, he is who he is.' Hillary Clinton criticizes Trump in DNC speech promoting Biden 

Listicle feature

In [19]:
def listicle_detection(text):
    text = str(text)
    if re.match(r"^\d", text):
            return 1
    return 0

In [20]:
AutomatedCA_sample['listicle_feature'] = AutomatedCA_sample['text'].apply(listicle_detection)

The listicle feature is detected in 0.95% of the sample

In [21]:
AutomatedCA_sample['listicle_feature'].value_counts(normalize=True)*100

0    99.054688
1    0.945312 
Name: listicle_feature, dtype: float64

In [22]:
AutomatedCA_sample[AutomatedCA_sample['listicle_feature'] == 1]['text'].head(10)

20     31 "Sex And The City" Guest Stars You Probably Totally Forgot About                                                                                             
27     86% of NBA GM's say they would sign Anthony Davis first if they were starting a franchise today.                                                                
57     21 Absolutely Fucking Perfect Moments In Sporting Caption History\n\n                                                                                           
252    21 Quizzes Marvel Fans Should Take Right Now                                                                                                                    
271    140 women have accused former USA Gymnastics doctor Larry Nassar of abuse. His victims think we don't care.                                                     
457    72 straight pats for miami kicker just snapped. 72! Fsu up 1                                                                                             

Emoji feature

In [23]:
emoji_list = list(emoji.UNICODE_EMOJI['en'].keys())
len(emoji_list)

4702

In [24]:
def emoji_detection(text):
    text = str(text)
    for word in text:
        if word in emoji_list:
            return 1
    return 0

In [25]:
AutomatedCA_sample['emoji_feature'] = AutomatedCA_sample['text'].apply(emoji_detection)

The emoji feature is detected in 2.89% of the sample

In [26]:
AutomatedCA_sample['emoji_feature'].value_counts(normalize=True)*100

0    97.109375
1    2.890625 
Name: emoji_feature, dtype: float64

In [27]:
AutomatedCA_sample[AutomatedCA_sample['emoji_feature'] == 1]['text'].head(10)

11     Atlanta ties it at 2-2 on Dansby Swanson’s solo home run! #WorldSeries \n\n🎥: @MLB\n                                                                                                                                          
37     #What2Watch hosts @scottygb and @DionneGrant face some blowback over their Should You Watch It? Board choices 😅                                                                                                               
56     .@robcorddry says he joined #TheUnicorn because of his pal Walton Goggins: "First of all, just do whatever Walton says." 🦄                                                                                                    
88     The on-screen couples we can't get enough of, and the ones we're dreaming of 😍 #TheBuzz                                                                                                                                       
96     There are new Harry Styles pics on the timeline and fans *Ariel* excited 

Question feature

In [28]:
def question_detection(text):
    text = str(text)
    if re.search('\w[?]\s', text):
        return 1
    return 0

In [29]:
AutomatedCA_sample['question_feature'] = AutomatedCA_sample['text'].apply(question_detection)

The question feature is detected in 4.59% of the sample

In [30]:
AutomatedCA_sample['question_feature'].value_counts(normalize=True)*100

0    95.410156
1    4.589844 
Name: question_feature, dtype: float64

In [31]:
AutomatedCA_sample[AutomatedCA_sample['question_feature'] == 1]['text'].head(10)

37     #What2Watch hosts @scottygb and @DionneGrant face some blowback over their Should You Watch It? Board choices 😅                                                                                                                                          
49     What makes an elite team? @CoachLindsayG\nhas a powerpoint for that. \n\nThe @USCWBB HC makes her Trojan debut tonight as USC tries to climb back to glory  \n                                                                                           
63     Review: Is Christmas the season of Nintendo or the opiate of the masses? Two new movies chime in                                                                                                                                                         
90     hello! have you had trouble finding a good therapist? any tips that worked for you? (or if you're a therapist yourself, what advice do you have?) would love to hear from you for a story. DMs open, or you can email doliver@

Hashtag feature

In [32]:
def hashtag_detection(text):
    text = str(text)
    if re.search('#\w', text):
        return 1
    return 0

In [33]:
AutomatedCA_sample['hashtag_feature'] = AutomatedCA_sample['text'].apply(hashtag_detection)

The hashtag feature is detected in 8.90% of the sample

In [34]:
AutomatedCA_sample['hashtag_feature'].value_counts(normalize=True)*100

0    91.097656
1    8.902344 
Name: hashtag_feature, dtype: float64

In [35]:
AutomatedCA_sample[AutomatedCA_sample['hashtag_feature'] == 1]['text'].head(10)

1     Now that's the Russell Wilson we are used to right there #Seahawks #mnf                                                                                                                                                          
10    Full interview: @BradleyWhitford discusses his new series #PerfectHarmony, whether "The West Wing" could be made today, what the Emmys are really like, and more                                                                 
11    Atlanta ties it at 2-2 on Dansby Swanson’s solo home run! #WorldSeries \n\n🎥: @MLB\n                                                                                                                                             
17    Bottom of the ninth. Tie game. Mood. #ComeTogether                                                                                                                                                                               
37    #What2Watch hosts @scottygb and @DionneGrant face some blowback ov

Informal punctuation feature

In [36]:
def informalpunct_detection(text):
    text = str(text)
    pattern = '[!?*.]{2,}|\*\w+\*'
    if re.search(pattern, text):
            return 1
    else:
            return 0

In [37]:
AutomatedCA_sample['informalpunct_feature'] = AutomatedCA_sample['text'].apply(informalpunct_detection) 

The informal punctuation feature is detected in 1.76% of the sample

In [38]:
AutomatedCA_sample['informalpunct_feature'].value_counts(normalize=True)*100

0    98.242188
1    1.757812 
Name: informalpunct_feature, dtype: float64

In [39]:
AutomatedCA_sample[AutomatedCA_sample['informalpunct_feature'] == 1]['text'].head(10)

77     Yep...                                                                                                                                                                                                                                                         
90     hello! have you had trouble finding a good therapist? any tips that worked for you? (or if you're a therapist yourself, what advice do you have?) would love to hear from you for a story. DMs open, or you can email doliver@usatoday.com. thanks much!!      
96     There are new Harry Styles pics on the timeline and fans *Ariel* excited 🧜‍♀️ #TheBuzz                                                                                                                                                                         
177    RILEY!!!                                                                                                                                                                                                    

Forward referencing

In [40]:
forwardref_list = ['This','That','These','Those','He','She','They','Him','Her','Them','You','Here','There']

In [41]:
def forwardref_detection(text):
    text = str(text)
    if (text.startswith(tuple(forwardref_list))):
            return 1
    return 0

In [42]:
AutomatedCA_sample['forwardref_feature'] = AutomatedCA_sample['text'].apply(forwardref_detection) 

The forward referencing feature is detected in 3.70% of the sample

In [43]:
AutomatedCA_sample['forwardref_feature'].value_counts(normalize=True)*100

0    96.300781
1    3.699219 
Name: forwardref_feature, dtype: float64

In [44]:
AutomatedCA_sample[AutomatedCA_sample['forwardref_feature'] == 1]['text'].head(10)

60     Here's What Everyone Wore To This Year's BET Awards                                                                
80     This Is What Actual Teenagers Think About Claudia Conway                                                           
82     This $9 Trillion Proposal Is The Most Ambitious Climate Change Policy Yet From A Democratic Presidential Candidate 
91     This Is What People Promised They Would Do If Biden Won                                                            
96     There are new Harry Styles pics on the timeline and fans *Ariel* excited 🧜‍♀️ #TheBuzz                             
108    There is a petition calling for the @Browns to start Johnny Manziel.                                               
121    This is the legacy of 'movie monster' Jean-Paul Belmondo                                                           
131    This is the greatest thing on the internet right now.                                                              
154    You used 

All Caps feature:

Using part of speech tagging to identify all uppercase proper nouns with lengths of two or more characters in the sample, and excluding them in detection of the all caps feature

In [45]:
tag_list = []
def pos_tagging(txt):
        txt = str(txt)
        wordslist = nltk.word_tokenize(txt)
        tagged = nltk.pos_tag(wordslist)
            
        for tupl in tagged:
                if tupl[1] == "NNP":
                        tag_list.append(tupl[0])
                  
                else:
                    pass

In [46]:
AutomatedCA_sample['text'].apply(pos_tagging)
tag_list[50:]

['James',
 'OT',
 'Pacers',
 'October',
 'November',
 'USSF',
 'President',
 'Cindy',
 'Parlow',
 'Cone',
 'FIFA',
 '’',
 'World',
 'Cup',
 'Schools',
 'Newsom',
 'Bottom',
 'Mood',
 'ComeTogether',
 'Supreme',
 'Court',
 'Trump',
 'U.S.',
 'Andrew',
 'Yang',
 'Is',
 'Campaign',
 'Has',
 'Big',
 'City',
 'Guest',
 'Stars',
 'Totally',
 'Forgot',
 'Kabul',
 '’',
 'Afghan',
 'Kamala',
 'Harris',
 '’',
 'Friendship',
 'Joe',
 'Biden',
 '’',
 'Late',
 'Son',
 'Was',
 'Central',
 'Their',
 'First',
 'Joint',
 'Event',
 'USC',
 'Utah',
 'Isaiah',
 'Mobley',
 'DeVonta',
 'Smith',
 'Denver',
 'Eagles',
 'Broncos',
 'Best',
 'Hope',
 'Organizing',
 'A',
 'Pandemic',
 'Volunteers',
 'Feel',
 'Less',
 'Lonely',
 'Astros',
 '’',
 'Carlos',
 'Gomez',
 'KC',
 'NBA',
 'GM',
 'Anthony',
 'Davis',
 'Their',
 'Impeachment',
 'Inquiry',
 'Hearings',
 'Will',
 'Be',
 'Public',
 'Soon',
 'Supreme',
 'Court',
 'Mississippi',
 'Notre',
 'Dame',
 'Marcus',
 'Freeman',
 'Brian',
 'Kelly',
 'Trump',
 'Paul',
 '

In [47]:
tag_list_upper = [w for w in tag_list if w.isupper() and len(w) > 1]
tag_list_upper = list(set(tag_list_upper))
tag_list_upper[50:]

['HOMER',
 'REPEAT',
 'GWR',
 'N.Y.',
 'WORLD',
 'GQ',
 'VAL',
 'OG',
 'AS',
 'TRICK',
 'JANUARY',
 'NO',
 'UPS',
 'TD-INT',
 'LOL',
 'OVERLOAD',
 'RBI',
 'SAVAGEEEEEEEEE',
 'RTDNEWS',
 'SUCCEED',
 'PICK-SIX',
 'WINSLOW',
 'T.K',
 'NAJ_TV',
 'MCFC',
 'NOW',
 'NASA',
 'D.C',
 'AOC',
 'CAA',
 'TSLA',
 'KD',
 'C-17',
 'J.',
 'J.D',
 'COP26',
 'USAID',
 'SATC',
 'UNLV',
 'AGAIN',
 'FERNANDOGARIBAY',
 'RG3',
 'II',
 'CARD',
 'AP',
 'CSKA',
 'CONTENT',
 'CGI',
 'S.E',
 'FUCCBOI',
 'ADORABLE',
 'AFI',
 'TERRY',
 'IT',
 'MAXI',
 'KRISTAPS',
 'MEANWHILE',
 'S6',
 'FRISCO',
 'SEX',
 'NICKIMINAJ',
 'FINAL',
 'G-7',
 'P-R',
 'BRONCOS',
 'H1-B',
 'JAM',
 'KING',
 'OBJ',
 'DHS',
 'UCLA',
 'LP',
 'LOT',
 'NEVER',
 'ALERT🚨',
 'PARIS',
 'CHASE',
 'HE',
 'MAJOR',
 'DOE',
 'BNHR',
 'DAYS',
 'GILDED',
 'NBC',
 'SXSW',
 'GW',
 'NBD',
 'PCR',
 'RB',
 'JAYS',
 'GOT7',
 'HA',
 'PFAS',
 'KANSAS',
 'FOR',
 'LET',
 'OLY',
 'COUNTED',
 'ISIS-K',
 'LMA',
 'OPS',
 'ANWR',
 'KELLYROWLAND',
 'FSU',
 'WRECKED',
 'SING

In [48]:
def capitalized_detection(text):
    text = str(text)
    
    m = re.findall(r"\b[A-Z]{2,}\b", text)
    if m:
        if any(x in m for x in tag_list_upper):
                       return 0
             
        else: 
                       return 1
            
    else:
        return 0

In [49]:
AutomatedCA_sample['capitalized_feature'] = AutomatedCA_sample['text'].apply(capitalized_detection) 

The all caps feature is detected in 0.25% of the sample

In [50]:
AutomatedCA_sample['capitalized_feature'].value_counts(normalize=True)*100

0    99.746094
1    0.253906 
Name: capitalized_feature, dtype: float64

In [51]:
AutomatedCA_sample[AutomatedCA_sample['capitalized_feature'] == 1]['text'].head(10)

177     RILEY!!!                                                                                                                                                  
476     @doliver8 @Josh1Rivera @heyitsanika Hi Dav- I mean Olivia, we would love to get an exclusive on your red carpet look when the time comes. XOXO            
606     Donald Trump's son last week endorsed sending an AR-15 rifle to the acquitted shooter.                                                                    
1095    Under its first woman president, the AFL-CIO is pursuing big legislative goals in Washington while trying to turn around the decline in union membership. 
1148    @joga_newsletter what a LEGEND                                                                                                                            
2003    ‘This Was The XFL’: Examining Television’s Greatest Sports Flop Ever -  @30for30                                                                          
2830    The Florida go

Media feature

In [52]:
AutomatedCA_sample['media_type'].value_counts(normalize=True)*100

no media        81.898438
photo           13.746094
video           2.734375 
animated_gif    1.621094 
Name: media_type, dtype: float64

Creating dummy variable for media feature (1= photo, video and animated gif, 0=no media)

In [53]:
def media_feature(x):
        if x == 'no media':
            return 0
        else:
            return 1

In [54]:
AutomatedCA_sample['media_dummy'] = AutomatedCA_sample['media_type'].apply(media_feature) 

The media feature is detected in 18.10% of the sample

In [55]:
AutomatedCA_sample['media_dummy'].value_counts(normalize=True)*100

0    81.898438
1    18.101562
Name: media_dummy, dtype: float64

Analysis - RQ2

Creating dummy variable for outlet type (1=tweets published by online-native outlets, 0=tweets published by legacy outlets)

In [56]:
legacy_outlets = ['USA TODAY Politics', 'USA TODAY Sports', 'USA TODAY Life', 'L.A. Times Politics', 'L.A. Times Sports', 'LAT Entertainment']
regex_legacy_outlets = re.compile("(?=(" + "|".join(map(re.escape, legacy_outlets)) + "))")

In [57]:
def news_outlet_classification(text):
    text = str(text)
    m=re.search(regex_legacy_outlets, text)
    if m:
        return 0
    else:
        return 1

In [58]:
AutomatedCA_sample['outlet_type'] = AutomatedCA_sample['twitter_account'].apply(news_outlet_classification) 

In [59]:
AutomatedCA_sample['outlet_type'].value_counts()

1    12800
0    12800
Name: outlet_type, dtype: int64

Creating dummy variable for presence of sensationalism in tweets (1=sensationalist feature present, 0=sensationalist feature not present)

In [60]:
features_grid = AutomatedCA_sample[['hyperbolic_feature', 'slang_feature', 'listicle_feature', 'emoji_feature', 'question_feature', 'hashtag_feature', 'forwardref_feature', 'informalpunct_feature', 'capitalized_feature', 'media_dummy']]

In [61]:
features_grid = np.array(features_grid)
features_grid = features_grid.tolist()

In [62]:
any_row = [int(any(row)) for row in features_grid]

In [63]:
AutomatedCA_sample['combined_features_dummy'] = any_row

In [64]:
AutomatedCA_sample['combined_features_dummy'].value_counts()

0    16126
1    9474 
Name: combined_features_dummy, dtype: int64

Sensationalist feature/s present in 37.01% of tweets in sample

In [65]:
AutomatedCA_sample['combined_features_dummy'].value_counts(normalize=True)*100

0    62.992188
1    37.007812
Name: combined_features_dummy, dtype: float64

Creating count variable with number of features in each tweet

In [66]:
count_variable = np.sum(features_grid ,axis=1).tolist()

In [67]:
AutomatedCA_sample['count_variable'] = count_variable
AutomatedCA_sample['count_variable'].value_counts(normalize=True)*100

0    62.992188
1    27.171875
2    7.726562 
3    1.730469 
4    0.332031 
5    0.046875 
Name: count_variable, dtype: float64

Creating variable for channel type (1=tweets published in soft news sub-channels i.e sports,entertainment/life, 0=tweets published in hard news sub-channels i.e politics)

In [68]:
def hard_soft_classifications(x):
    x = str(x)
    if re.search('(Politics)', x):
        return 0
    else:
        return 1

In [69]:
AutomatedCA_sample['channel_type'] = AutomatedCA_sample['twitter_account'].apply(hard_soft_classifications) 

In [70]:
AutomatedCA_sample['channel_type'].value_counts()

1    12800
0    12800
Name: channel_type, dtype: int64

Creating dummy variable for tweet length

In [71]:
def length_tweet(x):
    x = str(x)
    return len(x)

In [72]:
AutomatedCA_sample['len_tweet'] = AutomatedCA_sample['text'].apply(length_tweet) 

Creating variables for engagement metrics per hundred thousand account followers

In [73]:
BuzzFeedPolitics_followers = int(140881)/int(100000)
BuzzFeedSports_followers = int(63060)/int(100000)
BuzzFeedEnt_followers = int(81725)/int(100000)
HuffPostPolitics_followers = int(1439630)/int(100000)
HuffPostSports_followers = int(62587)/int(100000)
HuffPostLife_followers = int(125749)/int(100000)
USATODAYPolitics_followers = int(215937)/int(100000)
USATODAYSports_followers = int(246780)/int(100000)
USATODAYLife_followers= int(85285)/int(100000)
LATimesPolitics_followers = int(12314)/int(100000)
LATimesSports_followers = int(50398)/int(100000)
LATEnt_followers = int(168665)/int(100000)

In [74]:
AutomatedCA_sample['followers_per100000'] = ''

In [75]:
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'BuzzFeed Politics'), 'followers_per100000'] = BuzzFeedPolitics_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'BuzzFeed Sports'), 'followers_per100000'] = BuzzFeedSports_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'BuzzFeed Arts & Entertainment'), 'followers_per100000'] = BuzzFeedEnt_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'HuffPost Politics'), 'followers_per100000'] = HuffPostPolitics_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'HuffPost Sports'), 'followers_per100000'] = HuffPostSports_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'HuffPost Life'), 'followers_per100000'] = HuffPostLife_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'USA TODAY Politics'), 'followers_per100000'] = USATODAYPolitics_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'USA TODAY Sports'), 'followers_per100000'] = USATODAYSports_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'USA TODAY Life'), 'followers_per100000'] = USATODAYLife_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'L.A. Times Politics'), 'followers_per100000'] = LATimesPolitics_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'L.A. Times Sports'), 'followers_per100000'] = LATimesSports_followers
AutomatedCA_sample.loc[(AutomatedCA_sample['twitter_account'] == 'LAT Entertainment'), 'followers_per100000'] = LATEnt_followers

In [76]:
AutomatedCA_sample['favorites_per100000'] = list(map(lambda x,y: x/y, AutomatedCA_sample['favorite_count'],AutomatedCA_sample['followers_per100000']))

In [77]:
AutomatedCA_sample['retweets_per100000'] = list(map(lambda x,y: x/y, AutomatedCA_sample['retweet_count'],AutomatedCA_sample['followers_per100000']))

Testing H1:

Running Logistic Regression using dummy variable for presence of sensationalist feature/s in tweets as DV and outlet type, channel type and tweet length as IVs (Table D1)

In [78]:
features_aca = ['outlet_type', 'len_tweet', 'channel_type']

In [79]:
logit_model = sm.Logit(AutomatedCA_sample['combined_features_dummy'], sm.add_constant(AutomatedCA_sample[features_aca]))
ACAreg_modelH1a = logit_model.fit()
print(ACAreg_modelH1a.summary())

Optimization terminated successfully.
         Current function value: 0.577510
         Iterations 5
                              Logit Regression Results                             
Dep. Variable:     combined_features_dummy   No. Observations:                25600
Model:                               Logit   Df Residuals:                    25596
Method:                                MLE   Df Model:                            3
Date:                     Wed, 02 Feb 2022   Pseudo R-squ.:                  0.1237
Time:                             14:21:27   Log-Likelihood:                -14784.
converged:                            True   LL-Null:                       -16870.
Covariance Type:                 nonrobust   LLR p-value:                     0.000
                   coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------
const           -2.0169      0.046    -44.119      0.000      -2

  return ptp(axis=axis, out=out, **kwargs)


OLS regression using count variable (aggregate of sensationalist features present in each tweet) (Table D1)

In [80]:
OLS_model =  sm.OLS(AutomatedCA_sample['count_variable'], sm.add_constant(AutomatedCA_sample[features_aca]))
ACAreg_modelh1b = OLS_model.fit()
print(ACAreg_modelh1b.summary())

                            OLS Regression Results                            
Dep. Variable:         count_variable   R-squared:                       0.160
Model:                            OLS   Adj. R-squared:                  0.159
Method:                 Least Squares   F-statistic:                     1620.
Date:                Wed, 02 Feb 2022   Prob (F-statistic):               0.00
Time:                        14:21:36   Log-Likelihood:                -26622.
No. Observations:               25600   AIC:                         5.325e+04
Df Residuals:                   25596   BIC:                         5.328e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.0636      0.013      4.944   

Testing H3

Running OLS Regression using favorite count (per hundred thousand followers) as DV and dummy variable for presence of sensationalist feature/s (main variable of interest) outlet type, channel type and tweet length as IVs (Table D2)

In [81]:
features_aca_h3a = ['combined_features_dummy','outlet_type','channel_type','len_tweet']

In [87]:
features_aca_h3b = ['count_variable','outlet_type','channel_type','len_tweet']

In [83]:
OLS_model2a =  sm.OLS(AutomatedCA_sample['favorites_per100000'], sm.add_constant(AutomatedCA_sample[features_aca_h3a]))

In [84]:
ACAreg_model2a = OLS_model2a.fit()
print(ACAreg_model2a.summary())

                             OLS Regression Results                            
Dep. Variable:     favorites_per100000   R-squared:                       0.001
Model:                             OLS   Adj. R-squared:                  0.000
Method:                  Least Squares   F-statistic:                     4.083
Date:                 Wed, 02 Feb 2022   Prob (F-statistic):            0.00261
Time:                         14:21:50   Log-Likelihood:            -1.8987e+05
No. Observations:                25600   AIC:                         3.798e+05
Df Residuals:                    25595   BIC:                         3.798e+05
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const           

Running OLS Regression using retweet count (per hundred thousand followers) as DV and dummy variable for presence of sensationalist feature/s (main variable of interest) outlet type, channel type and tweet length as IVs (Table D2)

In [85]:
OLS_model2b =  sm.OLS(AutomatedCA_sample['retweets_per100000'], sm.add_constant(AutomatedCA_sample[features_aca_h3a]))
ACAreg_model2b = OLS_model2b.fit()
print(ACAreg_model2b.summary())

                            OLS Regression Results                            
Dep. Variable:     retweets_per100000   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     22.54
Date:                Wed, 02 Feb 2022   Prob (F-statistic):           1.30e-18
Time:                        14:21:54   Log-Likelihood:            -2.1387e+05
No. Observations:               25600   AIC:                         4.277e+05
Df Residuals:                   25595   BIC:                         4.278e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const                     

Running OLS Regression using favorite count (per hundred thousand followers) as DV and count variable for number of sensationalist features (main variable of interest), outlet type, channel type and tweet length as IVs (Table D3)

In [88]:
OLS_model2c =  sm.OLS(AutomatedCA_sample['favorites_per100000'], sm.add_constant(AutomatedCA_sample[features_aca_h3b]))
ACAreg_model2c = OLS_model2c.fit()
print(ACAreg_model2c.summary())

                             OLS Regression Results                            
Dep. Variable:     favorites_per100000   R-squared:                       0.001
Model:                             OLS   Adj. R-squared:                  0.001
Method:                  Least Squares   F-statistic:                     4.475
Date:                 Wed, 02 Feb 2022   Prob (F-statistic):            0.00129
Time:                         14:22:24   Log-Likelihood:            -1.8987e+05
No. Observations:                25600   AIC:                         3.798e+05
Df Residuals:                    25595   BIC:                         3.798e+05
Df Model:                            4                                         
Covariance Type:             nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             34.4924      7.5

Running OLS Regression using retweet count (per hundred thousand followers) as DV and count variable for number of sensationalist features (main variable of interest), outlet type, channel type and tweet length as IVs (Table D3)

In [89]:
OLS_model2d =  sm.OLS(AutomatedCA_sample['retweets_per100000'], sm.add_constant(AutomatedCA_sample[features_aca_h3b]))
ACAreg_model2d = OLS_model2d.fit()
print(ACAreg_model2d.summary())

                            OLS Regression Results                            
Dep. Variable:     retweets_per100000   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.004
Method:                 Least Squares   F-statistic:                     23.82
Date:                Wed, 02 Feb 2022   Prob (F-statistic):           1.09e-19
Time:                        14:22:35   Log-Likelihood:            -2.1386e+05
No. Observations:               25600   AIC:                         4.277e+05
Df Residuals:                   25595   BIC:                         4.278e+05
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const             41.9679     19.329      2.

Running OLS Regression using favorite count (per hundred thousand followers) as DV and individual sensationalist features as IVs (Table D4)

In [90]:
aca_features_indiv = ['hyperbolic_feature','slang_feature','listicle_feature','emoji_feature','question_feature','hashtag_feature','informalpunct_feature','forwardref_feature','media_dummy','capitalized_feature']

In [92]:
OLS_model3a =  sm.OLS(AutomatedCA_sample['favorites_per100000'], sm.add_constant(AutomatedCA_sample[aca_features_indiv]))
ACAreg_modelH3a = OLS_model3a.fit()
print(ACAreg_modelH3a.summary())

                             OLS Regression Results                            
Dep. Variable:     favorites_per100000   R-squared:                       0.001
Model:                             OLS   Adj. R-squared:                  0.000
Method:                  Least Squares   F-statistic:                     1.451
Date:                 Wed, 02 Feb 2022   Prob (F-statistic):              0.151
Time:                         14:22:54   Log-Likelihood:            -1.8987e+05
No. Observations:                25600   AIC:                         3.798e+05
Df Residuals:                    25589   BIC:                         3.799e+05
Df Model:                           10                                         
Covariance Type:             nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const               

Running OLS Regression using retweet count (per hundred thousand followers) as DV and individual sensationalist features as IVs (Table D4)

In [93]:
OLS_model3b =  sm.OLS(AutomatedCA_sample['retweets_per100000'], sm.add_constant(AutomatedCA_sample[aca_features_indiv]))
ACAreg_modelH3b = OLS_model3b.fit()
print(ACAreg_modelH3b.summary())

                            OLS Regression Results                            
Dep. Variable:     retweets_per100000   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     6.266
Date:                Wed, 02 Feb 2022   Prob (F-statistic):           1.17e-09
Time:                        14:23:02   Log-Likelihood:            -2.1388e+05
No. Observations:               25600   AIC:                         4.278e+05
Df Residuals:                   25589   BIC:                         4.279e+05
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                    20.23