# Article Text --> Segment domains --> Vectors --> Clusters  --> Over time
JAN2021<br>
Richard Kuzma and Iain Cruickshank


In [1]:
import pandas as pd, numpy as np, pickle, time, re
from urllib.parse import urlparse
from matplotlib import pyplot as plt
from matplotlib.pyplot import figure

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/richardkuzma/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Load Tweeted Articles Data

In [2]:
# Directory and file name of the pickled DataFrame of tweeted articles.
DATA_PATH = '/Volumes/seagate_external_drive/anti_vax_embeddings/src/'
DF_NAME = 'Anti_vaxtweets_urls_articles_cleaned.pkl'

# NOTE(review): unpickling can execute arbitrary code -- only load files you trust.
with open(DATA_PATH + DF_NAME, 'rb') as pickle_file:
    df = pickle.load(pickle_file)

# Renumber the rows 0..n-1 and discard whatever index was stored in the pickle.
df = df.reset_index(drop=True)

# Lift pandas' display limits so every column and row of a frame is rendered.
pd.set_option('display.max_columns', None, 'display.max_rows', None)
df.head()

Unnamed: 0,created_at,id_str,entities,user,retweet_count,favorite_count,retweeted_status,Hash words,link,urls,first_url,url_total_retweets,url_total_favorites,article_text,article_text_preprocessed
0,2020-01-24 16:02:16+00:00,1.220739e+18,"{'hashtags': [{'text': 'vaccine', 'indices': [...","{'id': 1929460038, 'id_str': '1929460038', 'na...",3.0,8.0,,#vaccine #vaccine #coronavirus. #wuhan #davos,https://twitter.com/user/status/12207385956628...,[https://www.eurekalert.org/pub_releases/2020-...,https://www.eurekalert.org/pub_releases/2020-0...,3.0,8.0,EurekAlert! provides eligible reporters with ...,"[eurekalert, provides, eligible, reporters, fr..."
1,2020-01-24 16:52:18+00:00,1.220751e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 339740849, 'id_str': '339740849', 'name...",0.0,4.0,,No hashtags,https://twitter.com/user/status/12207511874791...,[https://www.cdc.gov/mmwr/volumes/68/wr/mm6809...,https://www.cdc.gov/mmwr/volumes/68/wr/mm6809a...,19.0,8.0,"Weekly / March 8, 2019 / 68(9);231–232\t Jud...","[weekly, march, judith, guzman, cottrill, chri..."
2,2020-01-24 16:50:00+00:00,1.220751e+18,"{'hashtags': [{'text': 'FightFlu', 'indices': ...","{'id': 753312146734346240, 'id_str': '75331214...",2.0,2.0,,#fightflu:,https://twitter.com/user/status/12207506089713...,[https://www.cdc.gov/flu/resource-center/index...,https://www.cdc.gov/flu/resource-center/index.htm,2.0,2.0,Seasonal Flu Vaccine Campaign Toolkit Find e...,"[seasonal, flu, vaccine, campaign, toolkit, fi..."
3,2020-01-24 16:40:00+00:00,1.220748e+18,"{'hashtags': [{'text': 'FluShotFriday', 'indic...","{'id': 753318075886338048, 'id_str': '75331807...",0.0,1.0,,#flushotfriday,https://twitter.com/user/status/12207480921835...,[https://go.usa.gov/xVn4Y],https://go.usa.gov/xVn4Y,0.0,1.0,exclamation square light icon Getting a flu v...,"[exclamation, square, light, icon, getting, fl..."
4,2020-01-24 16:30:43+00:00,1.220746e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 80857088, 'id_str': '80857088', 'name':...",18.0,38.0,,No hashtags,https://twitter.com/user/status/12207457581960...,[https://www.vaccinechoiceprayercommunity.org/...,https://www.vaccinechoiceprayercommunity.org/b...,18.0,38.0,\n\n\t\t1/20/2020\n\t\n \n\n\t\t6 Comments\n\...,"[comments, finally, someone, afraid, tell, tru..."


In [4]:
### Filter to only tweets within the specified date range
print('df original shape: {}'.format(df.shape))

# created_at is compared against plain date strings (it appears to hold
# timestamp strings -- a later cell slices it with x[:7]; TODO confirm).
in_date_range = (df['created_at'] > '2020-02-01') & (df['created_at'] < '2020-06-27')

# .copy() makes the filtered frame independent of the original, so the column
# assignments in later cells (e.g. df['domain'] = ...) write to a real frame
# instead of a slice and do not raise SettingWithCopyWarning.
df = df[in_date_range].copy()
print('Keep only tweets in following date range [1FEB2020 - 26JUN2020]')
print('df new shape: {}'.format(df.shape))

df original shape: (52784, 15)
Keep only tweets in following date range [1FEB2020 - 26JUN2020]
df new shape: (36948, 15)


<br><br><br>
### Track domains for labeling

In [5]:
# Network location (host) of each tweet's first URL, lower-cased.
df['domain'] = df['first_url'].apply(lambda x: urlparse(x).netloc.lower())

# Distinct domains and how many articles each one has. Both lists come from the
# same value_counts() result, so counts[i] belongs to domains[i].
domains = df['domain'].value_counts()
counts = domains.tolist()
domains = domains.index.tolist()

### make lists of domains to track
# NOTE(review): a later cell reads domain_info.csv from the JAN2021/ subfolder
# instead -- confirm both cells are meant to use the same file.
domain_info = pd.read_csv('/Volumes/seagate_external_drive/anti_vax_embeddings/src/domain_info.csv')
additional_dubious_domains = domain_info[domain_info['Bias'].isin(['fake-news', 'conspiracy'])].Domain.values.tolist()
additional_science_domains = domain_info[domain_info['Bias'].isin(['pro-science'])].Domain.values.tolist()
additional_news_domains = domain_info[domain_info['Rating'] == 'green'].Domain.values.tolist()

# Each *_look entry is a substring pattern tested against the domain.
news_look = ['www.nytimes.com', 'reuters.com', 'theguardian.com', 'bbc.com', 'www.bbc.co.uk', 'cbsnews', 'nbcnews', 'usatoday', 'axios', 'bostonglobe'] + additional_news_domains
# 'www.dailymail.co.uk' previously carried a trailing space, which no domain
# can contain, so that pattern never matched anything.
dubious_look = ['breitbart.com', 'oann.com', 'newsmax', 'drudge', 'blaze', 'nypost', 'www.rt.com', 'www.dailymail.co.uk'] + additional_dubious_domains
science_look = ['sciencemag', 'nature.com', 'cdc.gov', 'nih.gov', 'sciencedirect'] + additional_science_domains
govt_look = ['.gov']
not_any_look = news_look + dubious_look + science_look + govt_look


# find all those domains in the dataset

# Substring patterns per category, in priority order: a domain is assigned to
# the first category that has a pattern occurring inside the domain string.
category_patterns = [
    ('news', news_look),
    ('dubious', dubious_look),
    ('science', science_look),
    ('govt', govt_look),
]
category_names = [name for name, _ in category_patterns] + ['not_any']

domains_by_category = {name: [] for name in category_names}
articles_by_category = {name: 0 for name in category_names}

for domain, count in zip(domains, counts):
    # A domain matching none of the four pattern lists falls through to
    # 'not_any'. No extra test against not_any_look is needed: that list is
    # the concatenation of these four lists, so the test could never fail here.
    category = next(
        (name for name, patterns in category_patterns
         if any(pattern in domain for pattern in patterns)),
        'not_any',
    )
    domains_by_category[category].append(domain)
    articles_by_category[category] += count

# Expose the results under the module-level names that later cells use.
news, news_count = domains_by_category['news'], articles_by_category['news']
dubious, dubious_count = domains_by_category['dubious'], articles_by_category['dubious']
science, science_count = domains_by_category['science'], articles_by_category['science']
govt, govt_count = domains_by_category['govt'], articles_by_category['govt']
not_any, not_any_count = domains_by_category['not_any'], articles_by_category['not_any']

for name in ('news', 'dubious', 'science', 'govt', 'not_any'):
    print('{} num domains: {}. Num Articles: {}'.format(
        name, len(domains_by_category[name]), articles_by_category[name]))
    

news num domains: 101. Num Articles: 4251
dubious num domains: 207. Num Articles: 1836
science num domains: 118. Num Articles: 1478
govt num domains: 113. Num Articles: 259
not_any num domains: 8017. Num Articles: 29124


<br><br>
### Label the domains

In [6]:
def slice_df_by_domain(selected_list: list, df=df):
    """Return the rows of ``df`` whose domain contains any entry of ``selected_list``.

    Matching is by substring: a row is kept when at least one element of
    ``selected_list`` occurs inside that row's ``'domain'`` string.

    Args:
        selected_list: strings to look for inside each domain.
        df: frame with a ``'domain'`` column. The default is the module-level
            ``df`` as it existed when this function was defined, not the
            current one.

    Returns:
        The matching rows, selected by position from a copy of ``df``.
    """
    matching_positions = [
        position
        for position, domain in enumerate(df['domain'].tolist())
        if any(elem in domain for elem in selected_list)
    ]
    return df.copy().iloc[matching_positions]

def apply_labels_by_domain(selected_list: list, df=df):
    """Build a 0/1 label per row of ``df`` for membership in a domain group.

    A row is labelled 1 when at least one element of ``selected_list`` occurs
    as a substring of that row's ``'domain'`` value, otherwise 0. Because the
    match is by substring, a short entry also labels longer domains that
    merely contain it.

    Args:
        selected_list: strings to look for inside each domain.
        df: frame with a ``'domain'`` column. The default is the module-level
            ``df`` as it existed when this function was defined, not the
            current one -- pass ``df`` explicitly to label a different frame.

    Returns:
        A list of ints (0 or 1), one per row of ``df``, in row order.
    """
    # Many rows share the same domain, so run the substring scan once per
    # distinct domain and reuse the answer for every other row.
    label_for_domain = {}
    labels = []
    for domain in df['domain'].tolist():
        if domain not in label_for_domain:
            label_for_domain[domain] = int(any(elem in domain for elem in selected_list))
        labels.append(label_for_domain[domain])
    return labels


def add_labels_to_dataframe(groupings, group_titles, df=df):
    """Append one 0/1 indicator column per domain group to ``df``.

    Args:
        groupings: iterable of domain lists, one per group.
        group_titles: column name for each group, paired with ``groupings``
            by position.
        df: frame with a ``'domain'`` column. The default is the module-level
            ``df`` as it existed when this function was defined.

    Returns:
        A new frame: ``df`` with its index reset to 0..n-1, followed by one
        int column per title (1 where ``apply_labels_by_domain`` matches the
        row's domain against that group, else 0). The input frame is not
        modified.
    """
    # Work on a re-indexed copy so the caller's frame is left untouched and
    # the label columns line up with it row for row.
    labelled = df.reset_index(drop=True)

    # Label against the frame that was passed in (the helper's own default
    # refers to whatever `df` was when it was defined).
    label_columns = {
        title: apply_labels_by_domain(group, df=labelled)
        for group, title in zip(groupings, group_titles)
    }

    # Build all indicator columns at once and attach them with one concat.
    labels = pd.DataFrame(label_columns, index=labelled.index, dtype=int)
    return pd.concat([labelled, labels], axis=1)



groupings = [news, science, govt, dubious, not_any]
group_titles = ['news', 'science', 'govt', 'dubious', 'not_any']

df = add_labels_to_dataframe(groupings, group_titles, df)

# For every row, take the first indicator column (in group_titles order) that
# equals 1. Columns are selected by name rather than by position.
first_matching_group = (df[group_titles] == 1).idxmax(axis=1)

# Encode the group name as an integer. The category order is pinned to the
# alphabetical order of the titles (dubious=0, govt=1, news=2, not_any=3,
# science=4) so the codes stay the same even if a group has no rows.
group_dtype = pd.CategoricalDtype(categories=sorted(group_titles))
df['group'] = first_matching_group.astype(group_dtype).cat.codes
print(df.group.value_counts())

3    29124
2     4251
0     1836
4     1478
1      259
Name: group, dtype: int64


<br><br><br><br><br>
### Down-select to only known domains
- Includes only articles grouped into big domain topics (science, news, govt, dubious)
- Excludes unknown domains

In [7]:
### segment out the big domains groups from the rest of the articles
# df['group'] holds category codes assigned in alphabetical order of the group
# names (dubious=0, govt=1, news=2, not_any=3, science=4, provided all five
# groups occur), so code 3 marks the domains that matched no known list.
NOT_ANY_GROUP_CODE = 3

# .copy() so that later cells can add columns without SettingWithCopyWarning.
df = df[df['group'] != NOT_ANY_GROUP_CODE].copy()
print(df.shape)
df.head()

(7824, 22)


Unnamed: 0,created_at,id_str,entities,user,retweet_count,favorite_count,retweeted_status,Hash words,link,urls,first_url,url_total_retweets,url_total_favorites,article_text,article_text_preprocessed,domain,news,science,govt,dubious,not_any,group
0,2020-02-29 23:17:56+00:00,1.233894e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 1172275165524385798, 'id_str': '1172275...",0.0,0.0,,No hashtags,https://twitter.com/user/status/12338942013953...,[https://nypost.com/2020/02/28/israeli-scienti...,https://nypost.com/2020/02/28/israeli-scientis...,1414.0,111.0,Thanks for contacting us. We've received your...,"[thanks, contacting, us, received, submission,...",nypost.com,0,0,0,1,0,0
5,2020-02-29 23:39:43+00:00,1.2339e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 465592070, 'id_str': '465592070', 'name...",0.0,0.0,,No hashtags,https://twitter.com/user/status/12338996799240...,[https://nypost.com/2020/02/28/israeli-scienti...,https://nypost.com/2020/02/28/israeli-scientis...,310.0,1055.0,"\nNews\n By Yaron Steinbuch \n\tFebruary 28, ...","[news, yaron, steinbuch, february, amid, fears...",nypost.com,0,0,0,1,0,0
6,2020-02-29 23:40:00+00:00,1.2339e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 23081056, 'id_str': '23081056', 'name':...",328.0,0.0,{'created_at': 'Thu Feb 27 23:27:42 +0000 2020...,No hashtags,https://twitter.com/user/status/12338997538151...,[https://time.com/5790545/first-covid-19-vacci...,https://time.com/5790545/first-covid-19-vaccine/,73061.0,2115.0,"Moderna Therapeutics, a biotech company based...","[moderna, therapeutics, biotech, company, base...",time.com,1,0,0,0,0,2
7,2020-02-29 23:41:55+00:00,1.2339e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 1090679949256810498, 'id_str': '1090679...",0.0,0.0,,No hashtags,https://twitter.com/user/status/12339002367681...,[https://www.reuters.com/article/us-china-heal...,https://www.reuters.com/article/us-china-healt...,0.0,0.0,"Discover Thomson Reuters By David Stanway, Jo...","[discover, thomson, reuters, david, stanway, j...",www.reuters.com,1,0,0,0,0,2
11,2020-02-29 23:56:36+00:00,1.233904e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 44376499, 'id_str': '44376499', 'name':...",0.0,0.0,,No hashtags,https://twitter.com/user/status/12339039325037...,[https://www.nytimes.com/2020/02/27/opinion/co...,https://www.nytimes.com/2020/02/27/opinion/cor...,48.0,71.0,Advertisement Supported by We need to stop wh...,"[advertisement, supported, need, stop, drives,...",www.nytimes.com,1,0,0,0,1,2


<br><br>
### Label times (month, week)

In [8]:

import datetime
def extract_week_num(date_string):
    """Return the ISO-8601 week number of a date.

    Args:
        date_string: either a string whose first ten characters are
            ``YYYY-mm-dd`` (anything after that, such as a time or UTC
            offset, is ignored), or a ``datetime.date`` / ``datetime.datetime``
            instance (including subclasses).

    Returns:
        The ISO week number as an int. Days around New Year can belong to
        the last week of the previous ISO year or the first week of the next.

    Raises:
        ValueError: if a string argument does not start with a valid
            ``YYYY-mm-dd`` date.
    """
    # Date-like objects already know their ISO calendar; no parsing needed.
    if isinstance(date_string, datetime.date):
        return date_string.isocalendar()[1]

    day = datetime.datetime.strptime(date_string[:10], "%Y-%m-%d").date()
    return day.isocalendar()[1]  # isocalendar() -> (ISO year, ISO week, ISO weekday)

# First seven characters of the created_at value, i.e. its 'YYYY-mm' prefix.
df['month'] = df['created_at'].map(lambda stamp: stamp[:7])
# Week number of each created_at value, as computed by extract_week_num.
df['week_num'] = df['created_at'].map(extract_week_num)


<br><br><br>
### Label Partisan Content

In [9]:
src_path = '/Volumes/seagate_external_drive/anti_vax_embeddings/src/JAN2021/'
load_name = 'domain_info.csv'

# Per-domain ratings table; the columns used below are 'Domain' and 'Bias'.
domain_info = pd.read_csv(src_path + load_name)

print(domain_info.shape)
print(domain_info['Bias'].value_counts())
print()
print(domain_info.head())

# label domains by partisan lean (left, center, right)
bias_label = domain_info['Bias']
right_lean_domains = domain_info.loc[bias_label.isin(['right', 'right-center']), 'Domain'].tolist()
left_lean_domains = domain_info.loc[bias_label.isin(['left', 'left-center']), 'Domain'].tolist()
center_lean_domains = domain_info.loc[bias_label == 'center', 'Domain'].tolist()
fake_news_domains = domain_info.loc[bias_label == 'fake-news', 'Domain'].tolist()
conspiracy_domains = domain_info.loc[bias_label == 'conspiracy', 'Domain'].tolist()

# Every domain that carries one of the five bias labels above.
any_lean_domains = [
    *right_lean_domains,
    *left_lean_domains,
    *center_lean_domains,
    *fake_news_domains,
    *conspiracy_domains,
]

def label_partisan(df):
    """Add partisan-lean indicator columns to ``df`` based on its domains.

    Reads the ``'domain'`` column and the module-level lists
    ``fake_news_domains``, ``conspiracy_domains``, ``left_lean_domains``,
    ``center_lean_domains``, ``right_lean_domains`` and ``any_lean_domains``.

    Columns added (in this order):
        no_www: the domain with a leading ``'www.'`` removed.
        fake_news, conspiracy, lean_left, lean_center, lean_right: 1 when
            ``no_www`` is exactly one of the entries of the matching list,
            else 0.
        lean_not_known: 1 when ``no_www`` is not in ``any_lean_domains``.

    Args:
        df: frame with a string ``'domain'`` column. It is modified in place.

    Returns:
        The same ``df`` object, with the columns above added.
    """
    # Strip only a literal leading "www." -- the dot is escaped and the match
    # is anchored, so hosts such as "www2.example.com" or "wwwx.org" keep
    # their full name instead of losing their first four characters.
    df['no_www'] = df['domain'].apply(lambda x: re.sub(r"^www\.", "", x))

    flag_sources = [
        ('fake_news', fake_news_domains),
        ('conspiracy', conspiracy_domains),
        ('lean_left', left_lean_domains),
        ('lean_center', center_lean_domains),
        ('lean_right', right_lean_domains),
    ]
    for column, listed_domains in flag_sources:
        df[column] = df['no_www'].isin(listed_domains).astype(int)

    df['lean_not_known'] = (~df['no_www'].isin(any_lean_domains)).astype(int)

    return df

df = label_partisan(df)

# For every row, take the first bias flag (in this order) that equals 1.
bias_columns = ['fake_news', 'conspiracy', 'lean_left', 'lean_center', 'lean_right', 'lean_not_known']
first_matching_bias = (df[bias_columns] == 1).idxmax(axis=1)

# Encode the flag name as an integer. The category order is pinned to the
# alphabetical order of the flag names (conspiracy=0, fake_news=1,
# lean_center=2, lean_left=3, lean_not_known=4, lean_right=5) so the codes do
# not shift when one of the flags never occurs in the data.
bias_dtype = pd.CategoricalDtype(categories=sorted(bias_columns))
df['bias'] = first_matching_bias.astype(bias_dtype).cat.codes
df = df.reset_index(drop=True)
print(df.shape)
df.head()


(3037, 4)
fake-news       533
left-center     494
center          432
left            333
conspiracy      299
right           279
unknown         277
right-center    249
pro-science     140
black             1
Name: Bias, dtype: int64

                Domain   Rating       Bias Factual_Rating
0  100percentfedup.com      red  fake-news       very low
1           10news.one  unknown  fake-news       very low
2        12minutos.com    black      black          black
3        180report.com  unknown     center          mixed
4     20minutenews.com    black    unknown        unknown
(7824, 32)


Unnamed: 0,created_at,id_str,entities,user,retweet_count,favorite_count,retweeted_status,Hash words,link,urls,first_url,url_total_retweets,url_total_favorites,article_text,article_text_preprocessed,domain,news,science,govt,dubious,not_any,group,month,week_num,no_www,fake_news,conspiracy,lean_left,lean_center,lean_right,lean_not_known,bias
0,2020-02-29 23:17:56+00:00,1.233894e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 1172275165524385798, 'id_str': '1172275...",0.0,0.0,,No hashtags,https://twitter.com/user/status/12338942013953...,[https://nypost.com/2020/02/28/israeli-scienti...,https://nypost.com/2020/02/28/israeli-scientis...,1414.0,111.0,Thanks for contacting us. We've received your...,"[thanks, contacting, us, received, submission,...",nypost.com,0,0,0,1,0,0,2020-02,9,nypost.com,0,0,0,0,1,0,5
1,2020-02-29 23:39:43+00:00,1.2339e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 465592070, 'id_str': '465592070', 'name...",0.0,0.0,,No hashtags,https://twitter.com/user/status/12338996799240...,[https://nypost.com/2020/02/28/israeli-scienti...,https://nypost.com/2020/02/28/israeli-scientis...,310.0,1055.0,"\nNews\n By Yaron Steinbuch \n\tFebruary 28, ...","[news, yaron, steinbuch, february, amid, fears...",nypost.com,0,0,0,1,0,0,2020-02,9,nypost.com,0,0,0,0,1,0,5
2,2020-02-29 23:40:00+00:00,1.2339e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 23081056, 'id_str': '23081056', 'name':...",328.0,0.0,{'created_at': 'Thu Feb 27 23:27:42 +0000 2020...,No hashtags,https://twitter.com/user/status/12338997538151...,[https://time.com/5790545/first-covid-19-vacci...,https://time.com/5790545/first-covid-19-vaccine/,73061.0,2115.0,"Moderna Therapeutics, a biotech company based...","[moderna, therapeutics, biotech, company, base...",time.com,1,0,0,0,0,2,2020-02,9,time.com,0,0,1,0,0,0,3
3,2020-02-29 23:41:55+00:00,1.2339e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 1090679949256810498, 'id_str': '1090679...",0.0,0.0,,No hashtags,https://twitter.com/user/status/12339002367681...,[https://www.reuters.com/article/us-china-heal...,https://www.reuters.com/article/us-china-healt...,0.0,0.0,"Discover Thomson Reuters By David Stanway, Jo...","[discover, thomson, reuters, david, stanway, j...",www.reuters.com,1,0,0,0,0,2,2020-02,9,reuters.com,0,0,0,1,0,0,2
4,2020-02-29 23:56:36+00:00,1.233904e+18,"{'hashtags': [], 'symbols': [], 'user_mentions...","{'id': 44376499, 'id_str': '44376499', 'name':...",0.0,0.0,,No hashtags,https://twitter.com/user/status/12339039325037...,[https://www.nytimes.com/2020/02/27/opinion/co...,https://www.nytimes.com/2020/02/27/opinion/cor...,48.0,71.0,Advertisement Supported by We need to stop wh...,"[advertisement, supported, need, stop, drives,...",www.nytimes.com,1,0,0,0,1,2,2020-02,9,nytimes.com,0,0,1,0,0,0,3


<br><br><br>
### Filtering Out Words from Articles

In [10]:
### remove numbers and symbols
# Join each token list into one space-separated string. Values that are
# already strings are left alone, so re-running this cell does not join a
# string character by character (which the length filter below would then
# erase entirely).
df['article_text_preprocessed'] = df['article_text_preprocessed'].map(lambda x: x if isinstance(x, str) else " ".join(x))
df['article_text_preprocessed'] = df['article_text_preprocessed'].map(lambda x: re.sub(r"[^A-Za-z ]", "", x))

### remove words with fewer than 3 letters
df['article_text_preprocessed'] = df['article_text_preprocessed'].apply(lambda x: ' '.join([word for word in x.split() if len(word) >2]))


### remove common words that add noise
# Some entries repeat within and across these lists; that is harmless because
# they are only used for membership tests.
common_virus_words = ['vaccine', 'vaccines', 'coronavirus', 'covid', 'cov', 'sars', 'virus', 'health']
webpage_words = ['cache', 'url', 'feeds', 'address', 'ip', 'hostname', 'dns', 'cmu', 'ece', 'proxy',
                 'administrator', 'retrieve', 'edu', 'server', 'gmt', 'com', 'resolve', 'encountered',
                 'root', 'http', 'error', 'host', 'cookies', 'trademark', 'contributors', 'ads',
                 'registered', 'agree', 'use', 'copyright', 'details', 'topnews', 'healthnews', 'help',
                 'support', 'update', 'website', 'content', 'google', 'fonts', 'tracking', 'youtube',
                 'terms', 'conditions', 'see', 'complete', 'list', 'submit', 'news', 'supportterms',
                 'contacting', 'contact', 'thanks', 'submission', 'received', 'frequently', 'asked',
                 'published', 'linkedin', 'look', 'pmc', 'video', 'www', 'gov', 'email', 'pandemic',
                 'https', 'comments', 'sitemap', 'reserved', 'tue', 'mon', 'sun', 'sat', 'wed', 'dec',
                 'iduskbn', 'idinkbn', 'rata', 'www', 'gov', 'email', 'video', 'cart', 'service', 'online',
                 'far', 'editing', 'ask', 'said', 'says', 'monday', 'tuesday', 'wednesday', 'thursday', 'friday',
                 'saturday', 'sunday', 'advertisement', 'minutes', 'reporting', 'site', 'web', 'want',
                 'pinterest', 'href', 'permalink', 'squid', 'timestamp']

news_words = ['news', 'thomson', 'reuters', 'bbc', 'pmc', 'topnews', 'fox', 'nyp', 'newsmax', 'oann',
              'breitbart', 'cnn', 'nbc', 'wsj', 'nyt', 'msnbc', 'foxnews', 'rt', 'zerohedge', 'naturalnews',
              'companynews']

stop_words = stopwords.words('english')

words_to_remove = common_virus_words + webpage_words + news_words + stop_words

# Membership is tested once per word of every article; a set makes each test
# constant-time instead of a scan through the whole list.
words_to_remove_lookup = set(words_to_remove)

df['article_text_preprocessed'] = df['article_text_preprocessed'].apply(lambda x: ' '.join([word for word in x.split() if word not in words_to_remove_lookup]))


<br><br><br><br>
## Save 7824 tweets with labels

In [17]:
# Show the (rows, columns) of the labelled frame before it is written to disk.
print(df.shape)

(7824, 32)


In [18]:
# Renumber the rows 0..n-1 so the saved frame carries a clean index.
df = df.reset_index(drop=True)

save_path = '/Volumes/seagate_external_drive/anti_vax_embeddings/src/JAN2021/'
trimmed_name = '7800_tweets_known_domains_with_metadata.pkl'

# Serialize the labelled DataFrame with pickle.
with open(save_path + trimmed_name, 'wb') as out_file:
    pickle.dump(df, out_file)
