# Commonly Shared URLs and Domains in #HoldTheLine

This script extracts URLs and domains from links shared in tweets. It also identifies the most popular links.

TODO: Combine with userdata to show number of followers, track spread over time (put into same format as CT?)

In [2]:
import pandas as pd
import pymongo
import tldextract
from collections import Counter

In [3]:
client = pymongo.MongoClient()
db = client['GA_Runoffs']

In [4]:
# Return tweets based on a query, in this case tweets that use the hastag #HoldTheLine and that contain a URL
search_results = db.collected_tweets.find({"$and":[{"$text": {"$search":"#holdtheline"}}, {"entities.urls":{'$exists': True}}]}, 
                                    {'entities.hashtags.tag':1,'category':1,'created_at':1, 'author_id':1, 'entities.urls.expanded_url':1,
                                    'public_metrics':1})

In [5]:
# Convert the data to a pandas dataframe
search_results_df = pd.DataFrame(list(search_results))

In [6]:
# How many tweets did we collect?
len(search_results_df)

11357

In [7]:
# Unpack lists of URLs

# Initialize empty lists of URLs and domains
search_results_df['urls'] = pd.np.empty((len(search_results_df),0)).tolist()
search_results_df['domains'] = pd.np.empty((len(search_results_df),0)).tolist()


In [8]:
# Unpack the URLs into a single list
for index, value in search_results_df['entities'].items():
    for item in value['urls']:
        search_results_df['urls'].iloc[index].append(item['expanded_url'])

In [9]:
# Use tldextract library to extract root domains
for index, value in search_results_df['urls'].items():
    domain_info = tldextract.extract(value[0])
    search_results_df['domains'].iloc[index].append(domain_info.registered_domain)

In [10]:
# Analysis one: count the most common domains

domains_list = search_results_df['domains'].to_list()
domains_list = [y for x in domains_list for y in x]
top_domains = dict(Counter(domains_list).items())
top_domains = sorted(top_domains.items(), key=lambda x: x[1], reverse=True)
top_domains[0:20]

[('twitter.com', 10183),
 ('youtu.be', 288),
 ('givesendgo.com', 197),
 ('thegatewaypundit.com', 50),
 ('youtube.com', 47),
 ('rumble.com', 23),
 ('instagram.com', 21),
 ('faxcongress.org', 20),
 ('buff.ly', 16),
 ('rappler.com', 15),
 ('foxnews.com', 13),
 ('got-freedom.org', 13),
 ('theepochtimes.com', 12),
 ('blabber.buzz', 12),
 ('TrumpMarch.com', 11),
 ('facebook.com', 11),
 ('parler.com', 10),
 ('mic.com', 10),
 ('standup-patriots.com', 10),
 ('gopvote.co', 10)]

In [11]:
# Analysis two: identify the most common individual links
url_list = search_results_df['urls'].to_list()
url_list = [y for x in url_list for y in x]
top_urls = dict(Counter(url_list).items())
top_urls = sorted(top_urls.items(), key=lambda x: x[1], reverse=True)
top_urls[0:20]

[('https://givesendgo.com/Truepatriots', 196),
 ('https://twitter.com/HawleyMO/status/1344307458085412867?s=20-', 77),
 ('https://twitter.com/40_head/status/1345667856344600576', 22),
 ('http://www.faxcongress.org', 20),
 ('https://twitter.com/realDonaldTrump/status/1342245390540804096', 18),
 ('https://twitter.com/anonpatriotq/status/1343238789146304513', 17),
 ('https://twitter.com/GenFlynn/status/1344825204321767424', 16),
 ('https://twitter.com/GenFlynn/status/1344825206054023168', 15),
 ('https://twitter.com/ZNeveri/status/1342995101149245440', 15),
 ('https://buff.ly/390Y5UZ', 15),
 ('https://twitter.com/realDonaldTrump/status/1343663159085834248', 14),
 ('https://got-freedom.org/evidence/', 13),
 ('https://twitter.com/GenFlynn/status/1344139002077728769', 12),
 ('http://TrumpMarch.com', 11),
 ('https://twitter.com/laurenboebert/status/1345217043566649345', 10),
 ('https://twitter.com/SidneyPowell1/status/1342671477771030530', 10),
 ('https://twitter.com/realDonaldTrump/status/13

In [12]:
# People are not sharing much off-platform, apart from YouTube
# TODO: count percentage of tweets (and users) with off-platform sharing