In [None]:
# This script extracts URLs and domains from links shared in tweets. It also identifies the most popular links.
# TODO: Combine with userdata to show number of followers, track spread over time (put into same format as CT?)

In [1]:
import matplotlib.pyplot as plt
import itertools
import json
import pandas as pd
import pymongo
import tldextract
from collections import Counter

In [2]:
client = pymongo.MongoClient()
db = client['GA_Runoffs']

In [6]:
# Return tweets based on a query, in this case tweets that use the hastag #HoldTheLine and that contain a URL
search_results = db.collected_tweets.find({"$and":[{"$text": {"$search":"#demcastga"}}, {"entities.urls":{'$exists': True}}]}, 
                                    {'entities.hashtags.tag':1,'category':1,'created_at':1, 'author_id':1, 'entities.urls.expanded_url':1,
                                    'public_metrics':1})

In [7]:
# Convert the data to a pandas dataframe
search_results_df = pd.DataFrame(list(search_results))

In [8]:
# How many tweets did we collect?
len(search_results_df)

2056

In [9]:
# Unpack lists of URLs

# Initialize empty lists of URLs and domains
search_results_df['urls'] = pd.np.empty((len(search_results_df),0)).tolist()
search_results_df['domains'] = pd.np.empty((len(search_results_df),0)).tolist()

search_results_df.head(5)


Unnamed: 0,_id,author_id,created_at,entities,public_metrics,category,urls,domains
0,5fee3fea88da48afa84f5653,1083771575348781056,2020-12-27T15:38:38.000Z,{'hashtags': [{'tag': 'GeorgiansVoteRevWarnock...,"{'like_count': 0, 'quote_count': 0, 'reply_cou...",demcast_ecosystem,[],[]
1,5fee3fe088da48afa84f54b5,1254891029179830272,2020-12-29T18:48:02.000Z,"{'hashtags': [{'tag': 'VoteOssoffWarnock'}, {'...","{'like_count': 3, 'quote_count': 1, 'reply_cou...",demcast_ecosystem,[],[]
2,5ff36ebe93c05a63bd7a6837,3312487501,2021-01-01T02:56:51.000Z,"{'hashtags': [{'tag': 'DemCastGA'}], 'urls': [...","{'like_count': 2, 'quote_count': 0, 'reply_cou...",demcast_ecosystem,[],[]
3,5ff36ebe93c05a63bd7a6813,384456466,2021-01-01T17:07:27.000Z,"{'hashtags': [{'tag': 'DemCastGA'}], 'urls': [...","{'like_count': 0, 'quote_count': 0, 'reply_cou...",demcast_ecosystem,[],[]
4,5ff36ebb93c05a63bd7a6804,384456466,2021-01-01T17:51:55.000Z,"{'hashtags': [{'tag': 'DemCastGA'}], 'urls': [...","{'like_count': 0, 'quote_count': 0, 'reply_cou...",demcast_ecosystem,[],[]


In [10]:
# Unpack the URLs into a single list
for index, value in search_results_df['entities'].items():
    for item in value['urls']:
        search_results_df['urls'].iloc[index].append(item['expanded_url'])

In [11]:
# Inspect
search_results_df.tail(5)

Unnamed: 0,_id,author_id,created_at,entities,public_metrics,category,urls,domains
2051,5fee3fe588da48afa84f5583,1225971763437330434,2020-12-28T19:42:32.000Z,"{'hashtags': [{'tag': 'Loeffler'}, {'tag': 'Se...","{'like_count': 15, 'quote_count': 1, 'reply_co...",demcast_ecosystem,[https://twitter.com/BostonJudy3/status/134364...,[]
2052,5fee3fd088da48afa84f521f,1225971763437330434,2020-12-31T21:00:43.000Z,"{'hashtags': [{'tag': 'McConnell'}, {'tag': 'a...","{'like_count': 0, 'quote_count': 0, 'reply_cou...",demcast_ecosystem,[https://twitter.com/eileenvan55/status/134469...,[]
2053,5fee3fd888da48afa84f53a6,1225971763437330434,2020-12-30T16:56:49.000Z,"{'hashtags': [{'tag': 'Georgia'}, {'tag': 'Per...","{'like_count': 5, 'quote_count': 0, 'reply_cou...",demcast_ecosystem,[https://twitter.com/BostonJudy3/status/134432...,[]
2054,5fecb32d88da48afa84ed837,2402915880,2020-12-24T08:56:08.000Z,"{'hashtags': [{'tag': 'ThankYou'}, {'tag': 'FB...","{'like_count': 4, 'quote_count': 0, 'reply_cou...",hashtag_search,[https://twitter.com/mjfree/status/13389266095...,[]
2055,5fecb24888da48afa84eb5f8,1225971763437330434,2020-12-30T15:03:26.000Z,"{'hashtags': [{'tag': 'Warnock'}, {'tag': 'GAp...","{'like_count': 5, 'quote_count': 0, 'reply_cou...",hashtag_search,[https://app.speechifai.tech/s/pc7Ep3VjxfpxQxv...,[]


In [12]:
# Use tldextract library to extract root domains
for index, value in search_results_df['urls'].items():
    domain_info = tldextract.extract(value[0])
    search_results_df['domains'].iloc[index].append(domain_info.registered_domain)

In [13]:
# Inspect
search_results_df.tail(5)

Unnamed: 0,_id,author_id,created_at,entities,public_metrics,category,urls,domains
2051,5fee3fe588da48afa84f5583,1225971763437330434,2020-12-28T19:42:32.000Z,"{'hashtags': [{'tag': 'Loeffler'}, {'tag': 'Se...","{'like_count': 15, 'quote_count': 1, 'reply_co...",demcast_ecosystem,[https://twitter.com/BostonJudy3/status/134364...,[twitter.com]
2052,5fee3fd088da48afa84f521f,1225971763437330434,2020-12-31T21:00:43.000Z,"{'hashtags': [{'tag': 'McConnell'}, {'tag': 'a...","{'like_count': 0, 'quote_count': 0, 'reply_cou...",demcast_ecosystem,[https://twitter.com/eileenvan55/status/134469...,[twitter.com]
2053,5fee3fd888da48afa84f53a6,1225971763437330434,2020-12-30T16:56:49.000Z,"{'hashtags': [{'tag': 'Georgia'}, {'tag': 'Per...","{'like_count': 5, 'quote_count': 0, 'reply_cou...",demcast_ecosystem,[https://twitter.com/BostonJudy3/status/134432...,[twitter.com]
2054,5fecb32d88da48afa84ed837,2402915880,2020-12-24T08:56:08.000Z,"{'hashtags': [{'tag': 'ThankYou'}, {'tag': 'FB...","{'like_count': 4, 'quote_count': 0, 'reply_cou...",hashtag_search,[https://twitter.com/mjfree/status/13389266095...,[twitter.com]
2055,5fecb24888da48afa84eb5f8,1225971763437330434,2020-12-30T15:03:26.000Z,"{'hashtags': [{'tag': 'Warnock'}, {'tag': 'GAp...","{'like_count': 5, 'quote_count': 0, 'reply_cou...",hashtag_search,[https://app.speechifai.tech/s/pc7Ep3VjxfpxQxv...,[speechifai.tech]


In [14]:
# Analysis one: count the most common domains
domains_list = search_results_df['domains'].to_list()
domains_list = [y for x in domains_list for y in x]
top_domains = dict(Counter(domains_list).items())
top_domains = sorted(top_domains.items(), key=lambda x: x[1], reverse=True)
top_domains

[('twitter.com', 1398),
 ('demcastusa.com', 374),
 ('actblue.com', 35),
 ('Iwillvote.com', 29),
 ('mobilize.us', 28),
 ('tiktok.com', 20),
 ('youtube.com', 20),
 ('tinyurl.com', 11),
 ('nytimes.com', 10),
 ('cnn.com', 7),
 ('apple.news', 6),
 ('ajc.com', 5),
 ('iwillvote.com', 5),
 ('paper.li', 4),
 ('gaballotdropbox.org', 4),
 ('washingtonpost.com', 4),
 ('georgiademocrat.org', 4),
 ('electjon.com', 4),
 ('motherjones.com', 3),
 ('google.com', 3),
 ('youtu.be', 3),
 ('bloomberg.com', 3),
 ('speechifai.tech', 3),
 ('11alive.com', 3),
 ('boingboing.net', 3),
 ('eventbrite.com', 2),
 ('warnockforgeorgia.com', 2),
 ('reuters.com', 2),
 ('cbsnews.com', 2),
 ('salon.com', 2),
 ('baltimoresun.com', 2),
 ('actionnetwork.org', 2),
 ('thedailybeast.com', 2),
 ('theintercept.com', 2),
 ('zoom.us', 2),
 ('ga.gov', 2),
 ('politico.com', 2),
 ('wikipedia.org', 2),
 ('ElectJon.com', 1),
 ('nypost.com', 1),
 ('thehill.com', 1),
 ('bit.ly', 1),
 ('newsbreakapp.com', 1),
 ('ballottrax.net', 1),
 ('theg

In [15]:
# Analysis two: identify the most common individual links
url_list = search_results_df['urls'].to_list()
url_list = [y for x in url_list for y in x]
top_urls = dict(Counter(url_list).items())
top_urls = sorted(top_urls.items(), key=lambda x: x[1], reverse=True)
top_urls

[('http://Iwillvote.com', 26),
 ('http://GASenate.com', 20),
 ('https://share.demcastusa.com/s/5l5GknosPu23FEHSVVYSdA', 15),
 ('https://twitter.com/NickKnudsenUS/status/1341035662229704706/video/1', 14),
 ('https://share.demcastusa.com/s/fdBSQeXc2axnOs2QYTK5zQ', 13),
 ('https://share.demcastusa.com/s/0ZZgguMmx90um2fTJIKF0A', 13),
 ('https://share.demcastusa.com/s/Vb5bbYuGNITdav13cbd4VA', 13),
 ('https://share.demcastusa.com/s/53iCQsgKm-x_g56WvaZM4A', 13),
 ('https://share.demcastusa.com/s/2pkamYvPylZz_ctlKF-tqg', 13),
 ('https://share.demcastusa.com/s/HZIQZYScpcCUWR_W8xc7Sg', 13),
 ('https://share.demcastusa.com/s/Ic8WHm6L4KeU8DmgRsJZJA', 12),
 ('https://share.demcastusa.com/s/Fdmfg1Agwr4wh0r-meRtIQ', 11),
 ('https://share.demcastusa.com/s/NRScFKY8CJz7RMq2H-QrBA', 11),
 ('https://share.demcastusa.com/s/pflSpxPXdJB7Ut-70SybRg', 11),
 ('https://mobilize.us/envoysforhumanity/event/362988/', 11),
 ('http://secure.actblue.com', 11),
 ('https://gaballotdropbox.org/', 10),
 ('https://share.de

In [None]:
# People are not sharing much off-platform, apart from YouTube
# TODO: count percentage of tweets (and users) with off-platform sharing