In [None]:
# This script extracts URLs and domains from links shared in tweets. It also identifies the most popular links.
# TODO: Combine with userdata to show number of followers, track spread over time (put into same format as CT?)

In [2]:
import matplotlib.pyplot as plt
import itertools
import json
import pandas as pd
import pymongo
import tldextract
from collections import Counter

In [3]:
client = pymongo.MongoClient()
db = client['GA_Runoffs']

In [4]:
# Return tweets based on a query, in this case tweets that use the hastag #HoldTheLine and that contain a URL
search_results = db.collected_tweets.find({"$and":[{"$text": {"$search":"#demcastga"}}, {"entities.urls":{'$exists': True}}]}, 
                                    {'entities.hashtags.tag':1,'category':1,'created_at':1, 'author_id':1, 'entities.urls.expanded_url':1,
                                    'public_metrics':1})

In [5]:
# Convert the data to a pandas dataframe
search_results_df = pd.DataFrame(list(search_results))

In [6]:
# How many tweets did we collect?
len(search_results_df)

2193

In [8]:
# Unpack lists of URLs

# Initialize empty lists of URLs and domains
search_results_df['urls'] = pd.np.empty((len(search_results_df),0)).tolist()
search_results_df['domains'] = pd.np.empty((len(search_results_df),0)).tolist()

search_results_df.head(2)


Unnamed: 0,_id,author_id,created_at,entities,public_metrics,category,urls,domains
0,5fee3fea88da48afa84f5653,1083771575348781056,2020-12-27T15:38:38.000Z,{'hashtags': [{'tag': 'GeorgiansVoteRevWarnock...,"{'like_count': 0, 'quote_count': 0, 'reply_cou...",demcast_ecosystem,[],[]
1,5fee3fe088da48afa84f54b5,1254891029179830272,2020-12-29T18:48:02.000Z,"{'hashtags': [{'tag': 'VoteOssoffWarnock'}, {'...","{'like_count': 3, 'quote_count': 1, 'reply_cou...",demcast_ecosystem,[],[]


In [9]:
# Unpack the URLs into a single list
for index, value in search_results_df['entities'].items():
    for item in value['urls']:
        search_results_df['urls'].iloc[index].append(item['expanded_url'])

In [10]:
# Inspect
search_results_df.tail(2)

Unnamed: 0,_id,author_id,created_at,entities,public_metrics,category,urls,domains
2191,5fecb32d88da48afa84ed837,2402915880,2020-12-24T08:56:08.000Z,"{'hashtags': [{'tag': 'ThankYou'}, {'tag': 'FB...","{'like_count': 4, 'quote_count': 0, 'reply_cou...",hashtag_search,[https://twitter.com/mjfree/status/13389266095...,[]
2192,5fecb24888da48afa84eb5f8,1225971763437330434,2020-12-30T15:03:26.000Z,"{'hashtags': [{'tag': 'Warnock'}, {'tag': 'GAp...","{'like_count': 5, 'quote_count': 0, 'reply_cou...",hashtag_search,[https://app.speechifai.tech/s/pc7Ep3VjxfpxQxv...,[]


In [11]:
# Use tldextract library to extract root domains
for index, value in search_results_df['urls'].items():
    domain_info = tldextract.extract(value[0])
    search_results_df['domains'].iloc[index].append(domain_info.registered_domain)

In [12]:
# Inspect
search_results_df.tail(2)

Unnamed: 0,_id,author_id,created_at,entities,public_metrics,category,urls,domains
2191,5fecb32d88da48afa84ed837,2402915880,2020-12-24T08:56:08.000Z,"{'hashtags': [{'tag': 'ThankYou'}, {'tag': 'FB...","{'like_count': 4, 'quote_count': 0, 'reply_cou...",hashtag_search,[https://twitter.com/mjfree/status/13389266095...,[twitter.com]
2192,5fecb24888da48afa84eb5f8,1225971763437330434,2020-12-30T15:03:26.000Z,"{'hashtags': [{'tag': 'Warnock'}, {'tag': 'GAp...","{'like_count': 5, 'quote_count': 0, 'reply_cou...",hashtag_search,[https://app.speechifai.tech/s/pc7Ep3VjxfpxQxv...,[speechifai.tech]


In [13]:
# Analysis one: count the most common domains
domains_list = search_results_df['domains'].to_list()
domains_list = [y for x in domains_list for y in x]
top_domains = dict(Counter(domains_list).items())
top_domains = sorted(top_domains.items(), key=lambda x: x[1], reverse=True)
top_domains[0:20]

[('twitter.com', 1453),
 ('demcastusa.com', 445),
 ('actblue.com', 35),
 ('Iwillvote.com', 30),
 ('mobilize.us', 29),
 ('youtube.com', 22),
 ('tiktok.com', 20),
 ('nytimes.com', 11),
 ('tinyurl.com', 11),
 ('cnn.com', 7),
 ('apple.news', 6),
 ('ajc.com', 5),
 ('iwillvote.com', 5),
 ('paper.li', 4),
 ('gaballotdropbox.org', 4),
 ('washingtonpost.com', 4),
 ('speechifai.tech', 4),
 ('georgiademocrat.org', 4),
 ('electjon.com', 4),
 ('motherjones.com', 3)]

In [14]:
# Analysis two: identify the most common individual links
url_list = search_results_df['urls'].to_list()
url_list = [y for x in url_list for y in x]
top_urls = dict(Counter(url_list).items())
top_urls = sorted(top_urls.items(), key=lambda x: x[1], reverse=True)
top_urls[0:20]

[('http://Iwillvote.com', 26),
 ('http://GASenate.com', 22),
 ('https://share.demcastusa.com/s/5l5GknosPu23FEHSVVYSdA', 16),
 ('https://share.demcastusa.com/s/0ZZgguMmx90um2fTJIKF0A', 16),
 ('https://twitter.com/NickKnudsenUS/status/1341035662229704706/video/1', 16),
 ('https://share.demcastusa.com/s/HZIQZYScpcCUWR_W8xc7Sg', 16),
 ('https://share.demcastusa.com/s/Vb5bbYuGNITdav13cbd4VA', 15),
 ('https://share.demcastusa.com/s/53iCQsgKm-x_g56WvaZM4A', 15),
 ('https://share.demcastusa.com/s/2pkamYvPylZz_ctlKF-tqg', 15),
 ('https://share.demcastusa.com/s/fdBSQeXc2axnOs2QYTK5zQ', 14),
 ('https://share.demcastusa.com/s/Ic8WHm6L4KeU8DmgRsJZJA', 14),
 ('https://share.demcastusa.com/s/NRScFKY8CJz7RMq2H-QrBA', 13),
 ('https://share.demcastusa.com/s/FTKFJti4UKA05P5g5-xzcg', 13),
 ('https://share.demcastusa.com/s/IOu9DNuzqBIA4t5w52NwAA', 13),
 ('https://share.demcastusa.com/s/Vrbobj02o_zLHRgjZxlFRA', 12),
 ('https://share.demcastusa.com/s/Fdmfg1Agwr4wh0r-meRtIQ', 12),
 ('https://share.demcastusa.

In [None]:
# People are not sharing much off-platform, apart from YouTube
# TODO: count percentage of tweets (and users) with off-platform sharing