In [1]:
from notebook_utils import setup, load_tweet_df, load_media_df

setup()

In [2]:
tweet_df, recent_tweet_df = load_tweet_df()

tweet_df.info()

Loading 2696807 json lines
(4%): 100000 lines in ../data/14-nov/parsed_tweets.json processed (2.2798409461975098 sec)
(7%): 200000 lines in ../data/14-nov/parsed_tweets.json processed (2.299914836883545 sec)
(11%): 300000 lines in ../data/14-nov/parsed_tweets.json processed (2.6531641483306885 sec)
(15%): 400000 lines in ../data/14-nov/parsed_tweets.json processed (2.378187894821167 sec)
(19%): 500000 lines in ../data/14-nov/parsed_tweets.json processed (2.5499980449676514 sec)
(22%): 600000 lines in ../data/14-nov/parsed_tweets.json processed (2.8550190925598145 sec)
(26%): 700000 lines in ../data/14-nov/parsed_tweets.json processed (3.1365549564361572 sec)
(30%): 800000 lines in ../data/14-nov/parsed_tweets.json processed (1.7076022624969482 sec)
(33%): 900000 lines in ../data/14-nov/parsed_tweets.json processed (3.5984036922454834 sec)
(37%): 1000000 lines in ../data/14-nov/parsed_tweets.json processed (1.8618502616882324 sec)
(41%): 1100000 lines in ../data/14-nov/parsed_tweets.jso

# Top URLs in the dataset

In [87]:
from collections import defaultdict
import heapq
url_map = defaultdict(lambda: {
    "tweet_ids": set(),
    "aggregated_retweet_count": 0,
    "aggregated_quote_count": 0
})

total_tweet_count = 0
total_retweet_count = 0
total_quote_count = 0

for i, tweet_id, urls, retweet_count, quote_count in recent_tweet_df[["datastore_id", "urls", "retweet_count", "quote_count"]].itertuples():
    has_relevant_url = False
    for url in urls:
        if "twitter.com/" not in url:
            url = url.lower()
            has_relevant_url = True
            url_map[url]["tweet_ids"].add(tweet_id)
            url_map[url]["aggregated_retweet_count"] += retweet_count
            url_map[url]["aggregated_quote_count"] += quote_count
            total_retweet_count += retweet_count
            total_quote_count += quote_count
    if has_relevant_url:
        total_tweet_count += 1

In [88]:
print("Number of tweets with URLs (excluding twitter.com URLs): {:,}".format(total_tweet_count))
print("Unique URLs shared: {:,}".format(len(url_map.keys())))
print("URL share retweet count: {:,}".format(total_retweet_count))
print("URL share quote count: {:,}".format(total_quote_count))

Number of tweets with URLs (excluding twitter.com URLs): 214,666
Unique URLs shared: 58,529
URL share retweet count: 1,354,021
URL share quote count: 171,010


In [99]:
def top_urls_by_retweet_count(url_map, N = 10):
    for url in heapq.nlargest(N, url_map, key=lambda x: url_map.get(x)["aggregated_retweet_count"]):
        url_stats = url_map.get(url)
        tweet_count = len(url_stats["tweet_ids"])
        retweet_count = url_stats["aggregated_retweet_count"]
        print("{} retweets from {} tweets - {}".format(retweet_count, tweet_count, url))

def transform_url_map(url_map, filter_fn=lambda x: x, map_key=lambda x: x):
    new_map = {}
    for key, val in url_map.items():
        if filter_fn(key):
            mapped_key = map_key(key)
            if (mapped_key in new_map):
                existing_entry = new_map[mapped_key]
                existing_entry["tweet_ids"].update(val["tweet_ids"])
                existing_entry["aggregated_retweet_count"] += val["aggregated_retweet_count"]
                existing_entry["aggregated_quote_count"] += val["aggregated_quote_count"]
            else:
                new_map[mapped_key] = val

    return new_map

print("Top URLs in the dataset:")
top_urls_by_retweet_count(transform_url_map(url_map))


Top URLs in the dataset:
267885 retweets from 3106 tweets - https://thefederalist.com/2020/11/05/democrats-have-been-denying-trump-the-presidency-ever-since-his-first-victory/#.x6riz55se8q.twitter
157701 retweets from 26 tweets - https://hann.it/2uab4bp
153837 retweets from 3184 tweets - https://www.breitbart.com/2020-election/2020/11/04/report-trump-campaign-assembling-all-star-legal-team-election-challenges/
141187 retweets from 29623 tweets - https://youtu.be/p3ss8m1_3ke
134195 retweets from 7827 tweets - https://www.thegatewaypundit.com/2020/11/watch-suitcases-coolers-rolled-detroit-voting-center-4-brought-secure-counting-area/
132086 retweets from 1072 tweets - https://justthenews.com/politics-policy/elections/yes-america-there-voter-fraud-these-recent-cases-prove-it
94226 retweets from 2578 tweets - https://nypost.com/2020/08/29/political-insider-explains-voter-fraud-with-mail-in-ballots/
89205 retweets from 3965 tweets - https://www.nytimes.com/2020/11/04/technology/sharpies-bal

## Top Domains in the dataset

In [100]:
def map_to_domain(url):
    parsed = urlparse(url)

    return parsed.netloc.replace("www.", "")

domain_stats_map = transform_url_map(url_map, map_key=map_to_domain)
print("Unique domains in the dataset: {:,}".format(len(domain_url_map.keys())))
print("Top domains in the dataset:")
top_urls_by_retweet_count(domain_url_map)

Unique domains in the dataset: 7,276
Top domains in the dataset:
178590 retweets from 3106 tweets - thefederalist.com
106272 retweets from 26 tweets - hann.it
102689 retweets from 3184 tweets - breitbart.com
89470 retweets from 7826 tweets - thegatewaypundit.com
88141 retweets from 1070 tweets - justthenews.com
86985 retweets from 29623 tweets - youtu.be
67178 retweets from 2578 tweets - nypost.com
59475 retweets from 3965 tweets - nytimes.com
58170 retweets from 2599 tweets - zerohedge.com
57100 retweets from 219 tweets - ow.ly


## Top YouTube URLs in the dataset

In [101]:
from urllib.parse import parse_qs, urlencode, urlparse

deleted_url_params = set()

def detect_youtube_url(url):
    parsed = urlparse(url)
    return "youtu.be" in parsed.netloc or "youtube.com" in parsed.netloc

def normalize_youtube_url(url):
    parsed = urlparse(url)
    parsed = parsed._replace()
    query_params = parse_qs(parsed.query)
    if (parsed.path == '/watch' and "v" in query_params):
        updated_path = '/' + query_params["v"][0]
        del query_params["v"]
        parsed = parsed._replace(path=updated_path)
    deleted_url_params.update(query_params.keys())
    query_params = {}
    updated_query = urlencode(query_params, doseq=True)
    parsed = parsed._replace(scheme='https', netloc='youtu.be', query=updated_query)
    return parsed.geturl()

youtube_url_map = transform_url_map(
    url_map, 
    filter_fn=detect_youtube_url,
    map_key=normalize_youtube_url
)

print("Deleted URL params after normalizing Youtube URLs", deleted_url_params)

print()
print("Unique Youtube URLs in the dataset: {:,}".format(len(youtube_url_map.keys())))
print("Top Youtube URLs in the dataset:")
top_urls_by_retweet_count(youtube_url_map)

Deleted URL params after normalizing Youtube URLs {'has_verified', 'redir_token', 'attr_tag', 'bsft_mime_type', 'bsft_clkid', 'utm_source', 'utm_medium', 'bsft_utid', 'view_as', 'form', 'bsft_ek', 'ebc', 'bsft_aaid', 'utm_campaign', 'persist_app', 'reload', 'from', 'utm_content', 'pbjreload', 'amp;feature', 'ab_channel', 'pc', 'authuser', 'time_continue', 'fbclid', 'list', 'start_radio', 'bsft_mid', 'bsft_tv', 'lc', 'v', 'bsft_eid', 'bsft_lx', 'bsft_link_id', 'html_redirect', 'q', 'index', 'autoplay', 't', 'bsft_uid', 'd', 'app', 'feature', 'event', 'noapp', 'search_query'}

Unique Youtube URLs in the dataset: 5,530
Top Youtube URLs in the dataset:
202529 retweets from 29623 tweets - https://youtu.be/p3ss8m1_3ke
65284 retweets from 5997 tweets - https://youtu.be/9vwcjpbnuz0
15812 retweets from 30 tweets - https://youtu.be/byta1amljxy
14382 retweets from 336 tweets - https://youtu.be/96-bqaivopc
7325 retweets from 433 tweets - https://youtu.be/w7vkbipeyz4
5924 retweets from 1402 tweets 

# Top Media in the dataset

In [102]:
media_df = load_media_df()
media_df.info()

Loading 156311 json lines
(64%): 100000 lines in ../data/14-nov/parsed_media.json processed (0.5224850177764893 sec)
Done loading ../data/14-nov/parsed_media.json
156311 lines in ../data/14-nov/parsed_media.json processed (0.7744948863983154 sec)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156311 entries, 0 to 156310
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   media_url  156311 non-null  object
 1   tweet_id   156311 non-null  object
 2   media_id   156311 non-null  object
 3   type       156311 non-null  object
dtypes: object(4)
memory usage: 4.8+ MB


## Export to JSON

In [98]:
import json

def serialize_sets(obj):
    if isinstance(obj, set):
        return list(obj)

    return obj

with open("./data_export/url_stats/youtube_urls.json", "w", encoding="utf-8") as f:
    json.dump(youtube_url_map, f, sort_keys=True, indent=2, default=serialize_sets)

with open("./data_export/url_stats/domains.json", "w", encoding="utf-8") as f:
    json.dump(domain_stats_map, f, sort_keys=True, indent=2, default=serialize_sets)

with open("./data_export/url_stats/all_urls.json", "w", encoding="utf-8") as f:
    json.dump(url_map, f, sort_keys=True, indent=2, default=serialize_sets)