In [2]:
# Project-local helpers: `setup` is invoked right below, `load_tweet_df` in the next cell.
from notebook_utils import setup, load_tweet_df

# NOTE(review): setup() lives in notebook_utils and its body is not visible here —
# presumably it prepares the notebook environment; confirm its side effects there.
setup()

In [3]:
# load_tweet_df() returns two objects: the full tweet table and a "recent" one.
# Later cells index `recent_tweet_df` by column list and call .itertuples() on it,
# so both are presumably pandas DataFrames.
# NOTE(review): how the "recent" subset is selected is decided inside notebook_utils — confirm there.
tweet_df, recent_tweet_df = load_tweet_df()

# Quick sanity check of what was loaded.
tweet_df.info()

Loading 2696807 json lines
(4%): 100000 lines in ../data/14-nov/parsed_tweets.json processed (1.796416997909546 sec)
(7%): 200000 lines in ../data/14-nov/parsed_tweets.json processed (1.871419906616211 sec)
(11%): 300000 lines in ../data/14-nov/parsed_tweets.json processed (2.2562928199768066 sec)
(15%): 400000 lines in ../data/14-nov/parsed_tweets.json processed (1.9818439483642578 sec)
(19%): 500000 lines in ../data/14-nov/parsed_tweets.json processed (2.2397468090057373 sec)
(22%): 600000 lines in ../data/14-nov/parsed_tweets.json processed (2.4842288494110107 sec)
(26%): 700000 lines in ../data/14-nov/parsed_tweets.json processed (2.7024729251861572 sec)
(30%): 800000 lines in ../data/14-nov/parsed_tweets.json processed (1.3462560176849365 sec)
(33%): 900000 lines in ../data/14-nov/parsed_tweets.json processed (3.212778091430664 sec)
(37%): 1000000 lines in ../data/14-nov/parsed_tweets.json processed (1.3535559177398682 sec)
(41%): 1100000 lines in ../data/14-nov/parsed_tweets.json

# Top URLs in the dataset

In [8]:
from collections import defaultdict
import heapq
# Per-URL aggregates: the set of tweet ids that shared the URL, plus running sums
# of the retweet and quote counts of those tweets.
url_map = defaultdict(
    lambda: {"tweet_ids": set(), "retweet_count": 0, "quote_count": 0}
)

# Dataset-wide tallies. The retweet/quote totals grow once per (tweet, URL) pair,
# while the tweet total grows at most once per tweet.
total_tweet_count = total_retweet_count = total_quote_count = 0

url_columns = ["datastore_id", "urls", "retweet_count", "quote_count"]
for tweet_id, urls, retweet_count, quote_count in recent_tweet_df[url_columns].itertuples(index=False):
    # Links that point back at twitter.com are ignored.
    shared_urls = [url for url in urls if "twitter.com/" not in url]
    for url in shared_urls:
        url_stats = url_map[url]
        url_stats["tweet_ids"].add(tweet_id)
        url_stats["retweet_count"] += retweet_count
        url_stats["quote_count"] += quote_count
        total_retweet_count += retweet_count
        total_quote_count += quote_count
    # A tweet is counted once regardless of how many qualifying URLs it carries.
    if shared_urls:
        total_tweet_count += 1

In [20]:
# Headline numbers from the aggregation cell. Only tweets with at least one
# non-twitter.com URL were counted, so "tweets with URLs" means exactly that.
summary_rows = [
    ("Number of tweets with URLs: {:,}", total_tweet_count),
    ("Unique URLs shared: {:,}", len(url_map)),
    ("URL share retweet count: {:,}", total_retweet_count),
    ("URL share quote count: {:,}", total_quote_count),
]
for template, value in summary_rows:
    print(template.format(value))

Number of tweets with URLs: 214,666
Unique URLs shared: 58,770
URL share retweet count: 1,354,021
URL share quote count: 171,010


In [21]:
def top_urls_by_retweet_count(url_map, N = 10):
    """Print the N URLs with the largest aggregated retweet count, highest first.

    Args:
        url_map: Mapping of URL -> stats dict holding at least the keys
            "retweet_count" (a number) and "tweet_ids" (a sized collection).
        N: Maximum number of URLs to print.

    Returns:
        None. One line per URL is written to stdout.
    """
    def retweets_of(url):
        return url_map.get(url)["retweet_count"]

    ranked_urls = heapq.nlargest(N, url_map, key=retweets_of)
    for url in ranked_urls:
        stats = url_map.get(url)
        n_tweets = len(stats["tweet_ids"])
        n_retweets = stats["retweet_count"]
        print("{} retweets from {} tweets - {}".format(n_retweets, n_tweets, url))

def filter_url_map(url_map, pred_fn=lambda x: True):
    """Return a new dict with the entries of `url_map` whose key satisfies `pred_fn`.

    Args:
        url_map: Mapping of URL -> stats. It is not modified.
        pred_fn: Callable taking a URL key and returning a truthy value to keep
            the entry. The default keeps every entry.

    Returns:
        A plain dict holding the kept keys. Values are the same objects as in
        `url_map` (they are not copied).
    """
    # The default predicate is a constant True rather than the identity function:
    # an identity predicate would silently drop falsy keys such as an empty URL.
    return {url: stats for url, stats in url_map.items() if pred_fn(url)}

# Overall ranking across every URL collected in url_map.
print("Top URLs in the dataset:")
all_url_map = filter_url_map(url_map)
top_urls_by_retweet_count(all_url_map)

print()

# YouTube subset: plain substring match on the URL text.
# NOTE(review): this also matches URLs that merely mention "youtube" outside the
# host (e.g. in a path or query string) — confirm that is acceptable.
youtube_markers = ("youtu.be", "youtube")
youtube_url_map = filter_url_map(
    url_map,
    lambda x: any(marker in x for marker in youtube_markers),
)
print("Youtube URLs in the dataset: {:,}".format(len(youtube_url_map)))
print("Top Youtube URLs in the dataset:")
top_urls_by_retweet_count(youtube_url_map)


Top URLs in the dataset:
36770 retweets from 192 tweets - https://www.breitbart.com/2020-election/2020/11/07/republican-led-michigan-legislature-to-hold-hearings-on-election-fraud-claims/
26685 retweets from 64 tweets - https://thefederalist.com/2020/11/08/america-wont-trust-elections-until-the-voter-fraud-is-investigated/#.X6iHcjHduyR.twitter
26266 retweets from 199 tweets - http://djt45.co/stopfraud
21252 retweets from 563 tweets - https://breaking911.com/u-s-postal-worker-caught-at-canadian-border-with-stolen-ballots-in-car-trunk/
18282 retweets from 138 tweets - https://www.washingtonexaminer.com/news/lindsey-graham-possible-ballot-harvesting-in-pennsylvania-involving-25-000-nursing-home-residents
16945 retweets from 55 tweets - https://www.houstonchronicle.com/politics/texas/article/Texas-Lt-Gov-Dan-Patrick-offers-1-million-15716973.php?utm_campaign=CMS%20Sharing%20Tools%20(Premium)&utm_source=t.co&utm_medium=referral
15667 retweets from 3072 tweets - https://www.whitehouse.gov/pr