In [1]:
from notebook_utils import setup, load_tweet_df, load_media_df

setup()

In [2]:
tweet_df, recent_tweet_df = load_tweet_df()

tweet_df.info()

Loading 2696807 json lines
(4%): 100000 lines in ../data/14-nov/parsed_tweets.json processed (2.472034215927124 sec)
(7%): 200000 lines in ../data/14-nov/parsed_tweets.json processed (2.2573370933532715 sec)
(11%): 300000 lines in ../data/14-nov/parsed_tweets.json processed (2.6654880046844482 sec)
(15%): 400000 lines in ../data/14-nov/parsed_tweets.json processed (2.3025078773498535 sec)
(19%): 500000 lines in ../data/14-nov/parsed_tweets.json processed (2.856652021408081 sec)
(22%): 600000 lines in ../data/14-nov/parsed_tweets.json processed (2.842512845993042 sec)
(26%): 700000 lines in ../data/14-nov/parsed_tweets.json processed (2.9592249393463135 sec)
(30%): 800000 lines in ../data/14-nov/parsed_tweets.json processed (1.809798002243042 sec)
(33%): 900000 lines in ../data/14-nov/parsed_tweets.json processed (3.6938552856445312 sec)
(37%): 1000000 lines in ../data/14-nov/parsed_tweets.json processed (1.8063840866088867 sec)
(41%): 1100000 lines in ../data/14-nov/parsed_tweets.json 

# Top URLs in the dataset

In [121]:
from collections import defaultdict
import heapq
url_map = defaultdict(lambda: {
    "tweet_ids": set(),
    "aggregated_retweet_count": 0,
    "aggregated_quote_count": 0
})

total_tweet_count = 0
total_retweet_count = 0
total_quote_count = 0

for tweet_id, urls, retweet_count, quote_count in recent_tweet_df[["urls", "retweet_count", "quote_count"]].itertuples():
    has_relevant_url = False
    for url in urls:
        if "twitter.com/" not in url:
            url = url.lower()
            has_relevant_url = True
            url_map[url]["tweet_ids"].add(tweet_id)
            url_map[url]["aggregated_retweet_count"] += retweet_count
            url_map[url]["aggregated_quote_count"] += quote_count
            total_retweet_count += retweet_count
            total_quote_count += quote_count
    if has_relevant_url:
        total_tweet_count += 1

In [122]:
print("Number of tweets with URLs (excluding twitter.com URLs): {:,}".format(total_tweet_count))
print("Unique URLs shared: {:,}".format(len(url_map.keys())))
print("URL share retweet count: {:,}".format(total_retweet_count))
print("URL share quote count: {:,}".format(total_quote_count))

Number of tweets with URLs (excluding twitter.com URLs): 214,666
Unique URLs shared: 58,529
URL share retweet count: 1,354,021
URL share quote count: 171,010


In [123]:
def top_urls_by_retweet_count(url_map, N = 10):
    for url in heapq.nlargest(N, url_map, key=lambda x: url_map.get(x)["aggregated_retweet_count"]):
        url_stats = url_map.get(url)
        tweet_count = len(url_stats["tweet_ids"])
        retweet_count = url_stats["aggregated_retweet_count"]
        print("{} retweets from {} tweets - {}".format(retweet_count, tweet_count, url))

def transform_url_map(url_map, filter_fn=lambda x: x, map_key=lambda x: x):
    new_map = {}
    for key, val in url_map.items():
        if filter_fn(key):
            mapped_key = map_key(key)
            if (mapped_key in new_map):
                existing_entry = new_map[mapped_key]
                existing_entry["tweet_ids"].update(val["tweet_ids"])
                existing_entry["aggregated_retweet_count"] += val["aggregated_retweet_count"]
                existing_entry["aggregated_quote_count"] += val["aggregated_quote_count"]
                new_map[mapped_key] = existing_entry
            else:
                new_map[mapped_key] = val.copy()

    return new_map

print("Top URLs in the dataset:")
top_urls_by_retweet_count(url_map)


Top URLs in the dataset:
36770 retweets from 192 tweets - https://www.breitbart.com/2020-election/2020/11/07/republican-led-michigan-legislature-to-hold-hearings-on-election-fraud-claims/
26685 retweets from 64 tweets - https://thefederalist.com/2020/11/08/america-wont-trust-elections-until-the-voter-fraud-is-investigated/#.x6ihcjhduyr.twitter
26267 retweets from 204 tweets - http://djt45.co/stopfraud
21252 retweets from 563 tweets - https://breaking911.com/u-s-postal-worker-caught-at-canadian-border-with-stolen-ballots-in-car-trunk/
18282 retweets from 138 tweets - https://www.washingtonexaminer.com/news/lindsey-graham-possible-ballot-harvesting-in-pennsylvania-involving-25-000-nursing-home-residents
16945 retweets from 55 tweets - https://www.houstonchronicle.com/politics/texas/article/texas-lt-gov-dan-patrick-offers-1-million-15716973.php?utm_campaign=cms%20sharing%20tools%20(premium)&utm_source=t.co&utm_medium=referral
15667 retweets from 3073 tweets - https://www.whitehouse.gov/pr

## Top Domains in the dataset

In [124]:
from urllib.parse import parse_qs, urlencode, urlparse

def map_to_domain(url):
    parsed = urlparse(url)

    return parsed.netloc.replace("www.", "")

domain_url_map = transform_url_map(url_map, map_key=map_to_domain)
print("Unique domains in the dataset: {:,}".format(len(domain_url_map.keys())))
print("Top domains in the dataset:")

top_urls_by_retweet_count(domain_url_map)

Unique domains in the dataset: 6,924
Top domains in the dataset:
89295 retweets from 3106 tweets - thefederalist.com
54843 retweets from 26 tweets - hann.it
51541 retweets from 3184 tweets - breitbart.com
44745 retweets from 7827 tweets - thegatewaypundit.com
44196 retweets from 1072 tweets - justthenews.com
40130 retweets from 2578 tweets - nypost.com
39923 retweets from 26971 tweets - youtu.be
35060 retweets from 703 tweets - breaking911.com
29745 retweets from 3965 tweets - nytimes.com
29251 retweets from 2599 tweets - zerohedge.com


## Top YouTube URLs in the dataset

In [125]:
deleted_url_params = set()

def detect_youtube_url(url):
    parsed = urlparse(url)
    return "youtu.be" in parsed.netloc or "youtube.com" in parsed.netloc

def normalize_youtube_url(url):
    parsed = urlparse(url)
    parsed = parsed._replace()
    query_params = parse_qs(parsed.query)
    if (parsed.path == '/watch' and "v" in query_params):
        updated_path = '/' + query_params["v"][0]
        del query_params["v"]
        parsed = parsed._replace(path=updated_path)
    deleted_url_params.update(query_params.keys())
    query_params = {}
    updated_query = urlencode(query_params, doseq=True)
    parsed = parsed._replace(scheme='https', netloc='youtu.be', query=updated_query)
    return parsed.geturl()

youtube_url_map = transform_url_map(
    url_map, 
    filter_fn=detect_youtube_url,
    map_key=normalize_youtube_url
)

print("Deleted URL params after normalizing Youtube URLs", deleted_url_params)

print()
print("Unique Youtube URLs in the dataset: {:,}".format(len(youtube_url_map.keys())))
print("Top Youtube URLs in the dataset:")
top_urls_by_retweet_count(youtube_url_map)

Deleted URL params after normalizing Youtube URLs {'time_continue', 'start_radio', 'ab_channel', 'index', 'bsft_tv', 'utm_campaign', 'utm_medium', 't', 'ebc', 'feature', 'bsft_utid', 'pc', 'bsft_mid', 'bsft_lx', 'bsft_uid', 'amp;feature', 'html_redirect', 'search_query', 'from', 'bsft_ek', 'q', 'bsft_clkid', 'bsft_eid', 'view_as', 'reload', 'has_verified', 'list', 'bsft_link_id', 'noapp', 'utm_content', 'persist_app', 'd', 'authuser', 'utm_source', 'attr_tag', 'bsft_mime_type', 'pbjreload', 'app', 'redir_token', 'event', 'bsft_aaid', 'form', 'autoplay', 'lc', 'v', 'fbclid'}

Unique Youtube URLs in the dataset: 5,530
Top Youtube URLs in the dataset:
7301 retweets from 433 tweets - https://youtu.be/w7vkbipeyz4
4767 retweets from 336 tweets - https://youtu.be/96-bqaivopc
3953 retweets from 30 tweets - https://youtu.be/byta1amljxy
2504 retweets from 508 tweets - https://youtu.be/ztu5y5obwpk
1605 retweets from 145 tweets - https://youtu.be/vgmpdnwunqs
1404 retweets from 221 tweets - https:/

# Top Media in the dataset

In [61]:
media_df = load_media_df().drop_duplicates('media_id')
media_df.info()

Loading 156311 json lines
(64%): 100000 lines in ../data/14-nov/parsed_media.json processed (0.36994314193725586 sec)
Done loading ../data/14-nov/parsed_media.json
156311 lines in ../data/14-nov/parsed_media.json processed (0.5557003021240234 sec)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 78440 entries, 0 to 156298
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   media_url  78440 non-null  object
 1   tweet_id   78440 non-null  object
 2   media_id   78440 non-null  object
 3   type       78440 non-null  object
dtypes: object(4)
memory usage: 3.0+ MB


In [63]:
# Preserve types when joining
tweet_df_with_media = tweet_df[tweet_df['hasMedia'] == True]
col_types = tweet_df_with_media.select_dtypes(include=['int', 'int32']).dtypes
media_with_tweets_df = media_df.set_index('tweet_id').join(tweet_df_with_media, on='tweet_id')
for col, col_type in col_types.iteritems():
    media_with_tweets_df[col] = media_with_tweets_df[col].fillna(0).astype(col_type)

media_with_tweets_df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 78440 entries, 1324192339762520065 to 1324712263228497921
Data columns (total 47 columns):
 #   Column                      Non-Null Count  Dtype           
---  ------                      --------------  -----           
 0   media_url                   78440 non-null  object          
 1   media_id                    78440 non-null  object          
 2   type                        78440 non-null  object          
 3   hashtags                    78434 non-null  object          
 4   urls                        78434 non-null  object          
 5   hasMedia                    78434 non-null  object          
 6   quote_tweet                 31358 non-null  object          
 7   retweet_count               78440 non-null  int32           
 8   timestamp                   78434 non-null  object          
 9   quote_count                 78440 non-null  int32           
 10  user                        78434 non-null  object          
 11  t

In [71]:
def top_media_by_retweet_count(media_with_tweets_df, N = 25):
    for media_id, media in media_with_tweets_df.nlargest(N, ['retweet_count']).iterrows():
        retweet_count = media["retweet_count"]
        media_url = media["media_url"]
        print("{} retweets: {}".format(retweet_count, media_url))

top_media_by_retweet_count(media_with_tweets_df)

32145 retweets: http://pbs.twimg.com/media/EkM3ALlXsAAlZCR.jpg
19627 retweets: http://pbs.twimg.com/media/EmHTacqXgAAL7Zo.jpg
19546 retweets: http://pbs.twimg.com/media/EbwW3PeXQAAcjkD.jpg
18538 retweets: http://pbs.twimg.com/media/EcIXpAAWAAEFPV5.jpg
17252 retweets: http://pbs.twimg.com/amplify_video_thumb/1324105823845523456/img/2z47IGTqb_gZij3r.jpg
13755 retweets: http://pbs.twimg.com/media/Ej56lVnWoAA0xEi.jpg
13471 retweets: http://pbs.twimg.com/media/EmLU3OEWMAE-sV0.jpg
13373 retweets: http://pbs.twimg.com/amplify_video_thumb/1301288746680307712/img/rjkmu6VgO-ZZCDaL.jpg
12715 retweets: http://pbs.twimg.com/ext_tw_video_thumb/1324811238682013704/pu/img/eb79SStIIBfEfMcN.jpg
12434 retweets: http://pbs.twimg.com/ext_tw_video_thumb/1324476554446069763/pu/img/z-GoJ3__Ctp5WZpa.jpg
11384 retweets: http://pbs.twimg.com/media/EmF9szQXUAQ9-KL.jpg
10782 retweets: http://pbs.twimg.com/media/Emg71EpXUAYup9Z.jpg
10615 retweets: http://pbs.twimg.com/ext_tw_video_thumb/796043796618379264/pu/img/Z-

In [70]:
np.unique(media_with_tweets_df['media_url'])

array(['http://pbs.twimg.com/amplify_video_thumb/1073399918365003776/img/ejp_AQNgJRIhv8Fj.jpg',
       'http://pbs.twimg.com/amplify_video_thumb/1091352389825703936/img/dx1TMAkAtEJ3-ZKK.jpg',
       'http://pbs.twimg.com/amplify_video_thumb/1139265519721336832/img/9RoODfrkiRyrWNWi.jpg',
       ..., 'http://pbs.twimg.com/tweet_video_thumb/EmzocgMXIAA5brg.jpg',
       'http://pbs.twimg.com/tweet_video_thumb/Emzoj-bUwAAXZ3H.jpg',
       'http://pbs.twimg.com/tweet_video_thumb/EmzpOemVkAEmB05.jpg'],
      dtype=object)

## Export to JSON

In [126]:
import json

def serialize_sets(obj):
    if isinstance(obj, set):
        return list(obj)

    return obj

with open("./data_export/url_stats/youtube_urls.json", "w", encoding="utf-8") as f:
    json.dump(youtube_url_map, f, sort_keys=True, indent=2, default=serialize_sets)

with open("./data_export/url_stats/domains.json", "w", encoding="utf-8") as f:
    json.dump(domain_url_map, f, sort_keys=True, indent=2, default=serialize_sets)

with open("./data_export/url_stats/all_urls.json", "w", encoding="utf-8") as f:
    json.dump(url_map, f, sort_keys=True, indent=2, default=serialize_sets)