In [1]:
from notebook_utils import setup, load_tweet_df, load_media_df
import pandas as pd

# Notebook initialisation helper imported from notebook_utils; what it configures is not visible in this file.
setup()

In [2]:
# Snapshot label; it selects the dated input and output directories below.
DATE = "16-dec"
# Directory the pickled DataFrames for this snapshot are read from.
DATAFRAMES_DIR = "../data/dataframes/{}/".format(DATE)
# Directory the JSON URL statistics are written to (and read back from) later in the notebook.
EXPORT_DIR = "./data_export/url_stats/{}/".format(DATE)

In [4]:
# Load the recent-tweets DataFrame for the selected snapshot.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
df_recent_tweets = pd.read_pickle(DATAFRAMES_DIR + "df_recent_tweets.pickle")

# Top URLs in the dataset

In [5]:
from collections import defaultdict
import heapq

# url -> statistics aggregated over every tweet that shared that URL.
url_map = defaultdict(lambda: {
    "tweet_ids": set(),
    "aggregated_retweet_count": 0,
    "aggregated_quote_count": 0
})

total_tweet_count = 0
total_retweet_count = 0
total_quote_count = 0

# itertuples() yields (index, urls, retweet_count, quote_count); the frame's
# index value is what gets recorded as the tweet id.
for tweet_id, urls, retweet_count, quote_count in df_recent_tweets[["urls", "retweet_count", "quote_count"]].itertuples():
    # De-duplicate the URLs of a single tweet (first-seen order preserved).
    # Without this, a tweet that lists the same URL twice is stored once in
    # "tweet_ids" but has its retweet/quote counts added twice.
    relevant_urls = [url for url in dict.fromkeys(urls) if "twitter.com/" not in url]
    for url in relevant_urls:
        url_stats = url_map[url]
        url_stats["tweet_ids"].add(tweet_id)
        url_stats["aggregated_retweet_count"] += retweet_count
        url_stats["aggregated_quote_count"] += quote_count
        # Totals are per (tweet, URL) share: a tweet sharing two different
        # URLs contributes its counts once for each of them.
        total_retweet_count += retweet_count
        total_quote_count += quote_count
    if relevant_urls:
        total_tweet_count += 1

In [6]:
# Report the totals gathered by the aggregation loop, one line per metric.
summary_lines = (
    ("Number of tweets with URLs (excluding twitter.com URLs): {:,}", total_tweet_count),
    ("Unique URLs shared: {:,}", len(url_map)),
    ("URL share retweet count: {:,}", total_retweet_count),
    ("URL share quote count: {:,}", total_quote_count),
)
for template, value in summary_lines:
    print(template.format(value))

Number of tweets with URLs (excluding twitter.com URLs): 609,901
Unique URLs shared: 155,064
URL share retweet count: 2,847,863
URL share quote count: 334,915


In [8]:
def top_urls_by_retweet_count(url_map, N = 10):
    """Print the N keys of `url_map` with the highest aggregated retweet count.

    Each line has the form "<retweets> retweets from <tweets> tweets - <key>",
    where <tweets> is the size of the entry's "tweet_ids" collection. Lines are
    printed in descending order of "aggregated_retweet_count".
    """
    def retweets_of(key):
        return url_map.get(key)["aggregated_retweet_count"]

    ranked_keys = heapq.nlargest(N, url_map, key=retweets_of)
    for ranked_key in ranked_keys:
        stats = url_map.get(ranked_key)
        print("{} retweets from {} tweets - {}".format(
            stats["aggregated_retweet_count"],
            len(stats["tweet_ids"]),
            ranked_key,
        ))

def transform_url_map(url_map, filter_fn=lambda x: x, map_key=lambda x: x):
    """Build a new statistics map by filtering and re-keying `url_map`.

    Args:
        url_map: mapping of key -> {"tweet_ids": set, "aggregated_retweet_count": int,
            "aggregated_quote_count": int}.
        filter_fn: called with each key; the entry is kept only when the result is truthy.
        map_key: called with each kept key; entries whose mapped keys collide are merged
            (tweet id sets unioned, counts summed).

    Returns:
        A new dict keyed by the mapped keys. `url_map` itself is never modified.
    """
    new_map = {}
    for key, val in url_map.items():
        if not filter_fn(key):
            continue
        mapped_key = map_key(key)
        merged = new_map.get(mapped_key)
        if merged is None:
            # dict.copy() is shallow and would share the "tweet_ids" set with the
            # source entry, so the merge below would silently add tweet ids to
            # `url_map`. Give the new entry its own set.
            new_map[mapped_key] = {**val, "tweet_ids": set(val["tweet_ids"])}
        else:
            merged["tweet_ids"].update(val["tweet_ids"])
            merged["aggregated_retweet_count"] += val["aggregated_retweet_count"]
            merged["aggregated_quote_count"] += val["aggregated_quote_count"]

    return new_map

# Rank the raw (un-normalised) URLs by aggregated retweet count.
print("Top URLs in the dataset:")
top_urls_by_retweet_count(url_map)


Top URLs in the dataset:
49445 retweets from 19511 tweets - https://www.whitehouse.gov/presidential-actions/executive-order-imposing-certain-sanctions-event-foreign-interference-united-states-election/
46758 retweets from 540 tweets - https://www.breitbart.com/2020-election/2020/11/23/poll-79-of-trump-voters-believe-election-was-stolen-through-illegal-voting-and-fraud/
41078 retweets from 264 tweets - https://www.foxnews.com/opinion/tucker-carlson-2020-presidential-election-voter-fraud-dead-voters.amp
39142 retweets from 719 tweets - https://www.breitbart.com/2020-election/2020/11/19/rudy-giuliani-the-case-for-election-fraud-being-made-by-american-patriots-in-both-parties/
36198 retweets from 192 tweets - https://www.breitbart.com/2020-election/2020/11/07/republican-led-michigan-legislature-to-hold-hearings-on-election-fraud-claims/
32156 retweets from 330 tweets - https://www.breitbart.com/2020-election/2020/11/17/california-2-charged-with-voter-fraud-allegedly-submitted-thousands-of-

## Top Domains in the dataset

In [14]:
from urllib.parse import parse_qs, urlencode, urlparse

def map_to_domain(url):
    """Return the lower-cased network location of `url` without a leading "www.".

    Only a leading "www." label is removed: the previous substring replace also
    altered hosts that merely contain "www." (e.g. "awww.com" -> "acom"), and it
    ran before lower-casing, so "WWW.example.com" kept its prefix.
    """
    domain = urlparse(url).netloc.lower()
    if domain.startswith("www."):
        domain = domain[len("www."):]

    return domain

# Collapse the per-URL statistics into per-domain statistics (URLs mapping to the same domain are merged).
domain_url_map = transform_url_map(url_map, map_key=map_to_domain)
print("Unique domains in the dataset: {:,}".format(len(domain_url_map.keys())))
print("Top domains in the dataset:")

top_urls_by_retweet_count(domain_url_map)

Unique domains in the dataset: 12,634
Top domains in the dataset:
210320 retweets from 10184 tweets - breitbart.com
149799 retweets from 2614 tweets - pscp.tv
105089 retweets from 22853 tweets - thegatewaypundit.com
97824 retweets from 3380 tweets - justthenews.com
97080 retweets from 5264 tweets - thefederalist.com
93279 retweets from 82421 tweets - youtu.be
86205 retweets from 7914 tweets - foxnews.com
73007 retweets from 3921 tweets - oann.com
72211 retweets from 38 tweets - hann.it
52450 retweets from 1022 tweets - djhjmedia.com


## Top YouTube URLs in the dataset

In [15]:
# Accumulates the names of the query parameters that normalize_youtube_url() drops; printed after normalization.
deleted_url_params = set()

def detect_youtube_url(url):
    """Return True when `url` is hosted on youtu.be, youtube.com or one of their subdomains.

    The host is compared case-insensitively and must equal the domain or end
    with "." + domain. A plain substring test on the network location would also
    accept unrelated hosts such as "notyoutube.com".
    """
    # .hostname is lower-cased and excludes any port or userinfo; it is None
    # when the URL has no network location.
    host = urlparse(url).hostname or ""
    return any(
        host == domain or host.endswith("." + domain)
        for domain in ("youtu.be", "youtube.com")
    )

def normalize_youtube_url(url):
    """Rewrite a YouTube URL onto the https://youtu.be host with no query string.

    For "/watch" URLs carrying a "v" parameter, the path becomes "/<first v value>".
    Any other path is kept as is. Params and fragment are left unchanged.

    Side effect: the names of all remaining query parameters (everything except
    a "v" consumed from a "/watch" URL) are added to the module-level
    `deleted_url_params` set.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)
    path = parts.path
    if path == '/watch' and "v" in params:
        # parse_qs maps each name to a list of values; use the first video id.
        path = '/' + params.pop("v")[0]
    deleted_url_params.update(params.keys())
    # The whole query string is discarded, so the rebuilt URL gets an empty query.
    return parts._replace(scheme='https', netloc='youtu.be', path=path, query='').geturl()

# Keep only YouTube URLs and merge the entries that normalise to the same youtu.be URL.
youtube_url_map = transform_url_map(
    url_map, 
    filter_fn=detect_youtube_url,
    map_key=normalize_youtube_url
)

# normalize_youtube_url() recorded every query parameter name it dropped.
print("Deleted URL params after normalizing Youtube URLs", deleted_url_params)

print()
print("Unique Youtube URLs in the dataset: {:,}".format(len(youtube_url_map.keys())))
print("Top Youtube URLs in the dataset:")
top_urls_by_retweet_count(youtube_url_map)

Deleted URL params after normalizing Youtube URLs {'utm_campaign', 'view_as', 'utm_medium', 'itct', 't', 'PC', 'list', 'utm_term', 'redir_token', 'bsft_lx', 'v', 'rootVe', 'has_verified', 'persist_app', 'ebc', 'ab_channel', '__s', 'bsft_eid', 'd', 'bsft_uid', 'search_query', 'start_radio', 'authuser', 'app', 'event', 'html_redirect', 'bsft_mid', 'zarsrc', 'bsft_utid', 'mid', 'attr_tag', 'utm_name', 'bsft_link_id', 'playnext', 'sub_confirmation', 'reload', 'utm_source', 'FORM', 'amp;feature', 'bsft_tv', 'feature', 'isappinstalled', 'disable_polymer', 'index', 'fbclid', 'time_continue', 'bsft_clkid', 'from', 'q', 'noapp', 'id', 'bsft_aaid', 'bsft_mime_type', 'autoplay', 'bsft_ek', 'lc', 'vl', 'pbjreload', 'utm_content'}

Unique Youtube URLs in the dataset: 14,051
Top Youtube URLs in the dataset:
13094 retweets from 25 tweets - https://youtu.be/LPdD8Cd5PGI
11909 retweets from 92 tweets - https://youtu.be/psGpIuNh_dU
7271 retweets from 436 tweets - https://youtu.be/w7vKBiPeyz4
5234 retweet

# Top Media in the dataset

In [3]:
# Load the media-joined-with-tweets DataFrame for the selected snapshot and show its schema.
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
df_media_with_tweets = pd.read_pickle(DATAFRAMES_DIR + 'df_media_with_tweets.pickle')
df_media_with_tweets.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201461 entries, 5633333740437504 to 4895905840889856
Data columns (total 50 columns):
 #   Column                      Non-Null Count   Dtype           
---  ------                      --------------   -----           
 0   type                        201461 non-null  object          
 1   media_id                    201461 non-null  object          
 2   media_url                   201461 non-null  object          
 3   tweet_id                    201461 non-null  object          
 4   urls                        201209 non-null  object          
 5   hasMedia                    201209 non-null  object          
 6   hashtags                    201209 non-null  object          
 7   retweet_count               201461 non-null  int32           
 8   quote_count                 201461 non-null  int32           
 9   user                        201209 non-null  object          
 10  text                        201209 non-null  object         

In [4]:
# Number of rows per media_id, most frequent first.
df_media_with_tweets['media_id'].value_counts().sort_values(ascending=False)

1325868287020970000    175
1320106466028769282    162
1324507941987405826    142
1326595916002856963    132
1329720320555581441    109
                      ... 
1325487004579082242      1
1333236566576091136      1
1327929991007514625      1
1331722326552039426      1
1324151588387196930      1
Name: media_id, Length: 196517, dtype: int64

In [5]:
import pandas as pd

# Select the export columns and parse the timestamps in one step. `.assign`
# returns a new frame, so nothing is written into a slice of
# df_media_with_tweets and pandas has no reason to emit SettingWithCopyWarning.
for_export = df_media_with_tweets[[
    'media_url', 'media_id', 'tweet_id', 'hashtags', 'user', 'type', 'retweet_count', 'quote_count', 'timestamp'
]].assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))

# Keep only rows strictly after the cutoff; rows with a missing timestamp
# compare False and are dropped as well.
for_export = for_export[for_export['timestamp'] > '2020-10-23 00:00:00']

# Sanity checks on the exported range, size and columns.
print(for_export['timestamp'].min())
print(for_export['timestamp'].max())
print(for_export.shape)
print(for_export.columns)

# The frame's index is written as the "datastore_id" column.
for_export.to_csv("media_joined_with_tweets-{}.csv".format(DATE), index_label="datastore_id")

2020-10-23 17:00:33+00:00
2020-12-16 12:28:05+00:00
(201209, 9)
Index(['media_url', 'media_id', 'tweet_id', 'hashtags', 'user', 'type',
       'retweet_count', 'quote_count', 'timestamp'],
      dtype='object')


In [8]:
# Test export: read the CSV back and inspect the resulting column dtypes and row count.
df = pd.read_csv("media_joined_with_tweets-{}.csv".format(DATE))
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 201209 entries, 0 to 201208
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   datastore_id   201209 non-null  int64 
 1   media_url      201209 non-null  object
 2   media_id       201209 non-null  int64 
 3   tweet_id       201209 non-null  int64 
 4   hashtags       201209 non-null  object
 5   user           201209 non-null  int64 
 6   type           201209 non-null  object
 7   retweet_count  201209 non-null  int64 
 8   quote_count    201209 non-null  int64 
 9   timestamp      201209 non-null  object
dtypes: int64(6), object(4)
memory usage: 15.4+ MB


In [32]:
def top_media_by_retweet_count(df_media_with_tweets, N = 25):
    """Print the N rows with the largest "retweet_count", highest first.

    Each line has the form "<retweet_count> retweets: <media_url>".
    """
    ranked = df_media_with_tweets.nlargest(N, ['retweet_count'])
    for retweets, url in zip(ranked["retweet_count"], ranked["media_url"]):
        print("{} retweets: {}".format(retweets, url))

# Rank individual rows (one per media item and tweet) by the tweet's retweet count.
top_media_by_retweet_count(df_media_with_tweets)

19400 retweets: http://pbs.twimg.com/media/EmHTacqXgAAL7Zo.jpg
17497 retweets: http://pbs.twimg.com/media/Enw48WQUUAUl01f.jpg
16804 retweets: http://pbs.twimg.com/amplify_video_thumb/1324105823845523456/img/2z47IGTqb_gZij3r.jpg
15526 retweets: http://pbs.twimg.com/media/EmAFt6BUcAAcs5r.jpg
12341 retweets: http://pbs.twimg.com/ext_tw_video_thumb/1324476554446069763/pu/img/z-GoJ3__Ctp5WZpa.jpg
12336 retweets: http://pbs.twimg.com/ext_tw_video_thumb/1324811238682013704/pu/img/eb79SStIIBfEfMcN.jpg
11012 retweets: http://pbs.twimg.com/media/EmF9szQXUAQ9-KL.jpg
10701 retweets: http://pbs.twimg.com/media/Emg71EpXUAYup9Z.jpg
9294 retweets: http://pbs.twimg.com/media/EmACdkyX0AQ7bDO.png
8853 retweets: http://pbs.twimg.com/media/Eo4KLDyXEAIy5Dl.jpg
8527 retweets: http://pbs.twimg.com/media/EmFpwPFXYAI_06X.jpg
7951 retweets: http://pbs.twimg.com/ext_tw_video_thumb/1324652359935352832/pu/img/--NEuAQfXmi8mupk.jpg
7683 retweets: http://pbs.twimg.com/amplify_video_thumb/1325130145477373952/img/43S-cx

In [34]:
import numpy as np
# Number of distinct media_url values in the joined frame.
len(np.unique(df_media_with_tweets['media_url']))

196517

## Export to JSON

In [16]:
import json

def serialize_sets(obj):
    """`default` hook for json.dump / json.dumps that encodes sets as lists.

    Sets are emitted sorted when their elements can be ordered, so the exported
    file does not depend on set iteration order. Otherwise they are emitted in
    iteration order.

    Raises:
        TypeError: for any other type, as the json `default` contract requires.
            Returning the object unchanged makes the encoder fail with a
            misleading "Circular reference detected" ValueError.
    """
    if isinstance(obj, set):
        try:
            return sorted(obj)
        except TypeError:
            # Elements are not mutually orderable; keep iteration order.
            return list(obj)

    raise TypeError("Object of type {} is not JSON serializable".format(type(obj).__name__))

import os

# open(..., "w") does not create missing directories, and EXPORT_DIR changes
# with every DATE, so make sure it exists before writing.
os.makedirs(EXPORT_DIR, exist_ok=True)

# One JSON file per aggregation level; the tweet id sets are converted by serialize_sets.
for export_name, stats_map in (
    ("youtube_urls.json", youtube_url_map),
    ("domains.json", domain_url_map),
    ("all_urls.json", url_map),
):
    with open(EXPORT_DIR + export_name, "w", encoding="utf-8") as f:
        json.dump(stats_map, f, sort_keys=True, indent=2, default=serialize_sets)

In [None]:
# json.load takes the open file object as its only positional argument.
# The previous call passed url_map as well and raised TypeError.
with open(EXPORT_DIR + "expanded_url_map.json", "r", encoding="utf-8") as f:
    expanded_map = json.load(f)

In [None]:
# Display the loaded mapping as the cell output.
expanded_map