In [48]:
import os
import argparse
import json
import time

from googleapiclient.discovery import build
from googleapiclient.errors import HttpError

from urllib.parse import urlparse
from collections import defaultdict
import heapq

# Get YouTube data details

In [38]:
def video_id(url):
    """
    Examples:
    - http://youtu.be/SA2iWivDJiE
    - http://www.youtube.com/watch?v=_oPAwA_Udwc&feature=feedu
    - http://www.youtube.com/embed/SA2iWivDJiE
    - http://www.youtube.com/v/SA2iWivDJiE?version=3&amp;hl=en_US
    """
    o = urlparse(url)
    if o.netloc == 'youtu.be':
        return o.path[1:]
    elif o.netloc in ('www.youtube.com', 'youtube.com'):
        if o.path == '/watch':
            id_index = o.query.index('v=')
            return o.query[id_index+2:id_index+13]
        elif o.path[:7] == '/embed/':
            return o.path.split('/')[2]
        elif o.path[:3] == '/v/':
            return o.path.split('/')[2]
    return None

In [25]:
# Set DEVELOPER_KEY to the API key value from the APIs & auth > Registered apps
# tab of
#   https://cloud.google.com/console
# Please ensure that you have enabled the YouTube Data API for your project.
with open("../private/youtube", "r") as r:
  for line in r:
    DEVELOPER_KEY = line[:-1]
    break
YOUTUBE_API_SERVICE_NAME = 'youtube'
YOUTUBE_API_VERSION = 'v3'

In [26]:
def videos_list_multiple_ids(youtube, videoids):
  response = youtube.videos().list(
    
  ).execute()

  return response['items']

In [33]:
youtube = build(YOUTUBE_API_SERVICE_NAME, YOUTUBE_API_VERSION,
    developerKey=DEVELOPER_KEY)


links = []
videoIds = []
batchSize = 50
with open("./data_export/url_stats/16-dec/expanded_url_map.json", "r") as r:
  data = json.load(r)

In [80]:
youtube_data = []
videoIds = []
for key, vv in data.items():
  if 'url' in vv['expanded_url']:
    v = vv['expanded_url']['url']
  else:
    v = key

  try:
    vid = video_id(key)
  except:
    vid = None
  if vid is not None:
    videoIds.append(vid)
    record = {'video_id': vid}
    record.update(vv)
    record['url'] = v
    youtube_data.append(record)

In [40]:
videoIds = list(set(videoIds))

In [41]:
total = len(videoIds)

In [42]:
total

13611

In [45]:
cnt = 0
responses = []
while cnt < total:
  batch = videoIds[cnt:cnt+batchSize]
  cnt += batchSize
  try:
    video_response = youtube.videos().list(id=','.join(batch),
                part='snippet').execute()
    for item in video_response['items']:
      responses.append(item)
  except HttpError as e:
    print('An HTTP error {} occurred:\n{}'.format(e.resp.status, e.content))
    if e.resp.status == 403:
      cnt -= batchSize
      time.sleep(3600* 12) # Sleep for one hour if hits rate limit 
      print('Hits rate limit, sleep for one hour.')
  print("{} / {} processed".format(cnt, total))

50 / 13611 processed
100 / 13611 processed
150 / 13611 processed
200 / 13611 processed
250 / 13611 processed
300 / 13611 processed
350 / 13611 processed
400 / 13611 processed
450 / 13611 processed
500 / 13611 processed
550 / 13611 processed
600 / 13611 processed
650 / 13611 processed
700 / 13611 processed
750 / 13611 processed
800 / 13611 processed
850 / 13611 processed
900 / 13611 processed
950 / 13611 processed
1000 / 13611 processed
1050 / 13611 processed
1100 / 13611 processed
1150 / 13611 processed
1200 / 13611 processed
1250 / 13611 processed
1300 / 13611 processed
1350 / 13611 processed
1400 / 13611 processed
1450 / 13611 processed
1500 / 13611 processed
1550 / 13611 processed
1600 / 13611 processed
1650 / 13611 processed
1700 / 13611 processed
1750 / 13611 processed
1800 / 13611 processed
1850 / 13611 processed
1900 / 13611 processed
1950 / 13611 processed
2000 / 13611 processed
2050 / 13611 processed
2100 / 13611 processed
2150 / 13611 processed
2200 / 13611 processed
2250 / 1

In [46]:
with open("./data_export/url_stats/16-dec/youtube_data_details.json", "w") as w:
  for response in responses:
    w.write(json.dumps(response) + '\n')

# Top URLs in the dataset

In [47]:
with open("./data_export/url_stats/16-dec/all_urls.json", "r") as r:
  urls_meta = json.load(r)

In [69]:
urls_map = defaultdict(lambda: {
    "tweet_ids": set(),
    "aggregated_retweet_count": 0,
    "aggregated_quote_count": 0,
})

total_tweet_count = 0
total_retweet_count = 0
total_quote_count = 0

for key, vv in data.items():
  if 'url' in vv['expanded_url']:
    v = vv['expanded_url']['url']
    urls_map[v]['domain'] = vv['expanded_url']['domain']['domain']
  else:
    v = key
    urls_map[v]['domain'] = 'URL NOT AVAILABLE'

  urls_map[v]['tweet_ids'] = (set(urls_map[v]['tweet_ids']) | set(urls_meta[key]['tweet_ids']))
  urls_map[v]['aggregated_retweet_count'] += urls_meta[key]['aggregated_retweet_count']
  urls_map[v]['aggregated_quote_count'] += urls_meta[key]['aggregated_quote_count']
  total_retweet_count += urls_meta[key]['aggregated_retweet_count']
  total_quote_count += urls_meta[key]['aggregated_quote_count']
  total_tweet_count += len(urls_meta[key])

In [68]:
def top_urls_by_retweet_count(url_map, N = 10):
    for url in heapq.nlargest(N, url_map, key=lambda x: url_map.get(x)["aggregated_retweet_count"]):
        url_stats = url_map.get(url)
        tweet_count = len(url_stats["tweet_ids"])
        retweet_count = url_stats["aggregated_retweet_count"]
        print("{} retweets from {} tweets - {}".format(retweet_count, tweet_count, url))

def transform_url_map(url_map, filter_fn=lambda x: x, map_key=lambda x: x):
    new_map = {}
    for key, val in url_map.items():
        if filter_fn(key):
            mapped_key = map_key(key)
            if (mapped_key in new_map):
                existing_entry = new_map[mapped_key]
                existing_entry["tweet_ids"].update(val["tweet_ids"])
                existing_entry["aggregated_retweet_count"] += val["aggregated_retweet_count"]
                existing_entry["aggregated_quote_count"] += val["aggregated_quote_count"]
                new_map[mapped_key] = existing_entry
            else:
                new_map[mapped_key] = val.copy()

    return new_map

print("Top URLs in the dataset:")
top_urls_by_retweet_count(urls_map)


Top URLs in the dataset:
49638 retweets from 22106 tweets - https://www.whitehouse.gov/presidential-actions/executive-order-imposing-certain-sanctions-event-foreign-interference-united-states-election/
46781 retweets from 603 tweets - https://www.breitbart.com/2020-election/2020/11/23/poll-79-of-trump-voters-believe-election-was-stolen-through-illegal-voting-and-fraud/
41078 retweets from 264 tweets - https://www.foxnews.com/opinion/tucker-carlson-2020-presidential-election-voter-fraud-dead-voters.amp
39154 retweets from 776 tweets - https://www.breitbart.com/2020-election/2020/11/19/rudy-giuliani-the-case-for-election-fraud-being-made-by-american-patriots-in-both-parties/
36201 retweets from 195 tweets - https://www.breitbart.com/2020-election/2020/11/07/republican-led-michigan-legislature-to-hold-hearings-on-election-fraud-claims/
32156 retweets from 334 tweets - https://www.breitbart.com/2020-election/2020/11/17/california-2-charged-with-voter-fraud-allegedly-submitted-thousands-of-

In [65]:
print("Number of tweets with URLs (excluding twitter.com URLs): {:,}".format(total_tweet_count))
print("Unique URLs shared: {:,}".format(len(urls_map.keys())))
print("URL share retweet count: {:,}".format(total_retweet_count))
print("URL share quote count: {:,}".format(total_quote_count))

Number of tweets with URLs (excluding twitter.com URLs): 465,192
Unique URLs shared: 138,970
URL share retweet count: 2,847,863
URL share quote count: 334,915


## Top Domains in the dataset

In [71]:
def map_to_domain(url):
  return urls_map[url]['domain']

domain_url_map = transform_url_map(urls_map, map_key=map_to_domain)
print("Unique domains in the dataset: {:,}".format(len(domain_url_map.keys())))
print("Top domains in the dataset:")

top_urls_by_retweet_count(domain_url_map)

Unique domains in the dataset: 9,650
Top domains in the dataset:
211496 retweets from 10828 tweets - breitbart
149806 retweets from 2618 tweets - pscp
130059 retweets from 91838 tweets - google
105188 retweets from 23010 tweets - thegatewaypundit
99581 retweets from 3500 tweets - justthenews
97132 retweets from 5351 tweets - thefederalist
92837 retweets from 18105 tweets - theepochtimes
90845 retweets from 9429 tweets - foxnews
73073 retweets from 4460 tweets - oann
53863 retweets from 16290 tweets - rumble


## Top YouTube URLs in the dataset

In [82]:
youtube_urls = {}
for yturl in youtube_data:
  youtube_urls[yturl['url']] = yturl

In [83]:
len(youtube_urls.keys())

17319

In [84]:
def detect_youtube_url(url):
    return url in youtube_urls

def normalize_youtube_url(url):
    return youtube_urls[url]['video_id']

youtube_url_map = transform_url_map(
    urls_map, 
    filter_fn=detect_youtube_url,
    map_key=normalize_youtube_url
)

print()
print("Unique Youtube URLs in the dataset: {:,}".format(len(youtube_url_map.keys())))
print("Top Youtube IDs in the dataset:")
top_urls_by_retweet_count(youtube_url_map)


Unique Youtube URLs in the dataset: 13,611
Top Youtube IDs in the dataset:
13094 retweets from 25 tweets - LPdD8Cd5PGI
11909 retweets from 93 tweets - psGpIuNh_dU
7271 retweets from 436 tweets - w7vKBiPeyz4
5204 retweets from 115 tweets - QNN9I0xxZRE
4739 retweets from 337 tweets - 96-BQaIVOpc
3819 retweets from 32 tweets - bYTa1AMLJxY
2465 retweets from 675 tweets - Ztu5Y5obWPk
2260 retweets from 1467 tweets - p2MkvWh7poY
1605 retweets from 150 tweets - VgMPDnWunqs
1406 retweets from 223 tweets - g9_SgYJnbKo


In [85]:
yt_details = {}
for res in responses:
  yt_details[res['id']] = res

In [95]:
def detect_youtube_url(url):
    return url in youtube_urls

def get_channel_name(url):
    if youtube_urls[url]['video_id'] not in yt_details:
      return None
    if 'channelTitle' in yt_details[youtube_urls[url]['video_id']]['snippet']:
        return yt_details[youtube_urls[url]['video_id']]['snippet']['channelTitle']
    return None

youtube_url_map = transform_url_map(
    urls_map, 
    filter_fn=detect_youtube_url,
    map_key=get_channel_name
)

print()
print("Unique Youtube URLs in the dataset: {:,}".format(len(youtube_url_map.keys())))
print("Top Youtube channels in the dataset:")
top_urls_by_retweet_count(youtube_url_map)


Unique Youtube URLs in the dataset: 5,086
Top Youtube channels in the dataset:
13404 retweets from 426 tweets - Gateway Pundit
12238 retweets from 706 tweets - Precinct 13
8567 retweets from 2543 tweets - Project Veritas
8058 retweets from 543 tweets - AWK NEWS
6274 retweets from 82470 tweets - One America News Network
5549 retweets from 1141 tweets - StevenCrowder
5205 retweets from 120 tweets - Destiny Image
5168 retweets from 8192 tweets - None
3821 retweets from 67 tweets - ignant hunter
3636 retweets from 2819 tweets - Right Side Broadcasting Network
