<a href="https://colab.research.google.com/github/sahilfaizal01/Semantic-Search-over-YT-Videos/blob/main/Extracting_Video_IDs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import requests
import json
import polars as pl
from youtube_api_key import API_KEY

In [3]:
def getVideoRecords(response: "requests.models.Response") -> list:
  """
  Extract YouTube video records from a GET request response.

  Args:
    response: response object whose ``.text`` attribute holds the JSON body.

  Returns:
    A list of dicts with the keys 'video_id', 'datetime' and 'title', one per
    entry of the body's 'items' list whose id kind is "youtube#video".
    Entries of any other kind are skipped.

  Raises:
    KeyError: if the body has no 'items' key. The message includes the body's
      'error' entry (None when absent) so the cause is visible to the reader.
  """
  payload = json.loads(response.text)

  if 'items' not in payload:
    # surface what the body actually contains instead of a bare KeyError('items')
    raise KeyError(f"response has no 'items' key; error payload: {payload.get('error')!r}")

  return [
    {
      'video_id': raw_item['id']['videoId'],
      'datetime': raw_item['snippet']['publishedAt'],
      'title': raw_item['snippet']['title'],
    }
    for raw_item in payload['items']
    # only keep youtube videos
    if raw_item['id']['kind'] == "youtube#video"
  ]

In [4]:
# API endpoint that the GET requests are sent to
url = 'https://www.googleapis.com/youtube/v3/search'

# ID of the channel whose videos are collected
channel_id = 'UCCxmzh3f-wwRZ-bEdLb80Og'

# accumulator for the video records gathered across result pages
video_record_list = []

# no page token is known before the first request
page_token = None

In [5]:
%%time
# extract video data across multiple search result pages
while page_token != 0:
  # define parameters for API call
  params = {"key": API_KEY, 'channelId': channel_id, 'part': ["snippet","id"], 'order': "date", 'maxResults':50, 'pageToken': page_token}
  # make get request
  response = requests.get(url, params=params)

  # append video records to list
  video_record_list += getVideoRecords(response)

  try:
    # grab next page token
    page_token = json.loads(response.text)['nextPageToken']
  except:
    # if no next page token kill while loop
    page_token = 0

CPU times: user 1.24 s, sys: 34.3 ms, total: 1.28 s
Wall time: 6.5 s


In [7]:
# display the collected video records as the cell output
video_record_list

[{'video_id': 'XT0WblJZqSo',
  'datetime': '2024-07-04T20:00:09Z',
  'title': 'Sheldon Gets Fired | The Big Bang Theory'},
 {'video_id': 'P2A82K1sG48',
  'datetime': '2024-07-04T12:00:21Z',
  'title': 'Nerds and Their Moms | The Big Bang Theory #shorts'},
 {'video_id': 'looMyVTPOn8',
  'datetime': '2024-07-04T07:00:21Z',
  'title': 'Funny Moments from Seasons 11 and 12 | The Big Bang Theory'},
 {'video_id': 'j6YOTFV_w5M',
  'datetime': '2024-07-03T20:00:16Z',
  'title': 'Is Sheldon a Robot? | The Big Bang Theory'},
 {'video_id': 'UUTQluNYg5I',
  'datetime': '2024-07-03T17:00:19Z',
  'title': 'Professor Proton Picks Leonard Over Sheldon | The Big Bang Theory'},
 {'video_id': 'W6YoST3hOPQ',
  'datetime': '2024-07-03T12:00:26Z',
  'title': 'These Comic Book Guys Stay Lurking 👀 | The Big Bang Theory #shorts'},
 {'video_id': 'Csnef25I-h4',
  'datetime': '2024-07-02T20:00:04Z',
  'title': 'The Sword of Azeroth | The Big Bang Theory'},
 {'video_id': 'yRn_ESdW8Es',
  'datetime': '2024-07-02T17

In [8]:
# write data to file
# build the frame once and reuse it for both output formats
video_df = pl.DataFrame(video_record_list)
video_df.write_parquet('video-ids.parquet')
video_df.write_csv('video-ids.csv')