<a href="https://colab.research.google.com/github/sahilfaizal01/Semantic-Search-over-YT-Videos/blob/main/Extracting_Video_Transcripts.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install youtube-transcript-api

Collecting youtube-transcript-api
  Downloading youtube_transcript_api-0.6.2-py3-none-any.whl (24 kB)
Installing collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-0.6.2


In [2]:
import polars as pl
from youtube_transcript_api import YouTubeTranscriptApi

In [3]:
def extract_text(transcript: list) -> str:
  """
  Join the caption snippets of a transcript into a single string.

  Args:
    transcript: iterable of mapping-like snippets (a list of dicts in this
      notebook), each holding its caption text under the 'text' key. Other
      keys are ignored.

  Returns:
    The 'text' values in their original order, separated by single spaces.
    An empty transcript yields an empty string.

  Raises:
    KeyError: if a snippet has no 'text' key.
  """
  # iterate over the snippets directly instead of by index, so any iterable
  # (not only a sized, indexable list) is accepted
  return ' '.join(snippet['text'] for snippet in transcript)

In [4]:
# read the table of videos (video_id, datetime, title) to fetch transcripts for
df = pl.read_parquet(source='video-ids.parquet')
# preview the first five rows
print(df.head(5))

shape: (5, 3)
┌─────────────┬──────────────────────┬───────────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                             │
│ ---         ┆ ---                  ┆ ---                               │
│ str         ┆ str                  ┆ str                               │
╞═════════════╪══════════════════════╪═══════════════════════════════════╡
│ XT0WblJZqSo ┆ 2024-07-04T20:00:09Z ┆ Sheldon Gets Fired | The Big Ban… │
│ P2A82K1sG48 ┆ 2024-07-04T12:00:21Z ┆ Nerds and Their Moms | The Big B… │
│ looMyVTPOn8 ┆ 2024-07-04T07:00:21Z ┆ Funny Moments from Seasons 11 an… │
│ j6YOTFV_w5M ┆ 2024-07-03T20:00:16Z ┆ Is Sheldon a Robot? | The Big Ba… │
│ UUTQluNYg5I ┆ 2024-07-03T17:00:19Z ┆ Professor Proton Picks Leonard O… │
└─────────────┴──────────────────────┴───────────────────────────────────┘


In [5]:
%%time
# transcript text per video, in the same row order as df
transcript_text_list = []
# video_id -> repr of the error, for every video whose captions could not be
# fetched; inspect this after the run instead of losing the failure reasons
transcript_errors = {}

# iterate over the id column directly rather than indexing df['video_id'][i]
for video_id in df['video_id']:
  # try to extract captions
  try:
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    transcript_text = extract_text(transcript)
  # if not available set as n/a; catch Exception (not a bare except) so that
  # KeyboardInterrupt can still stop this multi-minute loop
  except Exception as err:
    transcript_errors[video_id] = repr(err)
    transcript_text = "n/a"
  # append to list
  transcript_text_list.append(transcript_text)

CPU times: user 1min 31s, sys: 2.26 s, total: 1min 33s
Wall time: 5min 52s


In [6]:
# attach the collected transcript strings to the frame as a "transcript" column
df = df.with_columns(
  pl.Series("transcript", transcript_text_list)
)
# preview the first five rows with the new column
print(df.head(5))

shape: (5, 4)
┌─────────────┬──────────────────────┬──────────────────────────────┬──────────────────────────────┐
│ video_id    ┆ datetime             ┆ title                        ┆ transcript                   │
│ ---         ┆ ---                  ┆ ---                          ┆ ---                          │
│ str         ┆ str                  ┆ str                          ┆ str                          │
╞═════════════╪══════════════════════╪══════════════════════════════╪══════════════════════════════╡
│ XT0WblJZqSo ┆ 2024-07-04T20:00:09Z ┆ Sheldon Gets Fired | The Big ┆ Here Comes our new boss be   │
│             ┆                      ┆ Ban…                         ┆ polit…                       │
│ P2A82K1sG48 ┆ 2024-07-04T12:00:21Z ┆ Nerds and Their Moms | The   ┆ you guys need me to call     │
│             ┆                      ┆ Big B…                       ┆ someone…                     │
│ looMyVTPOn8 ┆ 2024-07-04T07:00:21Z ┆ Funny Moments from Seasons   ┆ hello L

In [7]:
# (number of rows, number of columns) of the frame including the transcript column
(df.height, df.width)

(510, 4)

In [8]:
# persist the frame with transcripts, once as Parquet and once as CSV
df.write_parquet(file='video-transcripts.parquet')
df.write_csv(file='video-transcripts.csv')