# Obtaining YouTube Transcripts From My Channel For Building A Search Engine

## Steps:
- [x] Get list of video IDs
- [x] Use the YouTube Transcript API to get the transcript for each of those videos (hopefully without getting rate limited)
- [x] Make a dataframe from the transcripts with the following columns:
    - Video url
    - Text
    - Begin timestamp
    - End timestamp

## Step 1: Get list of video IDs

In [None]:
import urllib.request
import json

In [None]:
def get_all_video_in_channel(channel_id, api_key='your google api key goes here'):
    """Return the IDs of all videos the YouTube Data API search lists for a channel.

    Pages through the v3 ``search`` endpoint (25 results per request, ordered
    by date) until a response no longer carries a ``nextPageToken``.

    Args:
        channel_id: ID of the channel whose videos should be listed.
        api_key: Google API key sent with every request. The default is only
            a placeholder and must be replaced with a real key.

    Returns:
        A list of video ID strings in the order the API returned them.
        Search results that are not videos are skipped.

    Raises:
        Whatever ``urllib.request.urlopen`` raises when a request fails
        (e.g. ``urllib.error.URLError``); errors are not swallowed.
    """
    from urllib.parse import urlencode

    base_search_url = 'https://www.googleapis.com/youtube/v3/search?'
    params = {
        'key': api_key,
        'channelId': channel_id,
        'part': 'snippet,id',
        'order': 'date',
        'maxResults': 25,
    }

    video_ids = []
    while True:
        # urlencode escapes spaces and reserved characters in the key, channel
        # ID and page token; safe=',' keeps the literal 'snippet,id' form.
        url = base_search_url + urlencode(params, safe=',')
        # The context manager closes the HTTP response even if parsing fails.
        with urllib.request.urlopen(url) as response:
            resp = json.load(response)

        for item in resp.get('items', []):
            if item['id']['kind'] == "youtube#video":
                video_ids.append(item['id']['videoId'])

        # A missing token means this was the last page.
        next_page_token = resp.get('nextPageToken')
        if not next_page_token:
            break
        params['pageToken'] = next_page_token
    return video_ids

In [None]:
# Collect the IDs of every video on the channel (the channel ID is hard-coded here).
video_ids = get_all_video_in_channel('UC5opyqV7wblMILfowGlPPnA')

In [None]:
# Show the collected IDs as the cell output for a quick sanity check.
video_ids

## Step 2: Get Video Transcripts

In [None]:
!pip install youtube_transcript_api

In [None]:
from youtube_transcript_api import YouTubeTranscriptApi

In [None]:
# Accumulator for [video_id, transcript] pairs; filled by the fetch loop in the next cell.
transcripts = []

In [None]:
# Fetch the transcript of every video and keep it as a [video_id, transcript]
# pair. Fetching is best-effort: a video whose transcript cannot be retrieved
# is reported and skipped so one failure does not abort the whole run.
# NOTE(review): newer youtube_transcript_api releases appear to replace
# `get_transcript` with an instance-level fetch method — confirm against the
# installed version.
for video_id in video_ids:
    try:
        transcript = YouTubeTranscriptApi.get_transcript(video_id)
    except Exception as exc:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupts
        # still stop the loop; the exception type shows why the fetch failed.
        print(f"Couldn't get transcript for https://www.youtube.com/watch?v={video_id} ({type(exc).__name__})")
    else:
        transcripts.append([video_id, transcript])

## Step 3: Create Dataframe

In [None]:
import pandas as pd

In [None]:
# Start from an empty frame with the three output columns:
# caption text, video URL, and the caption's start time.
df = pd.DataFrame(columns=['text', 'video_link', 'start_time'])

In [None]:
# Show the frame as the cell output to check the column layout.
df

In [None]:
# Flatten the transcripts into one row per transcript entry:
# [entry text, video URL, entry start time].
# Rows are collected in a plain list and turned into a DataFrame once.
# Growing `df` with one `.loc` assignment per row gets slower as the frame
# grows, and re-running the cell would append duplicate rows.
# NOTE(review): the plan at the top of the notebook also lists an end
# timestamp, which is not stored here — confirm whether it is still wanted.
rows = []
for video_id, entries in transcripts:
    video_url = f"https://www.youtube.com/watch?v={video_id}"
    rows.extend([entry['text'], video_url, entry['start']] for entry in entries)
df = pd.DataFrame(rows, columns=['text', 'video_link', 'start_time'])

In [None]:
# Write the frame to video_transcripts.csv (path is relative to the working
# directory). No `index` argument is passed, so pandas' default applies and
# the row index is written as an unnamed first column.
df.to_csv('video_transcripts.csv')