# YouTube Transcript Search

This function uses a list of dictionaries generated by youtube_transcript_api and a list of URLs generated by pytube. It looks for matches that are spread across multiple consecutive subtitle segments, and strips characters that might cause false negatives due to YouTube's auto-generated captions. It accepts YouTube video and playlist URLs in the following formats:

    https://www.youtube.com/watch?v=aZbHd4suAnQ
    https://youtu.be/aZbHd4suAnQ
    https://www.youtube.com/playlist?list=PLFPUGjQjckXHbmXRXAw2wMRXLlYEf2O-N
    https://www.youtube.com/watch?v=aZbHd4suAnQ&list=PLFPUGjQjckXHbmXRXAw2wMRXLlYEf2O-N

Additional URL parameters following the video ID are acceptable, e.g.:

    https://youtu.be/aZbHd4suAnQ?si=vC5e2YMsnqavO8_K
    https://www.youtube.com/watch?v=aZbHd4suAnQ&t=14&feature=youtu.be

Usage: run youtube_search() and enter inputs for video/playlist and search query.

In [1]:
import re
# pip install youtube-transcript-api
# pip install pytube
# pip install inflect
from youtube_transcript_api import YouTubeTranscriptApi
from pytube import Playlist
from pytube import YouTube
import inflect

In [2]:
# Punctuation removed from both the query and the captions so that differences
# in auto-generated punctuation do not cause false negatives.
_PUNCTUATION_TABLE = str.maketrans('', '', "'\",.?!@#$%^&*()-_=+;:")


def _normalize(text):
    """Lower-case text, strip punctuation and collapse all whitespace runs."""
    # str.translate returns a new string; the result must be kept.
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    # split() with no argument also treats '\xa0' and '\n' as whitespace, so
    # words on separate caption lines stay separated by a single space.
    return ' '.join(cleaned.split())


def _extract_video_id(url):
    """Return the video ID found in a watch/youtu.be URL, or None."""
    # Requiring 'youtu.be/' (with the slash) keeps '&feature=youtu.be' from
    # being mistaken for a short link; the character class stops the ID at
    # the first '?' or '&' so trailing parameters are ignored.
    found = re.search(r'(?:youtu\.be/|[?&]v=)([A-Za-z0-9_-]+)', url)
    return found.group(1) if found else None


def _extract_playlist_id(url):
    """Return the playlist ID from a 'list=' URL parameter, or None."""
    found = re.search(r'[?&]list=([A-Za-z0-9_-]+)', url)
    return found.group(1) if found else None


def _find_matches(transcript, query):
    """Return the start times of the segments in which a match begins.

    The normalized segments are joined with single spaces and searched as one
    string, so a match may begin in one segment and finish in a later one.
    Each segment is reported at most once, in transcript order.
    """
    offsets = []  # offset of each kept segment inside the joined text
    starts = []   # 'start' value of each kept segment
    pieces = []
    position = 0
    for segment in transcript:
        text = _normalize(segment.get('text') or '')
        if not text:
            # Segments that are empty after normalization would otherwise
            # introduce a double space and block cross-segment matches.
            continue
        offsets.append(position)
        starts.append(segment.get('start'))
        pieces.append(text)
        position += len(text) + 1  # +1 for the joining space
    haystack = ' '.join(pieces)

    hits = []
    current = 0
    reported = -1
    found = haystack.find(query)
    while found != -1:
        # Match positions only increase, so the segment cursor only advances.
        while current + 1 < len(offsets) and offsets[current + 1] <= found:
            current += 1
        if current != reported:
            hits.append(starts[current])
            reported = current
        found = haystack.find(query, found + 1)
    return hits


def youtube_search(vid=None, query=None):
    """Search the transcripts of a YouTube video or playlist for a phrase.

    Matching is a case-insensitive substring search that ignores punctuation
    and may span several consecutive subtitle segments. One line is printed
    per match, linking to the segment in which the match begins.

    Args:
        vid: Video or playlist URL. Prompted for interactively when None.
        query: Search term. Prompted for interactively when None.

    Returns:
        An error message string when the URL is not a recognizable YouTube
        video/playlist URL or the search term is empty after normalization;
        otherwise None (results are printed).
    """
    if vid is None:
        vid = input('Please Enter Youtube URL (Video or Playlist):')
    vid = vid.strip()

    playlist_id = _extract_playlist_id(vid)
    video_id = _extract_video_id(vid)
    if playlist_id is None and video_id is None:
        return 'Error: Enter valid Youtube video/playlist URL.'

    if playlist_id is not None:
        # A URL carrying a playlist searches every video in that playlist.
        playlist_url = 'https://www.youtube.com/playlist?list=' + playlist_id
        candidates = (_extract_video_id(u) for u in Playlist(playlist_url).video_urls)
        video_ids = [v for v in candidates if v]
    else:
        video_ids = [video_id]

    if query is None:
        query = input('Please enter search term:')
    query = _normalize(query)
    if not query:
        # An empty term would match every segment of every transcript.
        return 'Error: Enter a non-empty search term.'

    matches = []
    for current_id in video_ids:
        transcript = YouTubeTranscriptApi.get_transcript(current_id, preserve_formatting=False)
        matches.extend((current_id, start) for start in _find_matches(transcript, query))

    if not matches:
        print('No matches found.')
        return None

    titles = {}  # fetch each title once instead of once per match
    for current_id, start in matches:
        if current_id not in titles:
            titles[current_id] = YouTube('http://youtube.com/watch?v=' + current_id).title
        print(
            f'Match found for video: {titles[current_id]} at {start} seconds: '
            f'http://youtube.com/watch?v={current_id}&t={int(float(start))}s'
        )
    return None

In [3]:
# Run the interactive search; it prompts for a URL and then a search term.
youtube_search()

Please Enter Youtube URL (Video or Playlist): https://www.youtube.com/watch?v=enx8buKYnMw
Please enter search term: kitchen


Match found for video: The History of Barbecue at 3.27 seconds: http://youtube.com/watch?v=enx8buKYnMw&t=3s
Match found for video: The History of Barbecue at 24.801 seconds: http://youtube.com/watch?v=enx8buKYnMw&t=24s
Match found for video: The History of Barbecue at 189.805 seconds: http://youtube.com/watch?v=enx8buKYnMw&t=189s
Match found for video: The History of Barbecue at 1195.792 seconds: http://youtube.com/watch?v=enx8buKYnMw&t=1195s
