In [129]:
# Install/upgrade the third-party packages imported further down
# (-U upgrades to the newest release, -qq silences pip's output).
# NOTE(review): no versions are pinned, so a fresh run may pull releases whose
# APIs differ from the ones this notebook was recorded with.
!pip install -Uqq youtube-transcript-api
!pip install -Uqq pytube
!pip3 install -Uqq thirdai[neural_db]

In [130]:
from thirdai import licensing, neural_db as ndb
import warnings
import os
# Silence every Python warning for the rest of the kernel session.
warnings.filterwarnings('ignore')
# Clear any licence state left over from an earlier run of this cell
# (presumably makes re-activation idempotent — confirm against the ThirdAI docs).
licensing.deactivate()
# Prefer a key supplied through the THIRDAI_KEY environment variable so the
# credential does not have to live in the notebook; fall back to the key that
# was committed here so existing runs keep working.
# NOTE(review): the fallback literal is still a hard-coded credential.
licensing.activate(os.environ.get("THIRDAI_KEY") or "1FB7DD-CAC3EC-832A67-84208D-C4E39E-V3")

In [131]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
from pytube import Playlist
import re

In [132]:
def get_video_links_from_playlist(playlist_url):
    """Collect the watch URL of every video in a YouTube playlist.

    Args:
        playlist_url: URL handed to ``pytube.Playlist``.

    Returns:
        list: the ``watch_url`` attribute of each entry in ``Playlist.videos``,
        in the order the playlist yields them.
    """
    watch_urls = []
    for playlist_video in Playlist(playlist_url).videos:
        watch_urls.append(playlist_video.watch_url)
    return watch_urls

In [133]:
def extract_video_id(video_url):
    """Extract the 11-character YouTube video ID from a URL.

    Recognised forms include ``watch?v=<id>``, ``youtu.be/<id>``,
    ``embed/<id>``, ``/v/<id>``, ``/e/<id>`` and the legacy
    ``youtube.com/user/...#.../<id>`` layout.

    Args:
        video_url: URL (or any string) to search for a video ID.

    Returns:
        str | None: the video ID, or ``None`` when no ID is found or the
        input is not a non-empty string.
    """
    # Nothing to search in None / empty / non-string input.
    if not isinstance(video_url, str) or not video_url:
        return None

    # The dots in the host names are escaped so that they match a literal '.'
    # only; unescaped, "youtu.be" would also accept e.g. "youtuXbe".
    pattern = (
        r"(?:v=|v/|embed/|youtu\.be/|/v/|/e/|watch\?v="
        r"|youtube\.com/user/[^#]*#(?:[^/]*?/)*)"
        r"(?P<video_id>[\w-]{11})"
    )

    match = re.search(pattern, video_url)
    return match.group("video_id") if match else None

In [134]:
def create_csv(transcript_lst):
    """Flatten per-video transcript chunks into a single DataFrame.

    Args:
        transcript_lst: iterable of per-video lists; each inner item is a dict
            with the keys ``text``, ``start``, ``duration``,
            ``video_serial_number`` and ``video_id``.

    Returns:
        pandas.DataFrame: one row per chunk with an added ``end_time`` column
        (``start + duration``), columns ordered as text, start, duration,
        end_time, video_serial_number, video_id.
    """
    # Concatenate the per-video lists into one flat list of row dicts.
    chunk_rows = []
    for video_chunks in transcript_lst:
        chunk_rows.extend(video_chunks)

    column_order = ['text', 'start', 'duration', 'end_time', 'video_serial_number', 'video_id']
    chunk_frame = pd.DataFrame(chunk_rows)
    chunk_frame['end_time'] = chunk_frame['start'] + chunk_frame['duration']
    return chunk_frame[column_order]

In [135]:
def _chunk_transcript(video_transcript, video_serial_number, video_id, chunk_size=8):
    """Merge consecutive caption segments into chunks of up to ``chunk_size`` segments."""
    chunks = []
    for first in range(0, len(video_transcript), chunk_size):
        segments = video_transcript[first:first + chunk_size]
        chunks.append({
            # Each segment's text is followed by one space, so the chunk text
            # ends with a trailing space (same as the original concatenation).
            'text': ''.join(segment['text'] + ' ' for segment in segments),
            'start': segments[0]['start'],
            # NOTE(review): this is the plain sum of the segment durations; the
            # recorded output shows chunks whose end_time exceeds the next
            # chunk's start, so caption segments appear to overlap in time and
            # the sum may overstate the real span — confirm before relying on it.
            'duration': sum(segment['duration'] for segment in segments),
            'video_serial_number': video_serial_number,
            'video_id': video_id,
        })
    return chunks


def create_data(playlist_link, chunk_size=8):
    """Build a DataFrame of transcript chunks for every video in a playlist.

    Args:
        playlist_link: URL of the YouTube playlist.
        chunk_size: number of consecutive caption segments merged into one
            row (default 8, the value previously hard-coded).

    Returns:
        pandas.DataFrame | int: the frame produced by ``create_csv`` with
        newlines in ``text`` replaced by spaces, or ``0`` when no transcript
        could be fetched (kept for backward compatibility with callers).

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    # create individual video id list
    video_id_lst = [
        extract_video_id(video_link)
        for video_link in get_video_links_from_playlist(playlist_link)
    ]

    # get transcripts for all video_ids; serial numbers are 1-based playlist positions
    transcript_lst = []
    for video_serial, video_id in enumerate(video_id_lst, start=1):
        if video_id is None:
            # No ID could be parsed from the link, so there is nothing to fetch.
            continue
        try:
            video_transcript = YouTubeTranscriptApi.get_transcript(video_id)
        except Exception as error:
            # Best effort: a video without an accessible transcript is skipped,
            # but say so instead of hiding the failure. Catching Exception
            # (not a bare except) lets KeyboardInterrupt stop the loop.
            print(f"Skipping video {video_serial} ({video_id}): {type(error).__name__}")
            continue

        chunks = _chunk_transcript(video_transcript, video_serial, video_id, chunk_size)
        if chunks:
            transcript_lst.append(chunks)

    if not transcript_lst:
        return 0

    # make dataframe from transcript list
    csv_file = create_csv(transcript_lst)
    # Literal newline replacement: the previous pattern r'\n' only matched a
    # newline when pandas treated it as a regex, which is no longer the default.
    csv_file['text'] = csv_file['text'].str.replace('\n', ' ', regex=False)
    return csv_file

In [136]:
# Load the pre-trained NeuralDB checkpoint from the attached Kaggle dataset.
trained_db = ndb.NeuralDB.from_checkpoint(
    "/kaggle/input/model-db-third-ai/trained_ndb/kaggle/working/trained_ndb.db",
    # The callback receives a fraction between 0 and 1 (the recorded run went
    # from 0.1666... up to 1.0), so format it as a percentage rather than
    # printing the raw fraction followed by '%'.
    on_progress=lambda fraction: print(f"{fraction:.0%} done with loading."))

0.16666666666666666% done with loading.
0.3333333333333333% done with loading.
0.5% done with loading.
0.6666666666666666% done with loading.
0.8333333333333334% done with loading.
1.0% done with loading.


In [137]:
# Playlist whose video transcripts are fetched, chunked and later indexed.
playlist_url = 'https://www.youtube.com/playlist?list=PLkDaE6sCZn6FNC6YRfRQc_FbeQrF8BwGI'
# Alternative playlist kept for reference; uncomment to use it instead.
# playlist_url = 'https://www.youtube.com/watch?v=P127jhj-8-Y&list=PLoROMvodv4rNiJRchCzutFw5ItR_Z27CM' #stanford transformer united
# create_data returns a DataFrame of transcript chunks, or 0 when no transcript
# could be fetched — later cells assume the DataFrame case.
test_csv_file = create_data(playlist_url)
# Bare last expression so the notebook renders the frame.
test_csv_file

Unnamed: 0,text,start,duration,end_time,video_serial_number,video_id
0,welcome to machine learning what is machine le...,3.920,39.399,43.319,1,vStJoetOxJg
1,software has figured out how to rank web pages...,23.699,34.380,58.079,1,vStJoetOxJg
2,learning or if you've just finished watching a...,41.399,33.241,74.640,1,vStJoetOxJg
3,voice to text on your phone to write a text me...,57.600,37.139,94.739,1,vStJoetOxJg
4,congratulations you've won a million dollars w...,76.080,41.400,117.480,1,vStJoetOxJg
...,...,...,...,...,...,...
868,or w dot product X plus b and it turns out tha...,433.380,49.459,482.839,40,jhrrw8Iuus0
869,regularization term which is Lambda over 2m ti...,458.039,49.139,507.178,40,jhrrw8Iuus0
870,over here and finally remember that W X plus b...,482.819,40.401,523.220,40,jhrrw8Iuus0
871,regularized linear regression using this you r...,507.180,35.519,542.699,40,jhrrw8Iuus0


In [138]:
# save csv file
# Expose the row index as an explicit column; it is used as the document id
# column when the CSV is loaded into NeuralDB.
test_csv_file['index_col'] = test_csv_file.index
# Store the timing columns as strings. They are selected by name and replaced
# as whole columns: writing strings into the existing float columns through
# .iloc is an incompatible-dtype assignment that pandas deprecates, and the
# positional slice 1:4 silently depended on the column order.
time_columns = ['start', 'duration', 'end_time']
test_csv_file[time_columns] = test_csv_file[time_columns].astype(str)
# Relative path: the file lands in the kernel's current working directory.
test_csv_file.to_csv('test_file.csv')

In [139]:
def _csv_document(csv_path):
    """Wrap one CSV file as an ``ndb.CSV`` document.

    Rows are identified by ``index_col``; ``text`` is both the strong column
    and the reference column, and no weak columns are used.
    """
    return ndb.CSV(
        path=csv_path,
        id_column="index_col",
        strong_columns=["text"],
        weak_columns=[],
        reference_columns=["text"],
        save_extra_info=True,
    )


csv_files = ['/kaggle/working/test_file.csv']

# Build one NeuralDB document per CSV file.
insertable_docs = []
for file in csv_files:
    csv_doc = _csv_document(file)
    insertable_docs.append(csv_doc)
insertable_docs

[<thirdai.neural_db.documents.CSV at 0x7c84ec0e1a20>]

In [140]:
# Add the CSV-backed documents to the loaded NeuralDB and keep whatever
# insert() returns in source_ids. train=False is passed — presumably so the
# checkpointed model is used as-is rather than fine-tuned on the new rows;
# confirm against the NeuralDB documentation.
source_ids = trained_db.insert(insertable_docs, train=False)

loaded data | source 'Documents:
test_file.csv' | vectors 873 | batches 1 | time 0s | complete



In [164]:
# Query the index and print a timestamped YouTube link for each of the top hits.
# NOTE(review): this search-and-print logic is repeated in several cells of the
# notebook with only the query changed; a shared helper taking the query would
# remove the duplication.
search_results = trained_db.search(
    query="What is overfitting?",
    top_k=5,
    on_error=lambda error_msg: print(f"Error! {error_msg}"))

for rank, result in enumerate(search_results, start=1):
    # index_col was written from the DataFrame's own index, so it locates the
    # matching row of test_csv_file by position.
    result_index = result.metadata['index_col']
    result_start_time = result.metadata['start']
    result_video_id = test_csv_file.iloc[result_index]['video_id']

    # The t= query parameter is given the chunk's start time truncated to whole seconds.
    http_link = f'https://www.youtube.com/watch?v={result_video_id}&t={int(result_start_time)}'
    print(f'Video Link {rank}', http_link)

Video Link 1 https://www.youtube.com/watch?v=1kgcON0Eauc&t=441
Video Link 2 https://www.youtube.com/watch?v=tHDDbqYfflM&t=302
Video Link 3 https://www.youtube.com/watch?v=RGL_XUjPkGo&t=92
Video Link 4 https://www.youtube.com/watch?v=6dTL76DWYQU&t=42
Video Link 5 https://www.youtube.com/watch?v=ecOdZlY9jsQ&t=170


In [165]:
# Query the index and print a timestamped YouTube link for each of the top hits.
# NOTE(review): this search-and-print logic is repeated in several cells of the
# notebook with only the query changed; a shared helper taking the query would
# remove the duplication.
search_results = trained_db.search(
    query="What is regularization?",
    top_k=5,
    on_error=lambda error_msg: print(f"Error! {error_msg}"))

for rank, result in enumerate(search_results, start=1):
    # index_col was written from the DataFrame's own index, so it locates the
    # matching row of test_csv_file by position.
    result_index = result.metadata['index_col']
    result_start_time = result.metadata['start']
    result_video_id = test_csv_file.iloc[result_index]['video_id']

    # The t= query parameter is given the chunk's start time truncated to whole seconds.
    http_link = f'https://www.youtube.com/watch?v={result_video_id}&t={int(result_start_time)}'
    print(f'Video Link {rank}', http_link)

Video Link 1 https://www.youtube.com/watch?v=NIiZZY7nlfU&t=154
Video Link 2 https://www.youtube.com/watch?v=tHDDbqYfflM&t=52
Video Link 3 https://www.youtube.com/watch?v=L5INhX5cbWU&t=165
Video Link 4 https://www.youtube.com/watch?v=WtlvKq_zxPI&t=181
Video Link 5 https://www.youtube.com/watch?v=6dTL76DWYQU&t=82


In [166]:
# Query the index and print a timestamped YouTube link for each of the top hits.
# NOTE(review): this search-and-print logic is repeated in several cells of the
# notebook with only the query changed; a shared helper taking the query would
# remove the duplication.
search_results = trained_db.search(
    query="What is gradient descent?",
    top_k=5,
    on_error=lambda error_msg: print(f"Error! {error_msg}"))

for rank, result in enumerate(search_results, start=1):
    # index_col was written from the DataFrame's own index, so it locates the
    # matching row of test_csv_file by position.
    result_index = result.metadata['index_col']
    result_start_time = result.metadata['start']
    result_video_id = test_csv_file.iloc[result_index]['video_id']

    # The t= query parameter is given the chunk's start time truncated to whole seconds.
    http_link = f'https://www.youtube.com/watch?v={result_video_id}&t={int(result_start_time)}'
    print(f'Video Link {rank}', http_link)

Video Link 1 https://www.youtube.com/watch?v=0az8RjxLLPQ&t=637
Video Link 2 https://www.youtube.com/watch?v=RGL_XUjPkGo&t=92
Video Link 3 https://www.youtube.com/watch?v=XtlwSmJfUs4&t=18
Video Link 4 https://www.youtube.com/watch?v=YVtP5UGdgXg&t=299
Video Link 5 https://www.youtube.com/watch?v=jhrrw8Iuus0&t=343


In [169]:
# Query the index and print a timestamped YouTube link for each of the top hits.
# NOTE(review): this search-and-print logic is repeated in several cells of the
# notebook with only the query changed; a shared helper taking the query would
# remove the duplication.
search_results = trained_db.search(
    query="Support Vector Machines?",
    top_k=5,
    on_error=lambda error_msg: print(f"Error! {error_msg}"))

for rank, result in enumerate(search_results, start=1):
    # index_col was written from the DataFrame's own index, so it locates the
    # matching row of test_csv_file by position.
    result_index = result.metadata['index_col']
    result_start_time = result.metadata['start']
    result_video_id = test_csv_file.iloc[result_index]['video_id']

    # The t= query parameter is given the chunk's start time truncated to whole seconds.
    http_link = f'https://www.youtube.com/watch?v={result_video_id}&t={int(result_start_time)}'
    print(f'Video Link {rank}', http_link)

Video Link 1 https://www.youtube.com/watch?v=L5INhX5cbWU&t=165
Video Link 2 https://www.youtube.com/watch?v=dLc-lfEEYss&t=403
Video Link 3 https://www.youtube.com/watch?v=gG_wI_uGfIE&t=233
Video Link 4 https://www.youtube.com/watch?v=YjpCQof9tI8&t=455
Video Link 5 https://www.youtube.com/watch?v=L5INhX5cbWU&t=254
