In [157]:
!pip install -Uqq youtube-transcript-api
!pip install -Uqq pytube
!pip3 install -Uqq thirdai[neural_db]

In [158]:
from thirdai import licensing, neural_db as ndb
import warnings
# Suppress every Python warning for the rest of the session. Note that this
# also hides deprecation notices from pandas and the other libraries used below.
warnings.filterwarnings('ignore')
# Drop any licence that is already active before activating the key below
# (presumably so that re-running this cell is safe — TODO confirm).
licensing.deactivate()
# NOTE(review): the licence key is hard-coded in the notebook. Unless this is a
# public trial key, load it from an environment variable or a secret store
# instead of committing it.
licensing.activate("1FB7DD-CAC3EC-832A67-84208D-C4E39E-V3")

In [159]:
from youtube_transcript_api import YouTubeTranscriptApi
import pandas as pd
from pytube import Playlist
import re

In [160]:
def get_video_links_from_playlist(playlist_url):
    """Return the watch URL of every video in a playlist.

    Args:
        playlist_url: URL of the playlist, passed straight to ``Playlist``.

    Returns:
        A list with one ``watch_url`` per entry of ``Playlist(...).videos``,
        in the order the playlist yields them.
    """
    watch_urls = []
    for entry in Playlist(playlist_url).videos:
        watch_urls.append(entry.watch_url)
    return watch_urls

In [161]:
def extract_video_id(video_url):
    """Extract the 11-character YouTube video ID from a URL.

    Recognised forms include ``watch?v=<id>``, ``youtu.be/<id>``,
    ``embed/<id>``, ``/v/<id>``, ``/e/<id>`` and the legacy
    ``youtube.com/user/...#.../<id>`` layout.

    Args:
        video_url: The URL to inspect.

    Returns:
        The video ID as a string, or ``None`` when ``video_url`` is not a
        string or contains no recognisable ID.
    """
    # Non-string input (e.g. None) cannot contain an ID; return None rather
    # than letting re.search raise TypeError.
    if not isinstance(video_url, str):
        return None

    # The dots in "youtu.be" and "youtube.com" are escaped so they match a
    # literal "." only; unescaped they would accept any character in that
    # position. The ID is 11 characters drawn from letters, digits, "_" and "-".
    pattern = (
        r"(?:v=|v\/|embed\/|youtu\.be\/|\/v\/|\/e\/|watch\?v="
        r"|youtube\.com\/user\/[^#]*#(?:[^\/]*?\/)*)"
        r"(?P<video_id>[\w-]{11})"
    )

    match = re.search(pattern, video_url)
    return match.group("video_id") if match else None

In [162]:
def create_csv(transcript_lst):
    """Flatten per-video transcript chunks into a single DataFrame.

    Args:
        transcript_lst: A list with one entry per video; each entry is a list
            of dicts carrying the keys ``text``, ``start``, ``duration``,
            ``video_serial_number`` and ``video_id``.

    Returns:
        A DataFrame with one row per chunk and the columns ``text``, ``start``,
        ``duration``, ``end_time``, ``video_serial_number``, ``video_id``,
        where ``end_time`` is ``start + duration``.
    """
    rows = []
    for video_chunks in transcript_lst:
        rows.extend(video_chunks)

    frame = pd.DataFrame(rows)
    frame['end_time'] = frame['start'] + frame['duration']

    column_order = ['text', 'start', 'duration', 'end_time', 'video_serial_number', 'video_id']
    return frame[column_order]

In [163]:
def _chunk_transcript(video_transcript, chunk_size):
    """Merge consecutive transcript segments into chunks of ``chunk_size`` segments."""
    chunks = []
    for first in range(0, len(video_transcript), chunk_size):
        group = video_transcript[first:first + chunk_size]
        chunks.append({
            # Every segment text is followed by a single space, including the last one.
            'text': ''.join(segment['text'] + ' ' for segment in group),
            'start': group[0]['start'],
            # Sum of the individual segment durations.
            'duration': sum(segment['duration'] for segment in group),
        })
    return chunks


def create_data(playlist_link, chunk_size=8):
    """Build a table of chunked transcripts for every video in a playlist.

    Each video's transcript is fetched with
    ``YouTubeTranscriptApi.get_transcript`` and consecutive segments are merged
    ``chunk_size`` at a time. Videos whose transcript cannot be fetched or
    processed are skipped, with a one-line notice printed for each.

    Args:
        playlist_link: URL of the YouTube playlist.
        chunk_size: Number of consecutive transcript segments merged into one
            row. Defaults to 8.

    Returns:
        The DataFrame produced by ``create_csv`` with newlines in ``text``
        replaced by spaces, or the integer ``0`` when no transcript rows could
        be collected (kept for backward compatibility with existing callers).

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    video_id_lst = [
        extract_video_id(video_link)
        for video_link in get_video_links_from_playlist(playlist_link)
    ]

    transcript_lst = []
    # Serial numbers are the 1-based position in the playlist, so they stay
    # aligned with the playlist even when some videos are skipped.
    for video_serial, video_id in enumerate(video_id_lst, start=1):
        try:
            video_transcript = YouTubeTranscriptApi.get_transcript(video_id)
            new_transcript = [
                {**chunk, 'video_serial_number': video_serial, 'video_id': video_id}
                for chunk in _chunk_transcript(video_transcript, chunk_size)
            ]
        except Exception as err:
            # Best effort: one bad video must not abort the whole playlist, but
            # the reason is reported instead of being swallowed silently.
            # `except Exception` (not a bare except) lets KeyboardInterrupt through.
            # NOTE(review): newer youtube-transcript-api releases may no longer
            # expose `get_transcript`; that would surface here — TODO confirm.
            print(f"Skipping video {video_id!r} (#{video_serial}): {type(err).__name__}")
            continue
        transcript_lst.append(new_transcript)

    # `any` is False both when nothing was fetched and when every fetched
    # transcript was empty; create_csv cannot build its columns from zero rows.
    if not any(transcript_lst):
        return 0

    transcript_df = create_csv(transcript_lst)
    # Literal (non-regex) replacement of newline characters. The previous
    # r'\n' pattern only matched newlines while pandas treated it as a regex,
    # which is no longer the default since pandas 2.0.
    transcript_df['text'] = transcript_df['text'].str.replace('\n', ' ', regex=False)
    return transcript_df

In [2]:
# YouTube playlist URLs to ingest. Each one is passed to create_data() in the
# collection loop further down; add or remove entries here to change the corpus.
list_playlist = ['https://www.youtube.com/playlist?list=PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk', 
                 'https://www.youtube.com/playlist?list=PLtBw6njQRU-rwp5__7C0oIVt26ZgjG9NI',
                 'https://www.youtube.com/playlist?list=PLl8OlHZGYOQ7bkVbuRthEsaLr7bONzbXS',
                 'https://www.youtube.com/playlist?list=PLoROMvodv4rOSH4v6133s9LFPRHjEmbmJ',
                 'https://www.youtube.com/playlist?list=PLbMVogVj5nJSlpmy0ni_5-RgbseafOViy',
                 'https://www.youtube.com/playlist?list=PLoROMvodv4rNiJRchCzutFw5ItR_Z27CM',
                 'https://www.youtube.com/playlist?list=PLfYUBJiXbdtSvpQjSnJJ_PmDQB_VyT5iU']

In [3]:
list_playlist

['https://www.youtube.com/playlist?list=PLC1qU-LWwrF64f4QKQT-Vg5Wr4qEE1Zxk',
 'https://www.youtube.com/playlist?list=PLtBw6njQRU-rwp5__7C0oIVt26ZgjG9NI',
 'https://www.youtube.com/playlist?list=PLl8OlHZGYOQ7bkVbuRthEsaLr7bONzbXS',
 'https://www.youtube.com/playlist?list=PLoROMvodv4rOSH4v6133s9LFPRHjEmbmJ',
 'https://www.youtube.com/playlist?list=PLbMVogVj5nJSlpmy0ni_5-RgbseafOViy',
 'https://www.youtube.com/playlist?list=PLoROMvodv4rNiJRchCzutFw5ItR_Z27CM',
 'https://www.youtube.com/playlist?list=PLfYUBJiXbdtSvpQjSnJJ_PmDQB_VyT5iU']

In [119]:
from tqdm import tqdm

In [126]:
# Build one combined transcript table across all playlists.
playlist_frames = []
for playlist_url in tqdm(list_playlist):
    csv_file = create_data(playlist_url)
    # create_data returns the integer 0 instead of a DataFrame when a playlist
    # yields no transcripts; passing that to pd.concat would raise a TypeError.
    if isinstance(csv_file, pd.DataFrame):
        playlist_frames.append(csv_file)

if not playlist_frames:
    raise RuntimeError("No transcripts could be fetched for any playlist in list_playlist")

# ignore_index=True renumbers the rows 0..n-1 across playlists.
full_csv_files = pd.concat(playlist_frames, ignore_index=True)
full_csv_files

100%|██████████| 7/7 [01:57<00:00, 16.83s/it]


Unnamed: 0,text,start,duration,end_time,video_serial_number
0,"Okay, so welcome to lecture two of CS231N. On ...",6.971,16.368,23.339,2
1,exactly how some of these learning algorithms ...,23.339,16.278,39.617,2
2,"So, today we'll see our first learning algorit...",39.617,14.880,54.497,2
3,Which means that there are several hundred of ...,55.357,19.032,74.389,2
4,should really go to Piazza. You'll probably ge...,74.389,21.069,95.458,2
...,...,...,...,...,...
24737,"Okay, I don't know how to pronounce the name ...",5567.840,45.920,5613.760,8
24738,"Could be a new language, it could be a new al...",5622.320,47.360,5669.680,8
24739,a whole bunch of tools and skills and ideas wh...,5675.600,44.400,5720.000,8
24740,do a lot of coding related stuff. So I create...,5731.280,49.840,5781.120,8


In [127]:
len(full_csv_files)

24742

In [131]:
# Save the combined transcript table as a CSV file.
# `index_col` is a copy of the row index stored as a regular column. The timing
# columns are converted to strings by name: the earlier positional
# `iloc[:, 1:4] = ...astype(str)` assignment depended on column order and on
# dtype-upcasting setitem, which pandas has deprecated.
full_csv_files = full_csv_files.assign(index_col=full_csv_files.index).astype(
    {'start': str, 'duration': str, 'end_time': str}
)
full_csv_files.to_csv('file.csv')

In [134]:
# Create a new NeuralDB instance. Any username can be used for `user_id`; per
# the original author's note, it is meant to allow pushing models to the model
# hub in the future.
db = ndb.NeuralDB(user_id="team_iisc")

In [136]:
# Wrap each transcript CSV in an ndb.CSV document so it can be inserted into NeuralDB.
insertable_docs = []
# Use the same relative path the save step wrote ('file.csv'). The previous
# absolute '/kaggle/working/file.csv' only resolved when the working directory
# happened to be /kaggle/working.
csv_files = ['file.csv']

for file in csv_files:
    csv_doc = ndb.CSV(
        path=file,
        id_column="index_col",       # the index copy added before the CSV was saved
        strong_columns=["text"],
        weak_columns=[],             # "start" and "duration" were tried here and left out
        reference_columns=["text"],
        # Presumably keeps the remaining CSV columns available as result
        # metadata (the search cell reads `video_serial_number` and `start`) — TODO confirm.
        save_extra_info=True)
    insertable_docs.append(csv_doc)
insertable_docs

[<thirdai.neural_db.documents.CSV at 0x7f568993eaa0>]

In [138]:
# Insert the documents into the database. With train=True the call also runs
# training on them, as the per-epoch log printed below shows.
source_ids = db.insert(insertable_docs, train=True)

loaded data | source 'Documents:
file.csv' | vectors 24742 | batches 13 | time 0s | complete

train | epoch 0 | train_steps 13 | train_hash_precision@5=0.0142187  | train_batches 13 | time 51s

train | epoch 1 | train_steps 26 | train_hash_precision@5=0.106184  | train_batches 13 | time 40s

train | epoch 2 | train_steps 39 | train_hash_precision@5=0.189475  | train_batches 13 | time 40s

train | epoch 3 | train_steps 52 | train_hash_precision@5=0.349592  | train_batches 13 | time 40s

train | epoch 4 | train_steps 65 | train_hash_precision@5=0.528138  | train_batches 13 | time 40s

train | epoch 5 | train_steps 78 | train_hash_precision@5=0.60173  | train_batches 13 | time 40s

train | epoch 6 | train_steps 91 | train_hash_precision@5=0.740094  | train_batches 13 | time 40s

train | epoch 7 | train_steps 104 | train_hash_precision@5=0.912036  | train_batches 13 | time 40s

train | epoch 8 | train_steps 117 | train_hash_precision@5=0.91025  | train_batches 13 | time 40s

train | epoch 

In [140]:
# Query the trained database and print each hit together with the video it
# came from and the time at which the passage starts.
search_results = db.search(
    query="What is regularization?",
    top_k=10,
    # Any error message handed to the callback is printed rather than raised.
    on_error=lambda error_msg: print(f"Error! {error_msg}"))

for result in search_results:
    print(result.text)
    # Optional extra output, currently disabled:
#     print(result.context(radius=1))
#     print(result.source)
    # `metadata` exposes the extra CSV columns of the matched row (presumably
    # because the document was built with save_extra_info=True — TODO confirm).
    print('video_number = ',result.metadata['video_serial_number'])
    print('start_time =',result.metadata['start'])
    print('************')

here at stanford this paper by percy liang and students um which is to argue that really what dropout gives you is a strong regularizer that isn't a uniform regularizer like l2 that regularizes everything with an l2 last but can learn a feature dependent regularization and so that dropout has just emerged as in 
video_number =  5
start_time = 1950.72
************
just like before where the goal is to capture the difference between our input data and the reconstructed output and now for the vae we've introduced a second term to the loss what we call the regularization term often you'll maybe even see this referred to as a vae loss 
video_number =  4
start_time = 1136.52
************
to learn not to use the layers that it doesn't need. In addition, it kind of adds this interpretation to L2 regularization in the context of these neural networks, cause once you put L2 regularization, remember, on your, on the weights of your network, that's going to drive all the parameters towards zero. 


In [141]:
# Save the trained database as a checkpoint named "trained_ndb.db". Judging by
# the zip listing at the end of the notebook, this is a directory, not a single file.
db.save("trained_ndb.db")

'trained_ndb.db'

**TEST: reload the saved checkpoint**

In [101]:
# trained_db = ndb.NeuralDB(user_id="team_iisc") 

In [164]:
# Reload the saved checkpoint to confirm that it can be restored.
# `on_progress` receives a fraction between 0 and 1 (the logged values run from
# 0.1666... to 1.0), so it is formatted as a percentage instead of being printed
# raw in front of a "%" sign.
trained_db = ndb.NeuralDB.from_checkpoint(
    "trained_ndb.db",
    on_progress=lambda fraction: print(f"{fraction:.0%} done with loading."),
)

0.16666666666666666% done with loading.
0.3333333333333333% done with loading.
0.5% done with loading.
0.6666666666666666% done with loading.
0.8333333333333334% done with loading.
1.0% done with loading.


In [154]:
# Archive the checkpoint directory recursively so it can be downloaded as one file.
# Because an absolute path is given, entries are stored under kaggle/working/
# inside the archive (see the listing below).
!zip -r trained_ndb.zip /kaggle/working/trained_ndb.db

  adding: kaggle/working/trained_ndb.db/ (stored 0%)
  adding: kaggle/working/trained_ndb.db/documents.pkl (deflated 70%)
  adding: kaggle/working/trained_ndb.db/model.pkl (deflated 14%)
  adding: kaggle/working/trained_ndb.db/documents/ (stored 0%)
  adding: kaggle/working/trained_ndb.db/documents/0/ (stored 0%)
  adding: kaggle/working/trained_ndb.db/documents/0/file.csv (deflated 68%)
  adding: kaggle/working/trained_ndb.db/logger/ (stored 0%)
  adding: kaggle/working/trained_ndb.db/logger/in_memory/ (stored 0%)
  adding: kaggle/working/trained_ndb.db/logger.pkl (deflated 36%)
  adding: kaggle/working/trained_ndb.db/model/ (stored 0%)
