### Setup Notebook

In [12]:
import os
import pandas as pd
from math import ceil
from youtube_transcript_api import YouTubeTranscriptApi

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline

### Extract Transcript

In [2]:
# ID of the YouTube video whose transcript is processed in this notebook.
video_id = 'zjkBMFhNj_g'
# Fetch the transcript entries for that video.
# NOTE(review): get_transcript is presumably the classmethod API of older
# youtube_transcript_api releases — confirm it exists in the installed version.
srt = YouTubeTranscriptApi.get_transcript(video_id)
# Tabular form of the transcript; later cells read its 'text' and 'start' columns.
df_srt = pd.DataFrame(srt)

In [3]:
def parse_transcript(df):
    '''
    Concatenate the 'text' value of every row of df, in row order, into one string.

    Every piece is prefixed with a single space, so a non-empty result starts
    with a space. A frame without rows yields an empty string.
    '''
    pieces = (" " + row['text'] for _, row in df.iterrows())
    return "".join(pieces)

In [4]:
# Flatten the whole transcript into a single string.
yt_transcribed = parse_transcript(df_srt)
# Sanity check: character count, whitespace-separated word count, first 200 characters.
len(yt_transcribed), len(yt_transcribed.split()), yt_transcribed[:200]

(64353,
 12151,
 ' hi everyone so recently I gave a 30-minute talk on large language models just kind of like an intro talk um unfortunately that talk was not recorded but a lot of people came to me after the talk and ')

### Chunking | Create Input Blocks

In [5]:
def create_blocks(df, block_size=5, stride=1, max_duration=120):
    '''
    Use a sliding window of 'block_size' minutes with a stride of 'stride' minutes to generate text blocks.

    Rows of df whose 'start' value (seconds) lies inside the current window,
    both ends inclusive, are joined with parse_transcript into one block.
    The number of windows is capped so that at most 'max_duration' minutes are
    covered; lower it when processing power is limited.
    Default parameters allow videos of up to 2 hrs. to be included.

    Parameters:
        df: transcript DataFrame with 'text' and 'start' columns.
        block_size: window length in minutes.
        stride: step between consecutive window starts, in minutes.
        max_duration: maximum video length, in minutes, to cover.

    Returns:
        DataFrame with one row per non-empty window and the columns
        'Block' (1-based window number), 'text' and 'start_time' (smallest
        'start' inside the window). Windows that contain no transcript rows
        are skipped, so 'Block' numbers may have gaps. An empty df gives an
        empty DataFrame with the same columns.
    '''
    columns = ['Block', 'text', 'start_time']
    max_blocks = ceil(((max_duration - block_size) / stride) + 1)

    print("INFO: initiated block creation of video transcript")

    # Nothing to window over; max() of an empty column would raise.
    if df.empty:
        print("INFO: transcript is empty, no blocks generated")
        return pd.DataFrame(columns=columns)

    max_len = ceil(df['start'].max() / 60)
    print(f"INFO: video length {max_len} | block size {block_size} | stride {stride} | max blocks {max_blocks}")

    # Collect plain records and build the frame once instead of concatenating per iteration.
    records = []
    for i in range(max_blocks):
        start = i * stride
        stop = start + block_size
        df_block = df[(df['start'] >= 60 * start) & (df['start'] <= 60 * stop)]
        n_rows = df_block.shape[0]
        hit_block_limit = i + 1 == max_blocks
        hit_video_end = stop >= max_len

        # Progress line on every 5th window and on the final one.
        if (i + 1) % 5 == 0 or hit_block_limit or hit_video_end:
            print(f"INFO: generated block {i+1} | start {start} | stop {stop} | rows combined {n_rows}")

        # A window can be empty (e.g. a long silent stretch); skip it rather than fail on min().
        if n_rows:
            records.append({
                'Block': i + 1,
                'text': parse_transcript(df_block),
                'start_time': df_block['start'].min(),
            })

        if hit_video_end:
            print("INFO: reached end of video")
            break
        if hit_block_limit:
            print("INFO: reached max blocks limit")

    df_out = pd.DataFrame(records, columns=columns)
    print(f"INFO: original data {df.shape} | block data {df_out.shape}")
    return df_out

In [8]:
df_srt_chunks = create_blocks(df_srt)

# Per-block size metrics: number of characters and number of whitespace-separated words.
block_text = df_srt_chunks['text']
df_srt_chunks['text_length'] = block_text.str.len()
df_srt_chunks['word_count'] = block_text.str.split().str.len()
df_srt_chunks.head(2)

INFO: initiated block creation of video transcript
INFO: video length 60 | block size 5 | stride 1 | max blocks 116
INFO: generated block 5 | start 4 | stop 9 | rows combined 140
INFO: reached max blocks limit
INFO: generated block 10 | start 9 | stop 14 | rows combined 150
INFO: reached max blocks limit
INFO: generated block 15 | start 14 | stop 19 | rows combined 140
INFO: reached max blocks limit
INFO: generated block 20 | start 19 | stop 24 | rows combined 143
INFO: reached max blocks limit
INFO: generated block 25 | start 24 | stop 29 | rows combined 143
INFO: reached max blocks limit
INFO: generated block 30 | start 29 | stop 34 | rows combined 139
INFO: reached max blocks limit
INFO: generated block 35 | start 34 | stop 39 | rows combined 149
INFO: reached max blocks limit
INFO: generated block 40 | start 39 | stop 44 | rows combined 146
INFO: reached max blocks limit
INFO: generated block 45 | start 44 | stop 49 | rows combined 137
INFO: reached max blocks limit
INFO: generated

Unnamed: 0,Block,text,start_time,text_length,word_count
0,1,hi everyone so recently I gave a 30-minute ta...,0.16,5356,1015
1,2,biggest one now many people like this model s...,60.039,5378,1021


### Generate Clean Summary

In [10]:
# When running locally on Windows, use the lines below to change the Hugging Face model download location.
# The path is a raw string so the backslashes are not treated as escape sequences.
# NOTE(review): the Hugging Face libraries presumably read HF_HOME when they are first
# imported, so this cell may need to run before the transformers import — confirm.
hf_cache_dir = r"D:\.cache\huggingface"
os.environ['HF_HOME'] = hf_cache_dir

In [11]:
# Load the tokenizer and the seq2seq weights of the same summarisation checkpoint.
checkpoint = "facebook/bart-large-cnn"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [13]:
# Build the summarisation pipeline from the already-loaded `model` and `tokenizer`
# objects so the facebook/bart-large-cnn checkpoint is not loaded a second time.
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer)



In [26]:
# Whitespace-separated word count of the first block's text.
len(df_srt_chunks.loc[0, 'text'].split())

1015

In [34]:
# Summarise the first block, keeping only its first 950 words.
# NOTE(review): the 950-word cap presumably targets the model's input limit — confirm.
# A word cap does not bound the token count, so truncation=True lets the tokenizer
# cut any input that is still too long instead of failing inside the model.
article = " ".join(df_srt_chunks.loc[0, 'text'].split()[:950])
print(summarizer(article, do_sample=False, min_length=300, max_length=600, truncation=True))

[{'summary_text': "A large language model is just two files. The parameters are basically the weights or the parameters of this neural network that is the language model. Every one of those parameters is stored as 2 bytes and so therefore the parameters file here is 140 gigabytes and it's two bytes because this is a float 16 as the data type. This is a fully self-contained package this is everything that's necessary you don't need any connectivity to the internet or anything else you can take these two files you compile your C code you get a binary that you can point at the parameters and you can talk to this language model so for example you can send it text like for example write a poem about the company scale Ai and this model will start generating text and in this case it will follow the directions and give you a poems about scale AI. This talk was not actually in terms of the speed of this uh video here this was not running a 70 billion parameter model it was only running a 7 bill