In [1]:
from openai import OpenAI
# Shared client used by the LLM helper functions defined in this notebook.
# NOTE(review): no API key is passed here — presumably the client picks it up
# from the environment; confirm before running.
openai_client = OpenAI()

def llm(user_prompt, instructions=None, model ='gpt-4o-mini'):
    """Send a prompt to the model and return the text of its reply.

    Args:
        user_prompt: Content of the "user" message.
        instructions: Optional content for a leading "system" message; the
            system message is omitted when this is falsy (None or empty).
        model: Model name forwarded to the API call.

    Returns:
        The ``output_text`` attribute of the response object.
    """
    # The system message, when present, must come before the user message.
    system_part = [{"role": "system", "content": instructions}] if instructions else []
    conversation = system_part + [{"role": "user", "content": user_prompt}]

    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text
        

In [3]:
from youtube_transcript_api import YouTubeTranscriptApi

In [5]:
# ID of the YouTube video processed in this notebook; it is also used to name
# the on-disk transcript file and is inserted into the RAG prompt.
video_id = 'ph1PxZIkz1o'
yt_api=YouTubeTranscriptApi()
# Fetch the transcript for the video (presumably a network call — confirm).
transcript = yt_api.fetch(video_id)

In [14]:
import pickle
from pathlib import Path

# On-disk cache of the transcript, named after the video ID.
transcript_cache_path = Path(f'{video_id}.bin')

if transcript_cache_path.exists():
    # SECURITY NOTE: pickle.load can execute arbitrary code. Only load cache
    # files that were written by this notebook, never files from other sources.
    with transcript_cache_path.open('rb') as f_in:
        transcript = pickle.load(f_in)
else:
    # No cache yet (e.g. fresh checkout): persist the transcript fetched in the
    # previous cell so this cell no longer fails with FileNotFoundError and
    # later runs can reuse the saved copy.
    with transcript_cache_path.open('wb') as f_out:
        pickle.dump(transcript, f_out)

In [15]:
transcript[:10]

[FetchedTranscriptSnippet(text='So hi everyone. Uh today we are going to', start=0.0, duration=5.04),
 FetchedTranscriptSnippet(text='talk about our upcoming course. The', start=2.96, duration=3.52),
 FetchedTranscriptSnippet(text='upcoming course is called machine', start=5.04, duration=5.92),
 FetchedTranscriptSnippet(text='learning zoom camp. And um this is', start=6.48, duration=5.92),
 FetchedTranscriptSnippet(text='already I put the link in the', start=10.96, duration=3.599),
 FetchedTranscriptSnippet(text="description. So if you're watching um", start=12.4, duration=4.719),
 FetchedTranscriptSnippet(text="this video in recording or you're", start=14.559, duration=4.88),
 FetchedTranscriptSnippet(text='watching it live, you go here in the', start=17.119, duration=4.561),
 FetchedTranscriptSnippet(text='description after under this video and', start=19.439, duration=5.6),
 FetchedTranscriptSnippet(text='then you see a link course. uh click on', start=21.68, duration=6.24)]

In [20]:
def format_timestamp(seconds: float) -> str:
    """Render a duration in seconds as a clock-style string.

    Fractional seconds are truncated. Values of one hour or more are rendered
    as ``H:MM:SS``; anything shorter is rendered as ``M:SS``.
    """
    whole_seconds = int(seconds)
    total_minutes, secs = divmod(whole_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)

    if hours <= 0:
        return f"{minutes}:{secs:02}"
    return f"{hours}:{minutes:02}:{secs:02}"
def make_subtitles(transcript) :
    """Render a transcript as text, one "<timestamp> <text>" line per entry.

    Each entry must expose ``start`` and ``text`` attributes. Newlines inside
    an entry's text are replaced with spaces so every entry stays on one line.
    """
    rendered = [
        format_timestamp(snippet.start) + ' ' + snippet.text.replace('\n', ' ')
        for snippet in transcript
    ]
    return '\n'.join(rendered)
    

In [22]:
subtitles=make_subtitles(transcript)
print(subtitles[:100])

0:00 So hi everyone. Uh today we are going to
0:02 talk about our upcoming course. The
0:05 upcoming


In [23]:
# System prompt for the summarization step: asks for a summary followed by
# timestamped chapters. The <OUTPUT> tags only illustrate the layout; the
# prompt tells the model not to emit them.
instructions = """
Summarize the transcript and describe the main purpose of the video
and the main ideas. 

Also output chapters with time. Use usual sentence case, not Title Case for the chapter.

Output format: 

<OUTPUT>
Summary

timestamp chapter 
timestamp chapter
...
timestamp chapter
</OUTPUT>

Don't include <OUTPUT> in the output
"""

In [24]:
answer= llm(subtitles,instructions)

In [25]:
print(answer)

Summary

The video is an introduction to the "Machine Learning Zoom Camp," a course that is set to start on September 15th. The host details the course structure, its purpose, and answers frequently asked questions from potential participants. The aim is to equip learners with practical machine learning skills, emphasizing deployment and ML engineering rather than theoretical data science.

Main ideas include:
- Course overview and sign-up process
- Content updates based on the latest technologies (like switching from TensorFlow to PyTorch)
- Focus on practical skills, especially on deployment and engineering aspects of machine learning
- No job placement assistance but a strong track record of past participants finding jobs
- Prerequisites include basic programming knowledge, with an emphasis on Python and command-line use
- The course accommodates various backgrounds, emphasizing that anyone with determination can succeed.

timestamp chapter
0:00 introduction to the course
2:38 cours

### Structured output with Pydantic

In [26]:
from pydantic import BaseModel
# One chapter entry of the structured summary.
# (Documented with comments rather than a docstring so the model definition
# itself stays untouched.)
class chapter(BaseModel):
    timestamp : str  # chapter time as text, e.g. '0:00' in the run shown in this notebook
    title : str  # chapter title
# Structured result requested from the model: a summary plus a list of chapters.
# (Documented with comments rather than a docstring so the model definition
# itself stays untouched.)
class ytsummaryresponse(BaseModel):
    summary : str  # free-text summary of the video
    chapters : list[chapter]  # chapter entries, each with a timestamp and title

In [27]:
def llm_structured(instructions, user_prompt, output_type, model="gpt-4o-mini"):
    """Query the model and return its reply parsed into ``output_type``.

    Note that the argument order differs from ``llm``: here the system
    instructions come first and the user prompt second.

    Args:
        instructions: Content of the "system" message (always sent).
        user_prompt: Content of the "user" message.
        output_type: Type passed to the API as ``text_format``.
        model: Model name forwarded to the API call.

    Returns:
        The ``output_parsed`` attribute of the response (presumably an
        instance of ``output_type`` — confirm against the SDK docs).
    """
    system_message = {"role": "system", "content": instructions}
    user_message = {"role": "user", "content": user_prompt}

    parsed_response = openai_client.responses.parse(
        model=model,
        input=[system_message, user_message],
        text_format=output_type,
    )
    return parsed_response.output_parsed

In [28]:
summary = llm_structured(
    instructions=instructions,
    user_prompt=subtitles,
    output_type=ytsummaryresponse
)

In [29]:
print(summary)

summary='The video focuses on discussing the upcoming "Machine Learning Zoom Camp" course, slated to begin on September 15th. The instructor introduces the course structure, prerequisites, module updates, and addresses participants\' questions regarding job placement opportunities, depth of content, programming requirements, and deployment processes in machine learning. The course aims to equip learners with essential machine learning engineering skills through practical projects, and while it covers some foundational mathematics, the focus remains on engineering rather than theoretical concepts. Participants also receive guidance on utilizing resources like GitHub Code Spaces and ChatGPT during their learning process. The aim is to prepare students for entry-level machine learning engineer positions, with an emphasis on hands-on learning and project execution.' chapters=[chapter(timestamp='0:00', title='Introduction to machine learning zoom camp'), chapter(timestamp='1:48', title='Cou

In [30]:
print(summary.summary)

The video focuses on discussing the upcoming "Machine Learning Zoom Camp" course, slated to begin on September 15th. The instructor introduces the course structure, prerequisites, module updates, and addresses participants' questions regarding job placement opportunities, depth of content, programming requirements, and deployment processes in machine learning. The course aims to equip learners with essential machine learning engineering skills through practical projects, and while it covers some foundational mathematics, the focus remains on engineering rather than theoretical concepts. Participants also receive guidance on utilizing resources like GitHub Code Spaces and ChatGPT during their learning process. The aim is to prepare students for entry-level machine learning engineer positions, with an emphasis on hands-on learning and project execution.


In [31]:
for c in summary.chapters:
    print(c.timestamp,c.title)

0:00 Introduction to machine learning zoom camp
1:48 Course updates and module changes
2:58 Job placement opportunities
4:11 Depth of computer vision and neural networks
6:08 Prerequisites needed for the course
10:37 Using command line and software engineering skills
12:23 Target audience for the course
13:23 Recommended companion book
17:03 Hardware requirements for the course
18:54 Using AI tools for learning
20:57 Paths to explore after completing the course
24:03 Course structure and deadlines
29:01 Projects and submission requirements
30:35 Participant interactions and peer reviews
33:43 Final thoughts and resources


### Chunking YouTube transcripts

In [41]:
## sliding window chunking
def join_lines(transcript):
    """Concatenate the text of all entries into a single space-separated string.

    Each entry must expose a ``text`` attribute; newlines inside an entry's
    text are replaced with spaces before joining.
    """
    return ' '.join(snippet.text.replace('\n', ' ') for snippet in transcript)
    
def format_chunk(chunk):
    """Turn a non-empty window of transcript entries into a dict.

    Returns a dict with three keys:
        'start': formatted start time of the first entry,
        'end':   formatted start time of the last entry (the last entry's
                 duration is not added),
        'text':  the entries' text joined into one string.

    Raises IndexError when ``chunk`` is empty.
    """
    first_entry, last_entry = chunk[0], chunk[-1]
    return {
        'start': format_timestamp(first_entry.start),
        'end': format_timestamp(last_entry.start),
        'text': join_lines(chunk),
    }
    
    

In [60]:
#range(start, stop, step)
def sliding_window(seq,size,step):
    """Split a sequence into overlapping windows.

    Windows start at indices 0, step, 2*step, ... and hold up to ``size``
    items each. Iteration stops after the first window that reaches the end
    of the sequence, so the final window may be shorter than ``size``.

    Args:
        seq: Sliceable sequence supporting ``len``.
        size: Maximum number of items per window; must be positive.
        step: Offset between consecutive window starts; must be positive.

    Returns:
        A list of slices of ``seq`` (empty when ``seq`` is empty).

    Raises:
        ValueError: If ``size`` or ``step`` is not positive.
    """
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")

    total = len(seq)
    windows = []
    start = 0
    while start < total:
        windows.append(seq[start:start + size])
        # This window already covers the tail of the sequence; stop here so
        # no redundant, shorter windows are produced.
        if start + size >= total:
            break
        start += step
    return windows

In [45]:
print(sliding_window(list(range(10)),5,2))

[[0, 1, 2, 3, 4], [2, 3, 4, 5, 6], [4, 5, 6, 7, 8], [6, 7, 8, 9]]


In [69]:
# Cut the transcript into overlapping windows (60 entries each, advancing by
# 30 entries) and format every window into a start/end/text dict.
chunks = [format_chunk(window) for window in sliding_window(transcript, 60, 30)]
print(f"created {len(chunks)} chunks")

created 46 chunks


In [70]:
print(chunks[:10])

[{'start': '0:00', 'end': '2:38', 'text': "So hi everyone. Uh today we are going to talk about our upcoming course. The upcoming course is called machine learning zoom camp. And um this is already I put the link in the description. So if you're watching um this video in recording or you're watching it live, you go here in the description after under this video and then you see a link course. uh click on that link and this bring you will bring you to this website this GitHub page. This GitHub page is the main entry point to our course and um yeah I think it's more or less self-explanatory. If you want to sign up this is the button you click and the actual course starts in on September 15th. it means that it's uh slightly less than one one month before the course starts and the purpose of today's um session is to just answer your questions. So you have some questions and uh you can ask these questions using uh you can ask your questions using the pinned link. So there's a pinned link in 

In [71]:
from minsearch import Index
# Build a search index over the 'text' field of the formatted chunks.
index= Index(text_fields=['text'])
index.fit(chunks)

<minsearch.minsearch.Index at 0x7619c1146990>

In [73]:
result = index.search('can i find job after course?',num_results=2)

In [74]:
import json

def search(query):
    """Look up the chunks most relevant to ``query`` in the notebook-level index.

    Requests up to 15 results and returns whatever ``index.search`` returns.
    """
    hits = index.search(query=query, num_results=15)
    return hits

# System prompt for the question-answering (RAG) step.
# NOTE: this rebinds `instructions`, which earlier in the notebook holds the
# summarization prompt; re-running the summarization cells after this cell
# would send this prompt instead.
instructions = """
Answer the QUESTION based on the CONTEXT from the subtitles of a YouTube video.

Use only the facts from the CONTEXT when answering the QUESTION.

When answering the question, 
provide the citation in form of the video URL pointing at the timestamp where
this is discussed. If the question is discussed in multiple documents,
cite all of them.

Don't use markdown or any formatting in the output.
""".strip()


In [75]:
# User-prompt template for the RAG step. The {video_id}, {question} and
# {context} placeholders are filled in by build_prompt via str.format.
prompt_template = """
<VIDEO_ID>
{video_id}
</VIDEO_ID>

<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()

def build_prompt(question, search_results):
    """Fill the prompt template with the question and the search results.

    The search results are serialized to JSON and used as the context; the
    video ID is read from the notebook-level ``video_id`` variable. The
    formatted prompt is returned with surrounding whitespace stripped.
    """
    serialized_hits = json.dumps(search_results)
    filled = prompt_template.format(
        video_id=video_id,
        question=question,
        context=serialized_hits,
    )
    return filled.strip()

def rag(query):
    """Answer ``query`` from the transcript: search, build the prompt, ask the model.

    Uses the notebook-level ``instructions`` as the system prompt and returns
    the model's text reply.
    """
    hits = search(query)
    return llm(build_prompt(query, hits), instructions=instructions)

In [76]:
answer = rag('Can I find a job after the course?')
print(answer)

Yes, you can find a job after completing the course. While the course itself does not provide job placement services, many participants from past iterations have successfully found jobs after completing the course. It's important to put in effort during the course and work on projects to build your skills and portfolio, which will enhance your job readiness. Engaging in practical projects, including volunteering, is highly recommended to gain real experience and improve job prospects.

For more information, you can refer to the video at the following timestamps: 1:21 - 3:49 and 51:23 - 53:52.
