# Fetch transcript

In [53]:
from modules.youtube import fetch_youtube_transcript
from modules.helpers import save_response_as_file
from modules.helpers import num_tokens_from_string

# Source video for this run; `video_url` is reused below for the metadata lookup.
video_url = "https://youtu.be/5HINgMMTzPE?si=geHQHMGVm9Atzg32"
# Fetch the transcript through the project helper.
# Presumably a single string: it is later tokenized and split as text — confirm against the helper.
transcript = fetch_youtube_transcript(video_url)

In [54]:
from modules.youtube import get_video_metadata

# Look up the video's metadata and use its 'name' entry as the file name for saved output.
meta = get_video_metadata(video_url)
video_title = meta['name']
# Save the raw transcript before any processing.
# Positional arguments presumably map to (dir_name, filename, file_content), matching the
# keyword call at the end of the notebook — confirm against the helper's signature.
# NOTE(review): the directory name "unprocessed_trancsript" is misspelled ("transcript").
# It is left unchanged here because renaming it changes where files are written.
save_response_as_file("unprocessed_trancsript", video_title, transcript)

# Split transcript into chunks

In [55]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter that breaks the transcript into excerpts small enough to send to the model one at a time.
text_splitter = RecursiveCharacterTextSplitter(
    # Sizes are measured by `length_function`, i.e. by num_tokens_from_string (tokens, per the
    # messages printed below), not in characters. So this is a 4000-token chunk limit
    # with a 16-token overlap between consecutive chunks.
    chunk_size=4000,
    chunk_overlap=16,
    length_function=num_tokens_from_string,
    is_separator_regex=False,
)

# Split the transcript into chunks; each result is a document whose text is in `.page_content`.
# In the recorded run the transcript has 1371 tokens, so it should fit in a single chunk.
transcript_excerpts = text_splitter.create_documents([transcript])

In [56]:
# Measure the full transcript with the cl100k_base encoding and report the count,
# so it can be compared with the token count of the processed result later on.
num_tokens_transcript = num_tokens_from_string(transcript, encoding_name="cl100k_base")
print(f"The transcript has {num_tokens_transcript} tokens.")

The transcript has 1371 tokens.


# Initialize LLM and prompts

In [57]:
from os import getenv
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment so the API key is never hardcoded.
load_dotenv()

# Chat model used to clean up the transcript excerpts.
# NOTE(review): the system prompt asks that the content of the excerpts remain unchanged;
# temperature=0.7 allows noticeable variation, so a lower value may fit better — confirm.
llm = ChatOpenAI(
    api_key=getenv("OPENAI_API_KEY"),  # None if the variable is not set
    temperature=0.7,
    model="gpt-3.5-turbo",
    max_tokens=4096  # upper bound on the length of each generated completion
)

In [58]:
from langchain_core.prompts.chat import SystemMessage, HumanMessagePromptTemplate

# Per-excerpt user message. `{number}` and `{transcript_excerpt}` are filled in by
# `user_prompt.format(...)` in the processing cell.
# The lines inside the triple-quoted string are indented, so those leading spaces
# are part of the prompt text that is sent to the model.
user_prompt = HumanMessagePromptTemplate.from_template(
    """Here is part {number}, delimited by ---

    ---
    {transcript_excerpt}
    ---
    """
)

In [59]:
# Instructions sent as the system message with every excerpt. The literal is split
# across lines for readability only; adjacent string literals are concatenated, so
# the resulting prompt is one continuous paragraph.
# Fixes the typo "giong" -> "going" in the text that is sent to the model.
system_prompt = (
    "You are going to receive excerpts from an automatically generated video transcript. "
    "Your task is to convert every excerpt into structured text. "
    "Ensure that the content of the excerpts remains unchanged. "
    "Add appropriate punctuation, correct any grammatical errors, remove filler words "
    "and divide the text into logical paragraphs, separating them with a single new line. "
    "The final output should be in plain text and only include the modified transcript "
    "excerpt without any prelude."
)
# Report the prompt's size, since it is sent along with every excerpt.
print(f"Token number in system prompt: {num_tokens_from_string(system_prompt)}")

# Process transcript

In [63]:
# Build one [system, user] message pair per excerpt. The excerpt's zero-based index
# is passed as the part number, and its text fills the user prompt template.
batch_messages = [
    [
        SystemMessage(content=system_prompt),
        user_prompt.format(number=part_index, transcript_excerpt=document.page_content),
    ]
    for part_index, document in enumerate(transcript_excerpts)
]
# Send every message pair to the model in a single batched call.
response = llm.generate(batch_messages)

Token number in system prompt: 85


In [60]:
# `response.generations` holds one list of candidates per prompt; keep the text of
# the first candidate for each excerpt, in order.
first_candidate_texts = [candidates[0].text for candidates in response.generations]
# Stitch the processed excerpts back together, separated by a blank line.
result = "\n\n".join(first_candidate_texts)
print(result)

The question I often get is how long should I try to focus? Well, the research literature points to the key importance of so-called ultradian cycles. You've all probably heard of circadian cycles or circadian biology, circa, the day. Circadian is about a 24-hour cycle. Our brain and body operate within each and every day with 90-minute ultradian cycles. So my suggestion would be anytime you're going to sit down and try to focus, you're going to try to do a focused bout of physical exercise or skill learning or musical learning, or maybe you're even just having a conversation. Maybe you're a therapist or you're attending therapy or a class. How long should it be? And the ideal duration is about 90 minutes. Not exactly 90 minutes, but we can reliably say 90 minutes or less.

Okay, it doesn't have to be the full 90 minutes, but trying to push yourself to be able to drop into two hours of focus or three hours of focus, while possible, is not really in line with what we know about the under

In [61]:
# Measure the processed text with the same encoding used for the raw transcript.
# A much smaller count than the input may indicate dropped or truncated content.
num_tokens_response = num_tokens_from_string(result, encoding_name="cl100k_base")
print(f"The response has {num_tokens_response} tokens.")

The response has 1210 tokens.


In [65]:
# Save the processed transcript under "transcripts_processed", named after the video title.
save_response_as_file(dir_name="transcripts_processed", filename=video_title, file_content=result)