In [1]:
from dotenv import load_dotenv

# for PYTHONPATH
# set PYTHONPATH to the root directory of this project on your system
load_dotenv()

True

In [2]:
import textwrap

def print_wrapped(text: str, width: int = 128):
    wrapped_content = textwrap.fill(text=text, width=width)
    print(wrapped_content)

# Set up SQLite

It's useful to save some metadata about the video for future use.

In [3]:
from peewee import SqliteDatabase, Model, CharField, BooleanField, IntegerField, DateTimeField
sql_db = SqliteDatabase('../data/videos.sqlite3')

In [8]:
class Video(Model):
    yt_video_id = CharField(unique=True)
    title = CharField()
    language = CharField(null=True)
    channel = CharField(null=True)
    saved_on = DateTimeField(null=True)
    preprocessed = BooleanField(null=True)
    chunk_size = IntegerField(null=True)
    transcript_token_num = IntegerField(null=True)

    class Meta:
        database = sql_db

In [13]:
sql_db.connect(reuse_if_open=True)

False

In [14]:
sql_db.get_tables()

['video']

In [15]:
if not sql_db.table_exists('video'):
    sql_db.create_tables([Video])
else:
    print("Table already exists")

Table already exists


# Fetch transcript

In [16]:
from modules.youtube import fetch_youtube_transcript, extract_youtube_video_id
from modules.helpers import save_response_as_file
from modules.helpers import num_tokens_from_string

video_url = "https://youtu.be/qe6dSDq5GV0?si=wN13pRWZPqzeyrCz"
video_id = extract_youtube_video_id(video_url)
transcript = fetch_youtube_transcript(video_url)

In [31]:
from modules.youtube import get_video_metadata

meta = get_video_metadata(video_url)
video_title = meta['name']
print(video_title)
save_response_as_file("../transcripts", video_title, transcript)

How Fasting & Caloric Restriction Impact Health | Dr. Satchin Panda & Dr. Andrew Huberman


In [18]:
video = Video.create(
    yt_video_id = video_id,
    title = video_title,
    channel = meta['channel']
)

In [19]:
from datetime import datetime

Video.update({Video.saved_on: datetime.now()}).where(Video.yt_video_id == video_id).execute()

1

# Split yet unprocessed transcript into chunks

A relatively small chunk size is used, as the model tends to ignore the middle part of the transcript, if it's too long. Probably because of "Lost in the middle".

- https://arxiv.org/abs/2307.03172

In [20]:
CHUNK_SIZE_FOR_UNPROCESSED_TRANSCRIPT = 932

In [21]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE_FOR_UNPROCESSED_TRANSCRIPT,
    chunk_overlap=32,
    length_function=num_tokens_from_string,
    is_separator_regex=False,
)

# Split the transcript into chunks 
transcript_excerpts = text_splitter.create_documents([transcript])
print(f"Split unprocessed transcript into {len(transcript_excerpts)} chunks.")

Split unprocessed transcript into 4 chunks.


In [22]:
num_tokens_transcript = num_tokens_from_string(transcript, encoding_name="cl100k_base")
print_wrapped(f"The unprocessed transcript has {num_tokens_transcript} tokens.")

The unprocessed transcript has 2845 tokens.


# Initialize LLM and prompts

## Option 1: OpenAI (GPT-3.5-turbo)

In [23]:
from os import getenv
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(
    api_key=getenv("OPENAI_API_KEY"),
    temperature=0.3,
    model="gpt-3.5-turbo",
    max_tokens=2048
)

In [24]:
from langchain_core.prompts.chat import SystemMessage, HumanMessagePromptTemplate

user_prompt = HumanMessagePromptTemplate.from_template(
    """Here is part {number}, delimited by ---

    ---
    {transcript_excerpt}
    ---
    """
)

In [25]:
system_prompt = "You are giong to receive excerpts from an automatically generated video transcript. Your task is to convert every excerpt into structured text. Ensure that the content of the excerpts remains unchanged. Add appropriate punctuation, correct any grammatical errors, remove filler words and divide the text into logical paragraphs, separating them with a single new line. The final output should be in plain text and only include the modified transcript excerpt without any prelude."
print("Token number in system prompt: " + str(num_tokens_from_string(system_prompt)))

Token number in system prompt: 85


# Process transcript

In [26]:
batch_messages = []
for num, excerpt in enumerate(transcript_excerpts):
    batch_messages.append([
        SystemMessage(content=system_prompt),
        user_prompt.format(number=num, transcript_excerpt=excerpt.page_content)
    ])
response = llm.generate(batch_messages)

In [27]:
result =  "\n\n".join(gen[0].text for gen in response.generations)

In [28]:
num_tokens_response = num_tokens_from_string(result, encoding_name="cl100k_base")
print(f"The initial transcript has {num_tokens_transcript} tokens.")
print(f"The response has {num_tokens_response} tokens.")

The initial transcript has 2845 tokens.
The response has 2320 tokens.


In [30]:
save_response_as_file(dir_name="../transcripts_processed", filename=video_title, file_content=result)

In [32]:
Video.update(
    {
        Video.preprocessed: True,
        Video.transcript_token_num: num_tokens_transcript
    }).where(Video.yt_video_id == video_id).execute()

1

# Split the processed transcript

In [33]:
CHUNK_SIZE_FOR_PROCESSED_TRANSCRIPT = 1024

In [34]:
Video.update(
    {
        Video.chunk_size: CHUNK_SIZE_FOR_PROCESSED_TRANSCRIPT
    }).where(Video.yt_video_id == video_id).execute()

1

In [35]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=0)
chunks = splitter.create_documents([result])
for chunk in chunks:
    print_wrapped(chunk.page_content)
    print("----------------")

So there's a famous experiment that was published last year by Joe Takahashi's lab, and it came out in Science, and that relates
to caloric restriction. We kind of started with this idea, discussing that the rat experiments were done with caloric
restriction. Researchers gave reduced calorie conjunction by 20% or 30%, and the rats, and subsequently mice, all lived longer.
What is interesting is, in all those experiments, the researchers gave a bolus of food at one time, whereas the ad libitum fed
mice or rats had access to food all the time, eating constantly. The rats given 20% less food consumed it all within two to four
hours, following an OMAD diet, one meal a day, concept. They either finished eating in three to four hours or had a four-hour
eating period followed by 20 hours of fasting.
----------------
The question arose whether the benefit of caloric restriction is due to reduced calories or time-restricted feeding. There is a
timing component to it, as the animals consumed all

# Create a vector DB

## Option 1: OpenAI embeddings

In [36]:
from langchain_openai import OpenAIEmbeddings

# https://platform.openai.com/docs/models/embeddings
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

In [37]:
import chromadb
from langchain_chroma import Chroma
from chromadb.config import Settings

chroma_settings = Settings(allow_reset=True)
chroma_client = chromadb.HttpClient(settings=chroma_settings)

db = Chroma(
    client=chroma_client, collection_name=f"{video_id}_{CHUNK_SIZE_FOR_PROCESSED_TRANSCRIPT}", embedding_function=embeddings
)

In [38]:
import uuid

collection = chroma_client.get_or_create_collection(name=f"{video_id}_{CHUNK_SIZE_FOR_PROCESSED_TRANSCRIPT}")
if collection.count() <= 0:
    for d in chunks:
        response = embeddings.embed_query(d.page_content)
        collection.add(
            ids=[str(uuid.uuid1())],
            embeddings=[response],
            documents=[d.page_content],
            #metadatas=[d.metadata]
        )

# Test generation - answer a question

In [39]:
question = "What did the caloric restriction experiment in mice and rats show?"

In [40]:
retriever = db.as_retriever(search_kwargs={"k": 3})
relevant_docs = retriever.invoke(input=question)

In [41]:
for doc in relevant_docs:
    print_wrapped(doc.page_content)
    print("----------------------")

So there's a famous experiment that was published last year by Joe Takahashi's lab, and it came out in Science, and that relates
to caloric restriction. We kind of started with this idea, discussing that the rat experiments were done with caloric
restriction. Researchers gave reduced calorie conjunction by 20% or 30%, and the rats, and subsequently mice, all lived longer.
What is interesting is, in all those experiments, the researchers gave a bolus of food at one time, whereas the ad libitum fed
mice or rats had access to food all the time, eating constantly. The rats given 20% less food consumed it all within two to four
hours, following an OMAD diet, one meal a day, concept. They either finished eating in three to four hours or had a four-hour
eating period followed by 20 hours of fasting.
----------------------
The question arose whether the benefit of caloric restriction is due to reduced calories or time-restricted feeding. There is a
timing component to it, as the animals consum

In [42]:
from langchain_core.prompts import PromptTemplate

rag_prompt = PromptTemplate.from_template("""Context: {context}
                                          
Answer the question based on the context provided above. Keep your answer ground in the facts of the context.
If the context does not contain the facts to answer the question, apologize and say that you don't know the answer.
                                          
Here is the question: {question}

""")

In [43]:
def format_docs_for_context(docs):
    return "\n\n---\n\n".join(doc.page_content for doc in docs)

In [44]:
from langchain_core.output_parsers import StrOutputParser

rag_chain = rag_prompt | llm | StrOutputParser()

answer = rag_chain.invoke({"question": question, "context": format_docs_for_context(relevant_docs)})

In [45]:
print_wrapped(answer)

The caloric restriction experiment in mice and rats showed that reducing calorie intake by 20% or 30% led to longer lifespans
compared to ad libitum fed mice or rats. The animals given reduced calories consumed their food within a few hours, followed by
a long fasting period, resembling an OMAD diet. Additionally, a study by Joe Takahashi's lab demonstrated that caloric
restriction extended lifespan by 10% when compared to mice who snacked throughout the day and night without extended fasting
periods.
