In [3]:
from dotenv import load_dotenv
import os
load_dotenv()

# Pull both service credentials out of the process environment in one pass;
# a name is bound to None when its variable is not set.
OPENAI_API_KEY, ACTIVELOOP_TOKEN = (
    os.environ.get(env_name)
    for env_name in ("OPENAI_API_KEY", "ACTIVELOOP_TOKEN")
)


In [37]:
import yt_dlp
import warnings
# Install a blanket "ignore" filter: every warning category is suppressed
# from this point on.
warnings.filterwarnings("ignore")

In [6]:
# Download a YouTube video as an MP4 file with yt-dlp and save it under the given filename.

def download_mp4_from_youtube(url, filename="llm_video.mp4"):
    """Download the video at ``url`` and save it under ``filename``.

    Args:
        url: Address of the video page to download.
        filename: Output path passed to yt-dlp as its ``outtmpl`` option.
            Defaults to ``"llm_video.mp4"``, the name that was previously
            hard-coded, so existing one-argument calls behave the same.

    Returns:
        None. The function is called for its side effect of writing the file;
        the metadata returned by yt-dlp is intentionally not returned so that
        calling this as the last line of a cell does not echo it.
    """
    ydl_opts = {
        # The key has to be the *string* 'format'. A bare ``format`` is
        # Python's builtin function object, a key yt-dlp never looks up, so
        # the selector below would be ignored without any error.
        'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
        'outtmpl': filename,
        'quiet': True,
    }

    # NOTE(review): the recorded run of this notebook read
    # "llm_video.mp4.webm"; with the selector honored the output is expected
    # at exactly `filename` - confirm any cell that opens the file by name.
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        ydl.extract_info(url, download=True)

In [None]:
# Address of the video to fetch; handed to the single-URL downloader defined above.
url = "https://www.youtube.com/watch?v=zjkBMFhNj_g"
download_mp4_from_youtube(url)

In [8]:
# Using whisper to transcribe the video

import whisper

# The download cell names its output "llm_video.mp4", while the recorded run
# of this notebook read "llm_video.mp4.webm". Accept whichever file is on disk
# instead of hard-coding one spelling, and fail with a clear message if
# neither exists.
video_candidates = ("llm_video.mp4", "llm_video.mp4.webm")
video_path = next(
    (path for path in video_candidates if os.path.exists(path)),
    None,
)
if video_path is None:
    raise FileNotFoundError(
        "No downloaded video found; expected one of: " + ", ".join(video_candidates)
    )

model = whisper.load_model("base")
# `result` is Whisper's output mapping; its "text" entry holds the transcript
# that is printed here and written to disk further down.
result = model.transcribe(video_path)
print(result["text"])

100%|███████████████████████████████████████| 139M/139M [00:13<00:00, 10.9MiB/s]
  checkpoint = torch.load(fp, map_location=device)


 Hi everyone. So recently I gave a 30 minute talk on large language models just kind of like an intro talk Um, unfortunately that talk was not recorded But a lot of people came to me after the talk and they told me that they really liked the talk So I was just I thought I was just re-recorded and basically put it up on YouTube So here we go the busy persons intro to large language models director scut Okay, so let's begin First of all, what is a large language model really? Well a large language model is just two files, right? Um, there will be two files in this hypothetical directory So for example, where can we do a specific example of the llama to 70b model? This is a large language model released by meta AI And this is basically the llama series of language models the second iteration of it and this is the 70 billion parameter model of Of this series. So there's multiple models belong to the llama to series 7 billion 13 billion 34 billion and 70 billion as the biggest one Now many 

In [9]:
# Persist the transcript text to transcript.txt so it can be reloaded from disk.
# NOTE(review): no encoding is passed, so the platform default applies - it
# must match whatever encoding is used when the file is read back.
with open("transcript.txt", "w") as file:
    file.write(result["text"])

## Summarization using LangChain

In [5]:
from langchain_openai import OpenAI
from langchain_core.prompts import PromptTemplate
from langchain.chains.summarize import load_summarize_chain

In [6]:
# LLM instance passed to the summarization chains built later in the notebook.
# temperature=0 - presumably chosen to keep summaries repeatable; confirm.
llm = OpenAI(model="gpt-3.5-turbo-instruct", 
             temperature=0)

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter used to cut the transcript into chunks (chunk_size=1000, no overlap
# between consecutive chunks).
# NOTE(review): the separator list starts with " "; if separators are tried in
# list order, the text is split on spaces first and "," / "\n" are only
# fallbacks - confirm this ordering is intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, 
    chunk_overlap=0,
    separators = [" ",",","\n"]
    )

In [6]:
from langchain.docstore.document import Document
# Reload the saved transcript from transcript.txt into `text`
# (read with the platform default encoding).
with open("transcript.txt") as file:
    text = file.read()

In [7]:
# Break the transcript into chunks with the splitter configured earlier
texts = text_splitter.split_text(text)

# Wrap every chunk in a Document (the chunk becomes its page_content)
docs = list(map(lambda chunk: Document(page_content=chunk), texts))

In [8]:
# Dump the full list of Document objects for inspection.
print(docs)

[Document(metadata={}, page_content="Hi everyone. So recently I gave a 30 minute talk on large language models just kind of like an intro talk Um, unfortunately that talk was not recorded But a lot of people came to me after the talk and they told me that they really liked the talk So I was just I thought I was just re-recorded and basically put it up on YouTube So here we go the busy persons intro to large language models director scut Okay, so let's begin First of all, what is a large language model really? Well a large language model is just two files, right? Um, there will be two files in this hypothetical directory So for example, where can we do a specific example of the llama to 70b model? This is a large language model released by meta AI And this is basically the llama series of language models the second iteration of it and this is the 70 billion parameter model of Of this series. So there's multiple models belong to the llama to series 7 billion 13 billion 34 billion and 70 

### Summarization using the "Stuff" method with `create_stuff_documents_chain`

In [10]:
# Import required modules from LangChain
from langchain.chains.combine_documents import create_stuff_documents_chain  # For creating a chain that combines documents using the "stuff" method
from langchain_core.prompts import ChatPromptTemplate  # For creating chat-style prompts

# Define the prompt template that will be used for summarization
# ChatPromptTemplate.from_messages creates a prompt from a list of (role, content) tuples
# Here we use a "system" role with instructions to write a concise summary
# {context} is a placeholder that will be filled with our document content
# Use real newlines ("\n\n") between the instruction and the documents. An
# escaped "\\n\\n" would put the two-character sequence backslash + n into
# the prompt verbatim instead of a blank line.
prompt = ChatPromptTemplate.from_messages(
    [("system", "Write a concise summary of the following:\n\n{context}")]
)

# Build the "stuff" chain from the LLM and the summary prompt:
# 1. It receives the documents (docs)
# 2. "Stuffs" them all into the prompt's {context}
# 3. Sends the filled prompt to the LLM (defined earlier as 'llm')
# 4. Returns the summary
chain = create_stuff_documents_chain(llm, prompt)

# Run the chain:
# 1. The documents are passed in a dictionary under the "context" key
# 2. The chain processes them according to the steps above
# 3. The summary is stored in 'result'
# NOTE: this rebinds `result`, which previously held the Whisper output
# mapping; from here on `result` is the summary text.
result = chain.invoke({"context": docs})
print(result)  # Print the generated summary

The speaker recently delivered a 30-minute introductory talk on large language models (LLMs), specifically focusing on the Llama 2 70B model released by Meta AI. LLMs consist of two main files: a parameters file containing the model's weights and a code file that runs the model. The Llama 2 70B model is notable for being one of the most powerful open-weight models available, allowing users to run it on their own hardware without internet connectivity.

Training LLMs involves a complex process of compressing vast amounts of internet text (around 10 terabytes) into the model's parameters, which requires significant computational resources, including thousands of GPUs and substantial financial investment. The model's primary function is to predict the next word in a sequence, which allows it to generate coherent text based on the knowledge it has learned during training.

The talk also covered the two main stages of training LLMs: pre-training, which involves learning from a large dataset

In [12]:
import textwrap

# Wrap each paragraph on its own. textwrap.fill treats its whole input as a
# single paragraph, so wrapping the summary in one call would replace the
# blank lines between paragraphs with spaces and run them together.
wrapped_text = "\n\n".join(
    textwrap.fill(paragraph, width=100)
    for paragraph in result.strip().split("\n\n")
)
print(wrapped_text)


The speaker recently delivered a 30-minute introductory talk on large language models (LLMs),
specifically focusing on the Llama 2 70B model released by Meta AI. LLMs consist of two main files:
a parameters file containing the model's weights and a code file that runs the model. The Llama 2
70B model is notable for being one of the most powerful open-weight models available, allowing users
to run it on their own hardware without internet connectivity.  Training LLMs involves a complex
process of compressing vast amounts of internet text (around 10 terabytes) into the model's
parameters, which requires significant computational resources, including thousands of GPUs and
substantial financial investment. The model's primary function is to predict the next word in a
sequence, which allows it to generate coherent text based on the knowledge it has learned during
training.  The talk also covered the two main stages of training LLMs: pre-training, which involves
learning from a large dataset

In [27]:
from langchain.prompts import PromptTemplate

# Prompt text for the bullet-point summary; {context} is the placeholder that
# receives the document content. (The closing cue is spelled "CONCISE".)
prompt_template = """Write a concise bullet point summary of the following:

{context}

CONCISE SUMMARY IN BULLET POINTS:"""

# Template object built from prompt_template; "context" is declared as its
# only input variable.
bullet_point_prompt = PromptTemplate(template=prompt_template, 
                        input_variables=["context"])

# Rebuild the "stuff" chain around the bullet-point prompt (rebinds `chain`).
chain = create_stuff_documents_chain(llm, bullet_point_prompt)

# Run it on the same documents; `result` is rebound to the new summary,
# replacing the previous one.
result = chain.invoke({"context": docs})
print(result)  # Print the generated summary


- **Talk Overview**: The speaker presented an introductory talk on large language models (LLMs) that was well-received but not recorded, prompting a re-recording for YouTube.

- **Definition of LLMs**: LLMs consist of two main files: a parameters file (weights) and a code file to run the model, exemplified by the Llama 2 70B model from Meta AI.

- **Model Characteristics**: 
  - Llama 2 70B is an open-source model with 70 billion parameters, making it one of the most powerful available.
  - The parameters file is approximately 140 GB, and the model can be run locally without internet connectivity.

- **Training Process**: 
  - Model training involves compressing vast amounts of internet text (around 10 terabytes) using a GPU cluster (6000 GPUs over 12 days, costing about $2 million).
  - The training process is computationally intensive, while inference (running the model) is relatively cheap.

- **Next Word Prediction**: LLMs predict the next word in a sequence, learning a significant

In [34]:
# Wrap with a very wide margin and keep the text's own whitespace
# (replace_whitespace=False), so the line breaks of the bullet list are not
# turned into spaces; break_long_words=False keeps long tokens intact.
wrapped_text = textwrap.fill(result, 
                             width=1000,
                             break_long_words=False,
                             replace_whitespace=False)
print(wrapped_text)

- **Talk Overview**: The speaker presented an introductory talk on large language models (LLMs) that was well-received but not recorded, prompting a re-recording for YouTube.

- **Definition of LLMs**: LLMs consist of two main files: a parameters file (weights) and a code file to run the model, exemplified by the Llama 2 70B model from Meta AI.

- **Model Characteristics**: 
  - Llama 2 70B is an open-source model with 70 billion parameters, making it one of the most powerful available.
  - The parameters file is approximately 140 GB, and the model can be run locally without internet connectivity.

- **Training Process**: 
  - Model training involves compressing vast amounts of internet text (around 10 terabytes) using a GPU cluster (6000 GPUs over 12 days, costing about $2 million).
  - The training process is computationally intensive, while inference (running the model) is relatively cheap.

- **Next Word Prediction**: LLMs predict the next word in a sequence, learning a significant

## Adding Transcripts to Deep Lake

When we have a large number of transcripts, we can store them in a Deep Lake database.

In [35]:
import yt_dlp


In [38]:
# Function to download multiple YouTube videos as MP4 files
def download_mp4_from_youtube(urls, job_id):
    """Download every video in ``urls`` and report what was saved.

    Note: this definition replaces the earlier single-URL function of the
    same name once the cell has run.

    Args:
        urls: Iterable of video page addresses.
        job_id: Value used as the prefix of each output file name
            (``./{job_id}_{index}.mp4``).

    Returns:
        A list with one ``(file path, title, uploader)`` tuple per URL, in
        input order. Title and uploader fall back to an empty string when
        the metadata has no such entry.
    """
    downloaded = []

    for index, video_url in enumerate(urls):
        # One output file per URL, numbered by its position in the list.
        target_path = f'./{job_id}_{index}.mp4'

        # Same selector for every video; only the output template changes.
        options = dict(
            format='bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]',
            outtmpl=target_path,
            quiet=True,
        )

        with yt_dlp.YoutubeDL(options) as downloader:
            # Fetch the metadata and download the file in one call.
            metadata = downloader.extract_info(video_url, download=True)
            record = (
                target_path,
                metadata.get('title', ""),
                metadata.get('uploader', ""),
            )

        downloaded.append(record)

    return downloaded

# List of YouTube video URLs to download
urls = ["https://www.youtube.com/watch?v=2IK3DFHRFfw&t=616s", "https://www.youtube.com/watch?v=fkIvmfqX-t0", "https://www.youtube.com/watch?v=KrRD7r7y7NY"]

# Download the videos with job_id 1; each returned entry is a
# (file path, title, uploader) tuple.
# NOTE(review): `vides_details` looks like a typo for `videos_details`; the
# name is left as is because later cells may refer to it.
vides_details = download_mp4_from_youtube(urls, 1)

                                                                         

Now, transcribe the videos using Whisper as we previously saw and save the results in a text file.