# RAG IMPLEMENTATION

In [1]:
import openai

# pdf loading
from langchain.document_loaders import PyPDFLoader

# YT audio loading
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

# langchain-community module(s)
from langchain_community.document_loaders.parsers.audio import FasterWhisperParser

import os
from pprint import pprint
from dotenv import load_dotenv, find_dotenv

# Locate a .env file and load the variables it defines; the return value is
# assigned to `_` because it is not used afterwards.
_ = load_dotenv(find_dotenv())

# Take the OpenAI key from the environment instead of hardcoding it here.
# Indexing os.environ raises KeyError if OPENAI_API_KEY is not set.
openai.api_key = os.environ['OPENAI_API_KEY']  

### Document Loading

In [2]:
# pdf loading
# Read the lecture transcript PDF from the local ./pdfs directory.
# `pages` is the sequence returned by the loader; the metadata printed later
# in the notebook ('page': 0, 'total_pages': 22) suggests one entry per PDF
# page -- TODO confirm against the loader documentation.

pdf_loader = PyPDFLoader("./pdfs/MachineLearning-Lecture01.pdf")
pages = pdf_loader.load()

In [3]:
# Keep the first loaded entry as `page` and pretty-print the first 500
# characters of its text as a quick sanity check of the extraction.
page = pages[0]
pprint(page.page_content[:500])

('MachineLearning-Lecture01  \n'
 'Instructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine \n'
 'learning class. So what I wanna do today is just spend a little time going '
 'over the logistics \n'
 "of the class, and then we'll start to talk a bit about machine learning.  \n"
 "By way of introduction, my name's Andrew Ng and I'll be instructor for this "
 'class. And so \n'
 "I personally work in machine learning, and I've worked on it for about 15 "
 'years now, and \n'
 'I actually think that machine learning is the ')


In [4]:
# Pretty-print the metadata attached to `page` (e.g. source path and page index).
pprint(page.metadata)

{'author': '',
 'creationdate': '2008-07-11T11:25:23-07:00',
 'creator': 'PScript5.dll Version 5.2.2',
 'moddate': '2008-07-11T11:25:23-07:00',
 'page': 0,
 'page_label': '1',
 'producer': 'Acrobat Distiller 8.1.0 (Windows)',
 'source': './pdfs/MachineLearning-Lecture01.pdf',
 'title': '',
 'total_pages': 22}


In [5]:
# YT audio loading
# Fetch the audio of one YouTube video into `save_dir` and turn it into
# documents via FasterWhisperParser (model_size='tiny', device='cpu').
# Calling .load() performs the download, as the log printed under this cell shows.

url = 'https://www.youtube.com/watch?v=I2ZK3ngNvvI'    # "Advice for machine learning beginners" - Andrej Karpathy and Lex Fridman
save_dir = './audios'
yt_loader = GenericLoader(
    YoutubeAudioLoader([url], save_dir),
    FasterWhisperParser(model_size='tiny', device='cpu')
)
docs = yt_loader.load()

[youtube] Extracting URL: https://www.youtube.com/watch?v=I2ZK3ngNvvI
[youtube] I2ZK3ngNvvI: Downloading webpage
[youtube] I2ZK3ngNvvI: Downloading tv client config
[youtube] I2ZK3ngNvvI: Downloading tv player API JSON
[youtube] I2ZK3ngNvvI: Downloading ios player API JSON
[youtube] I2ZK3ngNvvI: Downloading m3u8 information
[info] I2ZK3ngNvvI: Downloading 1 format(s): 140
[download] Destination: ./audios/Advice for machine learning beginners ｜ Andrej Karpathy and Lex Fridman.m4a
[download] 100% of    5.36MiB in 00:00:02 at 2.14MiB/s   
[FixupM4a] Correcting container of "./audios/Advice for machine learning beginners ｜ Andrej Karpathy and Lex Fridman.m4a"
[ExtractAudio] Not converting audio ./audios/Advice for machine learning beginners ｜ Andrej Karpathy and Lex Fridman.m4a; file is already in target format m4a




In [9]:
# Stitch every transcribed chunk into one string. Each chunk is stripped of
# surrounding whitespace and followed by a single space, so a non-empty
# result also ends with a space. The variable name is kept exactly as spelled.
transcript_pieces = []
for doc in docs:
    transcript_pieces.append(doc.page_content.strip() + ' ')
final_dialouge = ''.join(transcript_pieces)

# Preview the first 1000 characters of the combined transcript.
pprint(final_dialouge[:1000])

("You're one of the greatest teachers of machine learning AI ever from CS231N "
 'to today. What advice would you give to beginners interested in getting into '
 'machine learning? Beginners are often focused on like what to do and I think '
 'the focus should be more like how much you do. So I am kind of like believer '
 'on the high level in this 10,000 hours kind of concept where you just kind '
 'of have to just pick the things where you can spend time and you care about '
 "and you're interested in. You literally have to put in 10,000 hours of work. "
 "It doesn't even like matter as much like where you put it and you'll iterate "
 "and you'll improve and you'll waste some time. I don't know if there's a "
 "better way. You need to put in 10,000 hours. But I think it's actually "
 "really nice because I feel like there's some sense of determinism about "
 'being an expert at a thing if you spend 10,000 hours. You can literally pick '
 'an arbitrary thing and I think if you spend 1