# RAG IMPLEMENTATION

In [1]:
import openai

# pdf loading
from langchain.document_loaders import PyPDFLoader

# YT audio loading
from langchain.document_loaders.generic import GenericLoader
from langchain.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader

# langchain-community module(s)
from langchain_community.document_loaders.parsers.audio import FasterWhisperParser

import os
from pprint import pprint
from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())

openai.api_key = os.environ['OPENAI_API_KEY']  

### Document Loading

In [2]:
# pdf loading

pdf_loader = PyPDFLoader("./pdfs/MachineLearning-Lecture01.pdf")
pages = pdf_loader.load()

In [3]:
page = pages[0]
pprint(page.page_content[:500])

('MachineLearning-Lecture01  \n'
 'Instructor (Andrew Ng): Okay. Good morning. Welcome to CS229, the machine \n'
 'learning class. So what I wanna do today is just spend a little time going '
 'over the logistics \n'
 "of the class, and then we'll start to talk a bit about machine learning.  \n"
 "By way of introduction, my name's Andrew Ng and I'll be instructor for this "
 'class. And so \n'
 "I personally work in machine learning, and I've worked on it for about 15 "
 'years now, and \n'
 'I actually think that machine learning is the ')


In [4]:
pprint(page.metadata)

{'author': '',
 'creationdate': '2008-07-11T11:25:23-07:00',
 'creator': 'PScript5.dll Version 5.2.2',
 'moddate': '2008-07-11T11:25:23-07:00',
 'page': 0,
 'page_label': '1',
 'producer': 'Acrobat Distiller 8.1.0 (Windows)',
 'source': './pdfs/MachineLearning-Lecture01.pdf',
 'title': '',
 'total_pages': 22}


In [5]:
# YT audio loading

url = 'https://www.youtube.com/watch?v=I2ZK3ngNvvI'    # Hardik Pandya
save_dir = './audios'
yt_loader = GenericLoader(
    YoutubeAudioLoader([url], save_dir),
    FasterWhisperParser(model_size='tiny', device='cpu')
)
docs = yt_loader.load()

[youtube] Extracting URL: https://www.youtube.com/watch?v=I2ZK3ngNvvI
[youtube] I2ZK3ngNvvI: Downloading webpage
[youtube] I2ZK3ngNvvI: Downloading tv client config
[youtube] I2ZK3ngNvvI: Downloading tv player API JSON
[youtube] I2ZK3ngNvvI: Downloading ios player API JSON
[youtube] I2ZK3ngNvvI: Downloading m3u8 information
[info] I2ZK3ngNvvI: Downloading 1 format(s): 140
[download] Destination: ./audios/Advice for machine learning beginners ｜ Andrej Karpathy and Lex Fridman.m4a
[download] 100% of    5.36MiB in 00:00:02 at 2.14MiB/s   
[FixupM4a] Correcting container of "./audios/Advice for machine learning beginners ｜ Andrej Karpathy and Lex Fridman.m4a"
[ExtractAudio] Not converting audio ./audios/Advice for machine learning beginners ｜ Andrej Karpathy and Lex Fridman.m4a; file is already in target format m4a




In [9]:
final_dialouge = ""
for doc in docs:    
    page_content = doc.page_content.strip()
    final_dialouge += page_content + ' '


pprint(final_dialouge[:1000])

("You're one of the greatest teachers of machine learning AI ever from CS231N "
 'to today. What advice would you give to beginners interested in getting into '
 'machine learning? Beginners are often focused on like what to do and I think '
 'the focus should be more like how much you do. So I am kind of like believer '
 'on the high level in this 10,000 hours kind of concept where you just kind '
 'of have to just pick the things where you can spend time and you care about '
 "and you're interested in. You literally have to put in 10,000 hours of work. "
 "It doesn't even like matter as much like where you put it and you'll iterate "
 "and you'll improve and you'll waste some time. I don't know if there's a "
 "better way. You need to put in 10,000 hours. But I think it's actually "
 "really nice because I feel like there's some sense of determinism about "
 'being an expert at a thing if you spend 10,000 hours. You can literally pick '
 'an arbitrary thing and I think if you spend 1

## LOCAL PDF RAG

yt@pixegami

In [1]:
from langchain_community.document_loaders import PyPDFDirectoryLoader    # langchain.document_loader depricated
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_ollama import OllamaEmbeddings    # ollama embeddings

import pprint

In [2]:
DATA_PATH = "./pdfs"


def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    return document_loader.load()

In [3]:
def split_documents(documents: list[Document]):    # type hinting
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800,
        chunk_overlap=80,
        length_function=len,
        is_separator_regex=False,
    )

    return text_splitter.split_documents(documents)

In [4]:
def get_embedding():
    embeddings = OllamaEmbeddings(model="nomic-embed-text")    # best embedding model avialable in ollama
    return embeddings

In [5]:

documents = load_documents()
chunks = split_documents(documents)

In [6]:
print(len(documents))
print(len(chunks))

42
153


In [7]:
pprint.pp(chunks[0].metadata)

{'producer': 'Acrobat Distiller 8.1.0 (Windows)',
 'creator': 'PScript5.dll Version 5.2.2',
 'creationdate': '2008-07-11T11:25:23-07:00',
 'author': '',
 'moddate': '2008-07-11T11:25:23-07:00',
 'title': '',
 'source': 'pdfs/MachineLearning-Lecture01.pdf',
 'total_pages': 22,
 'page': 0,
 'page_label': '1'}


In [8]:
def calculate_chunk_ids(chunks):
    curr_chunk_idx = 0
    prev_page_id = ""

    for chunk in chunks:
        curr_page_id = f"{chunk.metadata['source']}:{chunk.metadata['page']}"
        
        if curr_page_id == prev_page_id:
            curr_chunk_idx += 1
        else:
            curr_chunk_idx = 0    # reset

        prev_page_id = curr_page_id
        chunk_id = f"{curr_page_id}:{curr_chunk_idx}"

        chunk.metadata["id"] = chunk_id

    return chunks

In [9]:
chunks_with_ids = calculate_chunk_ids(chunks)

In [10]:
chunks_with_ids[50].metadata['id']

'pdfs/MachineLearning-Lecture01.pdf:11:2'

In [11]:
# updating database
from langchain_chroma import Chroma

CHROMA_PATH = "./chroma_db"
COLLECTION_NAME = "rag_tutorial"

def add_to_chroma(chunks: list[Document]):
    db = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=get_embedding(),
        persist_directory=CHROMA_PATH
    )
    
    curr_items = db.get(include=[])    # ids included by default
    curr_ids = set(curr_items["ids"])
    print(f"Current Documents: {len(curr_items)}")

    # adding docs not in db
    new_chunks = []
    new_chunk_ids = []
    for chunk in chunks:
        if not chunk.page_content.strip():
            print(f"[WARNING] Empty content for chunk id: {chunk.metadata['id']}")

        else:
            if chunk.metadata['id'] not in curr_ids:
                new_chunks.append(chunk)
                new_chunk_ids.append(chunk.metadata['id'])

    if new_chunks:
        db.add_documents(new_chunks, ids=new_chunk_ids)

    else:
        print("NO VALID CHUNKS TO AVAILABLE!")
    # db.persist() [automatically done in newer versions]

    print(f"Newly Added Documents: {len(new_chunks)}")

In [12]:
add_to_chroma(chunks_with_ids)

Current Documents: 7
Newly Added Documents: 153


In [13]:
from langchain.prompts import  ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
# reference: [https://python.langchain.com/docs/integrations/llms/ollama/]

def query_rag(query_txt: str):

    db = Chroma(
        collection_name=COLLECTION_NAME,
        embedding_function=get_embedding(),
        persist_directory=CHROMA_PATH
    )

    PROMPT_TEMPLATE = """
    Answer any question only on the following context:
    {context}

    ---
    Answer the question based on the above context: {question}
    """

    # retrieve most relevant chunks to our question
    results = db.similarity_search_with_score(query_txt, k=5)     

    context_txt = "\n\n---\n\n".join([doc.page_content for doc, _ in results])
    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_txt, question=query_txt)

    model = OllamaLLM(model="llama3.2:3b")
    response_txt = model.invoke(prompt)
    print(response_txt)

    sources = [doc.metadata.get("id", None) for doc, _ in results]
    print(sources)

In [23]:
query_rag("Which topic did Anderw NG Taught in the lecture?")

Based on the given context, Andrew Ng taught the following topics:

1. Logistics of the class
2. Introduction to machine learning (with some discussion about his personal experience and excitement about machine learning)
3. Convex optimization (to be covered in discussion sections)
4. Hidden Markov models (a type of machine learning algorithm for modeling time series, to be covered in discussion sections)

Note that these topics are mentioned as part of the lecture outline or discussed during the class, but not necessarily covered in detail in the main lectures.
['pdfs/MachineLearning-Lecture01.pdf:9:3', 'pdfs/MachineLearning-Lecture01.pdf:0:0', 'pdfs/MachineLearning-Lecture01.pdf:1:1', 'pdfs/MachineLearning-Lecture01.pdf:9:0', 'pdfs/MachineLearning-Lecture01.pdf:8:4']
