In [9]:
!pip3 install certifi
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv
!pip3 install --upgrade --quiet "unstructured[all-docs]" onnx==1.16.0  # for proccess pdf with images


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip

In [10]:
# Import langchain modules
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.vectorstores import Chroma
from langchain_core.runnables import RunnablePassthrough
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import Any
from unstructured.partition.pdf import partition_pdf

# other modules and packages
import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')



[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/sgnclexus/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/sgnclexus/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [3]:
load_dotenv()

True

In [4]:
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Define our LLM

In [5]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
llm.invoke("Tell me why animals like dogs its eyes seem to bright in the dark?")

AIMessage(content="The bright appearance of dogs' eyes in the dark is primarily due to a structure called the tapetum lucidum. This is a layer of tissue located behind the retina that reflects light that passes through the retina back into the eye, giving the photoreceptors a second chance to capture the light. This adaptation enhances their night vision, allowing them to see better in low-light conditions.\n\nWhen light hits a dog’s eyes at night, especially from sources like streetlights or car headlights, the tapetum lucidum reflects it, making their eyes appear to glow or shine. This phenomenon is not exclusive to dogs; many animals that are nocturnal or crepuscular (active during twilight) have a tapetum lucidum, which aids them in hunting and navigating in the dark. The intensity and color of the glow can vary among different species and even among individuals of the same species.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 179, 

## Process PDF Document 

### Load PDF Document

In [None]:
loader = PyPDFLoader("data/grok_system_design_interview.pdf")
pages = loader.load()
pages 

In [None]:
# Get elements
raw_pdf_elements = partition_pdf(
    filename="data/grok_system_design_interview.pdf",
    
    # Using pdf format to find embedded image blocks
    extract_images_in_pdf=True,
    
    # Use layout model (YOLOX) to get bounding boxes (for tables) and find titles
    # Titles are any sub-section of the document
    infer_table_structure=True,
    ocr_languages="eng",
    # Post processing to aggregate text once we have the title
    chunking_strategy="by_title",
    # Chunking params to aggregate text blocks
    # Attempt to create a new chunk 3800 chars
    # Attempt to keep chunks > 2000 chars
    # Hard max on chunks
    max_characters=4000,
    new_after_n_chars=3800,
    combine_text_under_n_chars=2000,
    image_output_dir_path="data/",
)

TypeError: get_model() got an unexpected keyword argument 'ocr_languages'

### Split document

In [None]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                               chunk_overlap=500,
                                               length_function=len,
                                               separators=["\n\n","\n"," "])
chunks = text_splitter.split_documents(pages)
chunks

### Create embeddings

In [9]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_type=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
test_vector = embedding_function.embed_query("dog")

In [None]:
from langchain.evaluation import load_evaluator

evaluator = load_evaluator(evaluator="embedding_distance",
                           embeddings=embedding_function)

evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")

In [None]:
evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

### Create a vector database

In [26]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):

    # create a list of unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

    # ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    unique_chunks = []
    for chunk, id in zip(chunks, ids):
        if id not in unique_ids:
            unique_ids.add(id)
            unique_chunks.append(chunk)


    # Create a new Chroma database from the documents
    vectorstore = Chroma.from_documents(documents=chunks,
                                        ids=list(unique_ids),
                                        embedding=embedding_function,
                                        persist_directory="vectorstore")

    vectorstore.persist()

    return vectorstore

In [27]:
 # create vectorstore
vectorstore = create_vectorstore(chunks=chunks,
                                 embedding_function=embedding_function,
                                 vectorstore_path="vectorstore_chroma"
                                 )

 ## 2. Query for relevant data

In [28]:
# load vectorstore
vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

In [None]:
# create retriever and get relevant chunks
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("What is the title of the document?")
relevant_chunks

In [30]:
PROMPT_TEMPLATE = """
    You are an assistant for question-answering task. 
    Use the following pieces of retrieved context to answer
    the question. If you don't know the answer, say that you
    don't know. DON'T MAKE UP ANYTHING.

    {context}

    ---

    Answer the question based on the above context: {question}
"""



In [None]:
# concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question="What is the title of the paper?")
print(prompt)

## 3. Generate responses

In [None]:
llm.invoke(prompt)