# 1. Installing the required libraries
TODO: Add a package manager like poetry in the final version.

In [1]:
# ! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

# 2. Resolving the OpenAI API access requirements

In [2]:
import os
from config import OPENAI_API_KEY

In [3]:
# Expose the key through the environment: the OpenAI-backed objects created later
# in this notebook (OpenAIEmbeddings, ChatOpenAI) are built without an explicit key argument.
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# 3. Resolving the Langchain/Langsmith access requirements

In [4]:
from config import LANGSMITH_API_KEY

In [5]:
# LangChain/LangSmith settings, applied in one go: the tracing flag, the hosted
# endpoint, and the API key imported from config.
os.environ.update(
    {
        'LANGCHAIN_TRACING_V2': 'true',
        'LANGCHAIN_ENDPOINT': 'https://api.smith.langchain.com',
        'LANGCHAIN_API_KEY': LANGSMITH_API_KEY,
    }
)

# 4. Building a Rudimentary RAG

In [6]:
import bs4
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

### 4.1. Loading a small chunk of data

In [7]:
# Source material: the syllabus and resources pages of my Winter Quarter class
# on Scalable Data Systems. The SoupStrainer limits parsing to markup with the
# "main-content" class.
loader = WebBaseLoader(
    web_paths=(
        "https://hao-ai-lab.github.io/dsc204a-w24/syllabus/",
        "https://hao-ai-lab.github.io/dsc204a-w24/resources/",
    ),
    bs_kwargs={"parse_only": bs4.SoupStrainer(class_="main-content")},
)
info = loader.load()

In [8]:
# Peek at the first 200 characters scraped from the "Syllabus" page (first URL loaded).
info[0].page_content[:200]

'\n\n Syllabus\n        \n        \n      \n\n Table of Contents\n        \n        \n      \n\nSyllabus \nLogistics\nCourse Content and Format \nLectures\n3 Programming Assignments (PAs)\nExams\nScribe notes\nReading Su'

In [9]:
# Peek at the first 200 characters scraped from the "Resources" page (second URL loaded).
info[1].page_content[:200]

'\n\n Resources\n        \n        \n      \n\n Table of Contents\n        \n        \n      \n\nBook\nAdditional Books\nPast Offerings\nMaterials \nStudent Materials Folder\nResources on Ray\nRelated Documentation and '

### 4.2. Splitting data into chunks

In [10]:
# Build the splitter first, then apply it to the loaded pages.
# chunk_overlap=200 lets neighbouring chunks (chunk_size=999) share some text.
chunker = RecursiveCharacterTextSplitter(chunk_size=999, chunk_overlap=200)
splits = chunker.split_documents(info)

In [11]:
# Report the number of chunks, then show the first and the last chunk,
# each preceded by the same 50-dash divider.
divider = f"\n{'-'*50} \n"

print(f"No of splits made: {len(splits)}")
print(divider)
print(f"First split: \n {splits[0]}")
print(divider)
print(f"Last split: \n {splits[-1]}")

No of splits made: 18

-------------------------------------------------- 

First split: 
 page_content='Syllabus\n        \n        \n      \n\n Table of Contents\n        \n        \n      \n\nSyllabus \nLogistics\nCourse Content and Format \nLectures\n3 Programming Assignments (PAs)\nExams\nScribe notes\nReading Summary\nParticipation\n\n\nPre-requisites\nGrading \nComponents\nCutoffs\n\n\nClassroom Rules\n\n\n\nThe course is organized into four parts, covering the following topics.\n\nFoundations of Data Systems: Data models, big data storage and retrieval, and how to encode information when you store data.\nScaling Distributed Systems: Cluster, cloud, edge, network, replication, partition, consistency, ACID.\nData Processing and Programming model: Batch processing, stream processing, MapReduce, Hadoop, Spark, Ray.\nMachine Learning Systems: GPUs, TensorFlow, PyTorch, data and model parallelism, LLM training and serving.' metadata={'source': 'https://hao-ai-lab.github.io/dsc204a-w2

### 4.3. Generating Embeddings: 
Since I intend to use the GPT-3.5 Turbo model for this POC, I will be using the OpenAIEmbeddings generator function.

I will get back a vectorstore which can be used as a retriever. 

In [12]:
# Embed every chunk with OpenAIEmbeddings (default settings) and index the results in Chroma.
# No persist directory or collection name is passed here.
# NOTE(review): re-running this cell in the same kernel may add the same chunks again to the
# same default collection (duplicate hits on retrieval) — confirm, and reset the store if so.
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())

In [13]:
# Wrap the vector store as a retriever so it can be piped into a chain.
# No search type or search kwargs are passed, so the library defaults apply.
retriever = vectorstore.as_retriever()

### 4.4. RAG Prompt:
Using a community template for the RAG prompt.<br> 
Link: https://smith.langchain.com/hub/rlm/rag-prompt?organizationId=d1f8dd50-f543-5244-a666-0e199c97fd76

In [14]:
# Pull the community "rlm/rag-prompt" template from the LangChain Hub.
# NOTE(review): no specific revision is requested, so the template text could presumably
# change between runs — consider pinning a version for reproducibility.
prompt = hub.pull("rlm/rag-prompt")

### 4.5. Setting the LLM:
I have decided to use GPT 3.5 turbo for this POC.

In [15]:
# temperature=0 to curb creative output generation.
# TODO: Explore other GPT base versions like davinci & babbage.
# NOTE(review): those are presumably completion-style models and may need a different
# wrapper than ChatOpenAI — confirm before swapping the model name.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

### 4.6. Setting up Post-processing:

In [16]:
def format_docs(info):
    """Merge retrieved documents into one context string.

    Args:
        info: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The ``page_content`` values, in order, separated by a blank line.
        An empty iterable yields an empty string.
    """
    page_texts = []
    for document in info:
        page_texts.append(document.page_content)
    return "\n\n".join(page_texts)

### 4.7. Building an RAG chain

In [17]:
# The incoming question is fanned out two ways: one branch retrieves documents and
# flattens them with format_docs ("context"), the other forwards the question
# unchanged ("question"). The resulting mapping feeds the prompt, then the LLM,
# and the reply is parsed down to a plain string.
rag_chain = (
    RunnableParallel(
        context=retriever | format_docs,
        question=RunnablePassthrough(),
    )
    | prompt
    | llm
    | StrOutputParser()
)

### 4.8. Tests v.0

In [18]:
# Test v.0.0:
# Query a topic that is present in the loaded context (`info`); it is a section
# heading on the syllabus page.
rag_chain.invoke("What is the course content and format?")

'The course consists of 50-minute lectures held three times a week in person, with attendance encouraged but not mandatory. There are scribe notes and reading summaries required for each lecture, along with 3 programming assignments. The course covers topics such as data systems foundations, scaling distributed systems, data processing, and machine learning systems.'

<b>Assessment:</b><br>
Checking this link: https://hao-ai-lab.github.io/dsc204a-w24/syllabus/#course-content-and-format
<br>As we can see, the main information is mostly well summarized, but as a student who has taken the course, I know that the statement:<br><br>
<i>"scribe notes and reading summaries required for each lecture"</i><br><br>
is not accurate. 
<ul><li>We had readings assigned for each lecture, but the reading summaries were assigned once a week.
<li>Further, the scribe notes for each lecture were prepared by rotating groups of 2-3 students.</ul><br>
<b>Possible Adjustment:</b><br> 
<ul>
    <li> Return the source links/documents. Additionally, provide a disclaimer.
</ul>

In [19]:
# Test v.0.1
# Adding the Link to the response.

# Answer-only chain: expects a mapping that already holds the retrieved documents
# under "context" and flattens them to text before prompting the LLM.
altered_rag_chain = (
    RunnablePassthrough.assign(context=(lambda x: format_docs(x["context"])))
    | prompt
    | llm
    | StrOutputParser()
)

# Retrieve first and keep the raw documents in the output, then attach the
# generated text under "answer".
rag_chain_with_source = RunnableParallel(
    {"context": retriever, "question": RunnablePassthrough()}
).assign(answer=altered_rag_chain)

result = rag_chain_with_source.invoke("What is the course content and format?")

# De-duplicate the source URLs while keeping retrieval order. A set would list
# them in an arbitrary order that can differ between kernel sessions, making the
# printed output non-reproducible. Keys are indexed directly so a missing
# "context"/"answer" fails with a clear KeyError instead of a None-related error.
relevant_docs = list(
    dict.fromkeys(doc.metadata.get('source') for doc in result['context'])
)

print(f"Answer:\n\n{result['answer']}\n\nDISCLAIMER: LLM Generated Summary. Summary is generated from the sources seen below.\n{relevant_docs}")

Answer:

The course consists of 50-minute lectures held three times a week in person, with attendance encouraged but not mandatory. There are scribe notes and reading summaries required for each lecture, along with 3 programming assignments. The course covers topics such as data systems, distributed systems, data processing, and machine learning systems.

DISCLAIMER: LLM Generated Summary. Summary is generated from the sources seen below.
['https://hao-ai-lab.github.io/dsc204a-w24/syllabus/']


<b>Assessment:</b><br>
<ul>
    <li> Generated Data might still be misleading. Root cause not addressed.
    <li> Langchain Runtime Comparison shows that this type of query is slower. (v.0.0 = 1.88s and v.0.1 = ~2.25s)
    <li> Disclaimer might be problematic.
    <li> If there are many sources returned then processing step might be slow. Something similar to lazy loading required? 
</ul>
<b>Possible Adjustment:</b><br> 
<ul>
    <li> Researched and found a Vector Similarity Check that can be performed. 
</ul>

### 4.9. Vector Similarity:

In [26]:
%timeit vectorstore.similarity_search_with_score("What is the course content and format?")

similar_docs = vectorstore.similarity_search_with_score("What is the course content and format?")
for doc in similar_docs:
    print(doc)
    print("\n")

150 ms ± 47.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
(Document(page_content='Course Content and Format\n        \n        \n      \n\n Lectures\n        \n        \n      \nThe class meets 3 times a week for 50-minute lectures in person.\n\nAttending the lectures is not mandatory but highly encouraged. All lectures will be automatically podcast here afterward.\nThere will be scribe notes required for each lecture. Students should form groups of 2 - 4 people and sign up one slot. See details below.\nThere will be reading summary required per week. Everyone needs to submit their reading summary. See details below.\nWe will use Piazza for asynchronous discussions and questions.\n\n\n 3 Programming Assignments (PAs)\n        \n        \n      \n\nSee the assignments page for updates on the PA schedule and details.\nThere are no late days for the PAs. Plan your work accordingly.\n\n\n Exams', metadata={'source': 'https://hao-ai-lab.github.io/dsc204a-w24/syllabus/'}), 0.3072

<b>Assessment:</b><br>
<ul>
    <li> + Seems pretty fast. Faster than the previous LLM-based querying. 
    <li> + Also has a method that performs the search with a score.
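    <li> - Cosine distance is assumed here, which has pitfalls as mentioned here: https://marketbrew.ai/a/cosine-similarity#what-are-the-potential-challenges-or-limitations-of-using-cosine-similarity-in-certain-scenarios. Note: no distance metric is configured when the vector store is created, so Chroma's default applies (squared L2 unless <code>hnsw:space</code> is set) — confirm which metric these scores actually reflect.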
</ul>
<b>Possible Adjustment:</b><br> 
<ul>
    <li> Discuss with Prof. how to best overcome this. 
    <li> Combination method?
</ul>

### 4.10. Tests v.0 contd.

In [29]:
# Test v.0.2:
# Query a topic that is absent from the loaded context (`info`).
rag_chain.invoke("What is Chemistry?")

'Chemistry is the study of matter, its properties, composition, and interactions. It involves understanding the structure of atoms and molecules, as well as the changes they undergo. Chemistry is a fundamental science that plays a crucial role in various fields such as medicine, engineering, and environmental science.'

In [28]:
# Test v.0.3
# Adding the Link to the response, this time for a question outside the context.
# `rag_chain_with_source` (and the `altered_rag_chain` it wraps) are reused exactly
# as built in the Test v.0.1 cell; rebuilding an identical copy here only
# duplicated code.
result = rag_chain_with_source.invoke("What is Chemistry?")

# De-duplicate the source URLs while keeping retrieval order. A set would list
# them in an arbitrary order that can differ between kernel sessions, making the
# printed output non-reproducible.
relevant_docs = list(
    dict.fromkeys(doc.metadata.get('source') for doc in result['context'])
)

print(f"Answer:\n\n{result['answer']}\n\nDISCLAIMER: LLM Generated Summary. Summary is generated from the sources seen below.\n{relevant_docs}")

Answer:

Chemistry is the study of matter, its properties, composition, and interactions. It involves understanding the structure of atoms and molecules, as well as the changes they undergo. Chemistry is a fundamental science that plays a crucial role in various fields such as medicine, engineering, and environmental science.

DISCLAIMER: LLM Generated Summary. Summary is generated from the sources seen below.
['https://hao-ai-lab.github.io/dsc204a-w24/syllabus/']


Failed to batch ingest runs: LangSmithError("Failed to post https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPSConnectionPool(host='api.smith.langchain.com', port=443): Read timed out. (read timeout=10.0)\n")


In [27]:
%timeit vectorstore.similarity_search_with_score("What is Chemistry?")

similar_docs = vectorstore.similarity_search_with_score("What is Chemistry?")
for doc in similar_docs:
    print(doc)
    print("\n")

The slowest run took 10.13 times longer than the fastest. This could mean that an intermediate result is being cached.
300 ms ± 281 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
(Document(page_content='Syllabus\n        \n        \n      \n\n Table of Contents\n        \n        \n      \n\nSyllabus \nLogistics\nCourse Content and Format \nLectures\n3 Programming Assignments (PAs)\nExams\nScribe notes\nReading Summary\nParticipation\n\n\nPre-requisites\nGrading \nComponents\nCutoffs\n\n\nClassroom Rules\n\n\n\nThe course is organized into four parts, covering the following topics.\n\nFoundations of Data Systems: Data models, big data storage and retrieval, and how to encode information when you store data.\nScaling Distributed Systems: Cluster, cloud, edge, network, replication, partition, consistency, ACID.\nData Processing and Programming model: Batch processing, stream processing, MapReduce, Hadoop, Spark, Ray.\nMachine Learning Systems: GPUs, TensorFlow, PyTorch, data and m

<b>Assessment:</b><br>
<ul>
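    <li> - LLM Method: Prompt may need to be adjusted. The LLM still returns an answer (apparently from its own general knowledge) even though the question is unrelated to the context. One-shot prompt?
    <li> + The score returned by the similarity search (a distance, so higher means less similar) seems to be higher for the documents when the prompt is not related.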
    <li> - How to go about setting a threshold? Discuss this with the Prof. 
</ul>
<b>Possible Adjustment:</b><br> 
<ul>
    <li> Research Maximal Marginal Relevance (MMR) to potentially improve the similarity check.
    <li> Research other methods for document similarity check in this context, TF-IDF? Doc2Vec?
    <li> Research different Embedding methods. MTEB leaderboard? 
</ul>