<a href="https://colab.research.google.com/github/sugarhy/RAG-Implementation/blob/master/RAG_Implementation2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install and import libraries**

In [None]:
!pip install langchain openai weaviate-client
!pip install python-dotenv
!pip install tiktoken

Collecting langchain
  Downloading langchain-0.1.5-py3-none-any.whl (806 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m806.7/806.7 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting openai
  Downloading openai-1.11.1-py3-none-any.whl (226 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m226.1/226.1 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting weaviate-client
  Downloading weaviate_client-4.4.2-py3-none-any.whl (293 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.5/293.5 kB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain)
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl (12 kB)
Collecting langchain-community<0.1,>=0.0.17 (from langchain)
  Downloading langchain_community-0.0.17-py3-none-any.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━

In [None]:
import os
from getpass import getpass

import dotenv
import openai
import requests
import weaviate
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Weaviate
from weaviate.embedded import EmbeddedOptions



In [None]:
# OpenAI API key.
# SECURITY: a literal key used to be committed in this cell. Treat that key as
# compromised and revoke it; never hardcode secrets in a notebook, because they
# leak through version control and shared copies.
# Lookup order: a local .env file or an already-set environment variable,
# otherwise an interactive prompt whose input is not echoed into the notebook.
dotenv.load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

# Export the key as well, so libraries that read it from the environment
# instead of an explicit argument can find it.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [None]:
# Load variables from a .env file (if one exists) into the process environment.
# NOTE(review): the recorded output `False` suggests no .env file was found
# in the run that produced this notebook -- confirm.
dotenv.load_dotenv()

False

# **Preparation**
Prepare a vector database as an external knowledge source that holds all the additional information:
1. Collect and load your data
2. Transform (chunk) your documents
3. Embed and store chunks

### **Collecting and Loading your data**

In [None]:
# data (additional context): Max Verstappen Time Magazine article
url = "https://raw.githubusercontent.com/sugarhy/RAG-Implementation/master/verstappen.txt"

# A timeout keeps the cell from hanging forever on a stalled connection.
res = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx response instead of silently saving an error page
# as the knowledge source.
res.raise_for_status()

# The article contains non-ASCII characters (e.g. curly apostrophes), so write
# and read it with an explicit encoding rather than the platform default.
with open("verstappen.txt", "w", encoding="utf-8") as f:
    f.write(res.text)

loader = TextLoader("./verstappen.txt", encoding="utf-8")
documents = loader.load()

### **Transform (chunk) your documents**

In [None]:
# Chunking configuration: target chunk length and the overlap shared between
# neighbouring chunks (the overlap gives the LLM some surrounding context).
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 150

text_splitter = CharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
# Break the loaded documents into the chunks that get embedded later.
chunks = text_splitter.split_documents(documents)

### **Embed and store the chunks**
To enable semantic search across the text chunks, we need to generate a vector embedding for each chunk and then store the chunks together with their embeddings in a vector database.

To generate the vector embeddings, we will use the OpenAI embedding model; to store them, we will use the Weaviate vector database. Calling `.from_documents()` populates the vector database with the chunks.

In [None]:
from dotenv import find_dotenv, load_dotenv

# Pick up any variables defined in a .env file located by find_dotenv().
load_dotenv(find_dotenv())

# Client backed by an embedded Weaviate instance (no external server needed).
client = weaviate.Client(embedded_options=EmbeddedOptions())

# Embed every chunk with the OpenAI embedding model and store the chunks,
# together with their vectors, in Weaviate.
vectorstore = Weaviate.from_documents(
    documents=chunks,
    embedding=OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY),
    client=client,
    by_text=False,
)

Binary /root/.cache/weaviate-embedded did not exist. Downloading binary from https://github.com/weaviate/weaviate/releases/download/v1.23.7/weaviate-v1.23.7-Linux-amd64.tar.gz
Started /root/.cache/weaviate-embedded: process ID 1196


  warn_deprecated(


# **Step 1: Retrieve**
The retriever component fetches the additional context based on the semantic similarity between the user query and the embedded chunks.

In [None]:
# Expose the vector store through the retriever interface. No arguments are
# passed, so the store's default search settings apply.
retriever = vectorstore.as_retriever()

# **Step 2: Augment**
To augment the prompt with the additional context, you need to prepare a prompt template. The prompt can then be easily customised by editing the template.

In [None]:
# COSTAR
# NOTE(review): presumably refers to the CO-STAR prompt-structuring framework -- confirm.
# Prompt template with a persona and answer-style instructions. The
# {question} and {context} placeholders are filled in when the chain runs.
template = """You are a journalist working for a reputable magazine tasked on question-answering.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.
Use at least three sentences and keep the answer concise.
Question: {question}
Context: {context}
Answer:
"""
prompt = ChatPromptTemplate.from_template(template)

# Show the resulting prompt object, including the input variables parsed
# from the template.
print(prompt)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template="You are a journalist working for a reputable magazine tasked on question-answering. \nUse the following pieces of retrieved context to answer the question.\nIf you don't know the answer, just say that you don't know.\nUse at least three sentences and keep the answer concise.\nQuestion: {question}\nContext: {context}\nAnswer:\n"))]


# **Step 3: Generate**
Build a chain for the RAG pipeline, chaining together the retriever, the prompt template and the LLM. Once the RAG chain is defined, you can invoke it.

In [None]:
# Generation parameters -- adjust as needed.
max_tokens = 512   # upper bound on the number of tokens generated per answer
temperature = 0.7  # sampling temperature

# Both settings belong on the chat model: that is the component that generates
# text. The API key comes from the OPENAI_API_KEY variable defined earlier in
# the notebook -- never paste the literal secret into a cell.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=temperature,
    max_tokens=max_tokens,
    openai_api_key=OPENAI_API_KEY,
)

# RAG pipeline: the retriever supplies {context}, the raw query is passed
# through unchanged as {question}, the filled prompt goes to the LLM, and the
# parser reduces the chat message to a plain string. The parser takes no
# generation settings.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

query = "What factors have contributed to the recent surge in popularity of Formula One, particularly in the United States??"
rag_chain.invoke(query)

  warn_deprecated(


'The recent surge in popularity of Formula One, particularly in the United States, can be attributed to several factors. One major factor is the success of the Netflix series "Formula 1: Drive to Survive," which has turned F1 figures into recognizable stars and has attracted a larger audience to the sport. Additionally, F1\'s modernized marketing and proactive approach to social media under Liberty Media have helped to increase its popularity, with a significant rise in subscribers to F1\'s YouTube channel. The introduction of new races in the United States, such as the Miami and Las Vegas Grand Prix, has also contributed to the sport\'s growing popularity in the country.'

In [None]:
# Generation parameters -- adjust as needed.
max_tokens = 512   # upper bound on the number of tokens generated per answer
temperature = 0.7  # sampling temperature

# Both settings belong on the chat model: that is the component that generates
# text. The API key comes from the OPENAI_API_KEY variable defined earlier in
# the notebook -- never paste the literal secret into a cell.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=temperature,
    max_tokens=max_tokens,
    openai_api_key=OPENAI_API_KEY,
)

# Same pipeline as for the previous question, rebuilt here so the cell does
# not depend on which chain was defined last. The parser takes no generation
# settings.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

query = "How did Jos treat Max?"
rag_chain.invoke(query)

"Jos treated Max with a strict and demanding parenting style. There were arguments and tense moments between them, and Jos even left Max at a gas station and didn't speak to him for a week as a form of punishment. While Jos denies abusing Max, he has admitted to hitting him once on the helmet before a race."

In [None]:
# Minimal prompt variant: only the retrieval instruction and the two
# placeholders, without the persona and answer-style rules.
template1 = """ Use the following pieces of retrieved context to answer the question.
Question: {question}
Context: {context}
Answer:
"""
prompt1 = ChatPromptTemplate.from_template(template1)

# Show the resulting prompt object, including the input variables parsed
# from the template.
print(prompt1)

input_variables=['context', 'question'] messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], template=' Use the following pieces of retrieved context to answer the question.\nQuestion: {question}\nContext: {context}\nAnswer:\n'))]


In [None]:
# Generation parameters -- adjust as needed.
max_tokens = 512   # upper bound on the number of tokens generated per answer
temperature = 0.7  # sampling temperature

# Both settings belong on the chat model: that is the component that generates
# text. The API key comes from the OPENAI_API_KEY variable defined earlier in
# the notebook -- never keep a literal secret in a cell, not even in a
# commented-out line.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=temperature,
    max_tokens=max_tokens,
    openai_api_key=OPENAI_API_KEY,
)

# Same pipeline as before, but using the minimal prompt (prompt1) so the two
# prompt styles can be compared on the same question. The parser takes no
# generation settings.
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt1
    | llm
    | StrOutputParser()
)

query = "What factors have contributed to the recent surge in popularity of Formula One, particularly in the United States??"
rag_chain.invoke(query)

'Factors that have contributed to the recent surge in popularity of Formula One, particularly in the United States, include the success of Max Verstappen, the Netflix series "Formula 1: Drive to Survive," and the efforts of Liberty Media to modernize marketing and expand social media presence. Verstappen\'s rise as a talented driver and his replacement of Lewis Hamilton as the face of Formula One has garnered attention and interest. The Netflix series has turned Formula One figures into recognizable stars and has attracted a larger audience, especially in the United States. Additionally, under Liberty Media\'s ownership, Formula One has taken a more proactive approach to social media, resulting in a significant increase in subscribers to its YouTube channel. These factors, along with the introduction of new races in the United States, have contributed to the recent surge in popularity of Formula One.'

In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
Installing collected pa

In [None]:
from datasets import Dataset

# Evaluation questions and their hand-written reference answers. Each
# reference is wrapped in its own list (one list per question).
questions = [
    "What did the Christian Horner say about max verstappen?",
    "What factors contributed to the recent surge in popularity in f1, particularly in the U.S?",
    "What record has Max Verstappen broken in F1?",
]
ground_truths = [
    ["Christian Horner praised Max Verstappen's remarkable talent and hailed him as the best driver in Formula 1, emphasizing his consistency and performance throughout the season."],
    ["Factors such as engaging digital content, competitive races, and the rise of talented young drivers like Max Verstappen and Lewis Hamilton."],
    ["Verstappen has broken the record for the youngest Formula 1 world champion in history."],
]

# Inference: for every question, record the chain's answer and the text of
# the chunks the retriever returns for that same question.
answers, contexts = [], []
for query in questions:
    answers.append(rag_chain.invoke(query))
    retrieved = retriever.get_relevant_documents(query)
    contexts.append([doc.page_content for doc in retrieved])

# Column-oriented dict: one key per dataset column.
data = dict(
    question=questions,
    answer=answers,
    contexts=contexts,
    ground_truths=ground_truths,
)

# Convert dict to dataset
dataset = Dataset.from_dict(data)

In [None]:
pip install ragas

Collecting ragas
  Downloading ragas-0.0.22-py3-none-any.whl (52 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/52.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.4/52.4 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting pysbd>=0.3.4 (from ragas)
  Downloading pysbd-0.3.4-py3-none-any.whl (71 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/71.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.1/71.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pysbd, ragas
Successfully installed pysbd-0.3.4 ragas-0.0.22


In [None]:
# import os
# print(os.environ.get("OPENAI_API_KEY"))

None


In [None]:
import pandas

In [None]:
# `evaluate` must be imported here: with the import commented out, this cell
# raises NameError on a fresh kernel (Restart & Run All).
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Score the collected questions, answers and contexts on the two retrieval
# metrics (context precision / recall) and the two generation metrics
# (faithfulness / answer relevancy).
result = evaluate(
    dataset=dataset,
    metrics=[
        context_precision,
        context_recall,
        faithfulness,
        answer_relevancy,
    ],
)

# One row per question, one column per metric.
df = result.to_pandas()
print(df)

evaluating with [context_precision]


100%|██████████| 1/1 [00:02<00:00,  2.72s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:04<00:00,  4.04s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:10<00:00, 10.30s/it]


evaluating with [answer_relevancy]


100%|██████████| 1/1 [00:02<00:00,  2.65s/it]


                                            question  \
0  What did the Christian Horner say about max ve...   
1  What factors contributed to the recent surge i...   
2       What record has Max Verstappen broken in F1?   

                                              answer  \
0  Christian Horner said that Max Verstappen was ...   
1  The factors that contributed to the recent sur...   
2  Max Verstappen has broken the record for the m...   

                                            contexts  \
0  [Competitors weren’t exactly enamored with an ...   
1  [Drive to Survive has turned F1 figures into r...   
2  [He tells TIME he hit him just once—on the hel...   

                                       ground_truths  context_precision  \
0  [Christian Horner praised Max Verstappen's rem...                0.0   
1  [Factors such as engaging digital content, com...                1.0   
2  [Verstappen has broken the record for the youn...                0.5   

   context_recall  faith