<a href="https://colab.research.google.com/github/powervnc/RAG_Jane_Austen/blob/main/RAG_with_Jane_Austen_Novels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# INSTALLING LIBRARIES

In [None]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain

In [3]:
!pip install python-dotenv



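- langchain_community -> we will use it for the Chroma vector store integration
- langchain-openai -> OpenAI embeddings (OpenAIEmbeddings) and chat models (ChatOpenAI)
- tiktoken -> used for tokenization when splitting the text into chunks
- langchainhub -> for prompt templates
- chromadb -> LOCAL vector database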

# SETTING UP THE ENV KEYS


In [1]:
%%writefile .env
OPENAI_API_KEY=
LANGSMITH_TRACING=true
LANGSMITH_API_KEY=

Overwriting .env


In [2]:

from dotenv import load_dotenv
import os

# Load the key=value pairs from the .env file written by the previous cell
# into the process environment, where the OpenAI / LangSmith clients look for them.
load_dotenv()


True

# DOWNLOADING THE JANE AUSTEN NOVELS FROM PROJECT GUTENBERG

In [20]:
# Novel key -> plain-text download URL on Project Gutenberg.
# Each URL is built from the book's Gutenberg ebook id; the keys are reused
# later as file names and as vector-store collection names.
novels_info = {
    slug: f"https://www.gutenberg.org/files/{ebook_id}/{ebook_id}-0.txt"
    for slug, ebook_id in (
        ("sense_and_sensibility", 161),
        ("pride_and_prejudice", 1342),
        ("emma", 158),
    )
}


In [21]:
from pathlib import Path

# Folder (relative to the working directory) where the downloaded novel texts are stored.
novels_folder = Path("novels")
# exist_ok=True keeps this cell re-runnable when the folder already exists.
novels_folder.mkdir(exist_ok=True)


In [22]:
import requests

MAX_TRIES = 5
# Without a timeout a stalled connection would block the cell indefinitely.
REQUEST_TIMEOUT_SECONDS = 30

# Download every novel that is not yet on disk, retrying failed requests up to MAX_TRIES times.
for name, url in novels_info.items():
    complete_path = novels_folder / f"{name}.txt"
    if complete_path.exists():
        print(f"Already downloaded {name}")
        continue

    for attempt in range(MAX_TRIES):
        try:
            print(f"Downloading {name}")
            req = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            req.raise_for_status()  # raises for 4xx/5xx responses so they are retried too
            complete_path.write_text(req.text, encoding="utf-8")
            print(f"Finished downloading {name}")
            break
        except requests.exceptions.RequestException as e:
            print(f"Error downloading {name}: {e}")
    else:
        # for/else: reached only when no attempt ended with `break`, i.e. every try failed.
        # (The previous `i == MAX_TRIES` check could never be true inside range(MAX_TRIES).)
        print(f"Reached maximum downloading tried for {name}")


Already downloaded sense_and_sensibility
Already downloaded pride_and_prejudice
Already downloaded emma


# CREATING THE CORRESPONDING DATABASES FOR THE NOVELS


In [23]:
# TextLoader is imported from langchain_community (already used below for Chroma)
# instead of the legacy `langchain.document_loaders` path.
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings


# chunk_size / chunk_overlap are measured in tiktoken tokens, not characters.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=50,
)

# One embedding client shared by all collections instead of a new one per novel.
embeddings = OpenAIEmbeddings()

# Novel key -> Chroma vector store holding the embedded chunks of that novel.
vectorstores = {}

# NOTE(review): no persist_directory is passed, so the stores are presumably in-memory only;
# re-running this cell in the same kernel may add the same chunks to the existing
# collections again -- confirm against the Chroma documentation.
for name in novels_info:
    loader = TextLoader(str(novels_folder / f"{name}.txt"), encoding="utf-8")
    docs = loader.load()
    splits = text_splitter.split_documents(docs)
    vectorstores[name] = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        collection_name=name,
    )




In [24]:
vectorstores

{'sense_and_sensibility': <langchain_community.vectorstores.chroma.Chroma at 0x7da1d430e6f0>,
 'pride_and_prejudice': <langchain_community.vectorstores.chroma.Chroma at 0x7da1d430fd70>,
 'emma': <langchain_community.vectorstores.chroma.Chroma at 0x7da1d430fd40>}

# ROUTING TO THE APPROPRIATE DATABASE

In [25]:
from typing import Literal

from langchain_core.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from langchain_openai import ChatOpenAI


# Structured-output schema for the router: restricts the model's answer to one of the
# three novel keys, which are the same keys used for `novels_info` and `vectorstores`.
# NOTE(review): documented with `#` comments on purpose -- a class docstring may be
# forwarded to the model as part of the schema by `with_structured_output`; confirm
# before adding one.
class RouteQuery(BaseModel):
    # Key of the novel whose vector store should be searched; `...` marks the field as required.
    datasource: Literal["emma", "pride_and_prejudice", "sense_and_sensibility"] = Field(
        ...,
        description="Given a user question choose which datasource(novel) would be most relevant for answering their question",
    )

# temperature=0 keeps routing as repeatable as possible; with_structured_output makes
# the model answer with a RouteQuery instance instead of free text.
llm = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0)
llm = llm.with_structured_output(RouteQuery)

# The data sources are novels, so the routing instruction refers to the novel the
# question is about (the previous wording mentioned "programming language").
system = (
    "You are an expert at routing a user question about Jane Austen's novels to the "
    "appropriate data source. Based on the novel the question is referring to "
    "(its title, characters, places or plot), route it to the relevant data source."
)


prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{question}"),
    ]
)

# question -> RouteQuery(datasource=<novel key>)
router = prompt | llm




In [26]:
# Example question reused by the retrieval and answer cells below.
question = "What is Elizabeth's opinions about love and courtship"
# Ask the router which novel's vector store should answer this question.
result = router.invoke({"question": question})

result

RouteQuery(datasource='pride_and_prejudice')

In [27]:
result.datasource

'pride_and_prejudice'

In [28]:
def chose_retriever(datasource):
    """Return a retriever over the vector store registered under `datasource`.

    The lookup goes through the module-level `vectorstores` dict, so every novel
    that has a vector store is supported without maintaining a second hard-coded
    list of names here.

    Args:
        datasource: Novel key as produced by the router (e.g. "emma").

    Returns:
        The retriever created by the store's `as_retriever()`, or None when no
        vector store is registered under `datasource`.
    """
    store = vectorstores.get(datasource)
    if store is None:
        return None
    return store.as_retriever()


In [29]:
# Retriever over the vector store of the novel the router picked for `question`.
retriever = chose_retriever(result.datasource)

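# MULTI-QUERY APPROACH TO QUERY TRANSLATION
See the prompt template in the next cell: it asks the model to rephrase the user question in several different ways, so that retrieval depends less on the exact wording of the original question.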

In [30]:
# ChatPromptTemplate is imported from langchain_core (as in the routing cell)
# instead of the legacy `langchain.prompts` path.
from langchain_core.prompts import ChatPromptTemplate

template = """You are an AI language model assistant. Your task is to generate seven
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines. Original question: {question}"""

prompt_perspective = ChatPromptTemplate.from_template(template)

from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI


def _split_queries(text):
    """Split the model's newline-separated answer into a list of non-empty queries.

    Blank lines (e.g. when the model separates the questions with empty lines) are
    dropped so that no empty query is sent to the retriever.
    """
    return [line.strip() for line in text.split("\n") if line.strip()]


# {"question": ...} -> list of alternative phrasings of the question
generate_queries = (
    prompt_perspective
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_queries
)



In [31]:
# dumps/loads are imported from langchain_core instead of the legacy `langchain.load` path.
from langchain_core.load import dumps, loads


def get_unique_docs(documents: list[list]):
    """Flatten per-query retrieval results and drop duplicate documents.

    Args:
        documents: One list of retrieved documents per generated query.

    Returns:
        The distinct documents in the order they were first seen. (A plain
        `set` would return them in an order that can change from run to run.)
    """
    # Serialize each document to a string so that equal documents compare equal and are hashable.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return [loads(doc) for doc in dict.fromkeys(serialized)]

In [32]:


# Retrieval pipeline: question -> alternative queries -> documents per query -> unique documents.
#
# retriever.map() is a LangChain Runnable method that:
#   - takes a list of queries,
#   - performs retrieval for each query individually,
#   - returns a list of lists of documents, one inner list per query.
#
# Note: `retriever` is bound when this chain is built, so the chain always searches the
# novel that was selected for the question routed above.

chain = (
    generate_queries
    | retriever.map()
    | get_unique_docs
)
docs = chain.invoke({"question": question})
len(docs)



3

In [33]:
from operator import itemgetter
from langchain_openai import ChatOpenAI
from langchain_core.runnables import RunnablePassthrough

# RAG answer prompt: {context} is filled with the retrieved documents and
# {question} with the user question when the final chain is invoked.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""


In [34]:
# Answer-generation prompt. This rebinds `prompt`, which the routing cell also assigned;
# the already-built `router` keeps its own reference to the earlier prompt object.
prompt = ChatPromptTemplate.from_template(template)

In [35]:
# Chat model that writes the final answer. This rebinds `llm` from the routing cell;
# unlike that cell, no explicit model name is passed here.
llm = ChatOpenAI(temperature=0)


In [36]:
def _format_docs(documents):
    """Join the text of the retrieved documents into a single context string."""
    return "\n\n".join(doc.page_content for doc in documents)


# Full RAG pipeline: retrieve context for the question, fill the answer prompt,
# call the chat model and return its reply as a plain string.
final_chain = (
    {
        # Pass only the chunk text to the prompt; without formatting, {context} would be
        # filled with the repr of a list of Document objects (metadata included).
        "context": chain | _format_docs,
        # itemgetter("question") grabs "question" from the input dict given to invoke().
        "question": itemgetter("question"),
    }
    | prompt
    | llm
    | StrOutputParser()
)

final_chain.invoke({"question": question})


"Elizabeth seems to have a more practical and realistic view of love and courtship. She acknowledges that Lydia's behavior is shocking and lacks decency and virtue, but also recognizes that Lydia is young and has been influenced by her surroundings. Elizabeth understands that Wickham's charm and charisma can captivate a woman, but she also seems to have a more grounded perspective on the nature of love and relationships."