# Imports

In [1]:
from glob import glob
import os

from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [2]:
# OpenAI embedding model used to vectorise the document chunks.
MODEL = "text-embedding-ada-002"
# Directory in which the Chroma vector store is persisted.
VECDB_DIR = "Vectordb/chroma/"

In [3]:
# List the PDF files under Data/ (the same pattern is used for loading below).
glob("Data/*.pdf")

['Data\\TrinityPostgraduateProspectus2024.pdf']

# PyPDFLoader

Load each PDF with pypdf into a list of documents.

In [4]:
# Load every PDF found under Data/ and flatten the per-file results into a
# single list of documents.
docs = [
    document
    for pdf_path in glob("Data/*.pdf")
    for document in PyPDFLoader(pdf_path).load()
]

# Split Documents

When you want to deal with long pieces of text, you need to split it into smaller chunks that can fit into your model's context window.

At a high level, text splitters work as follows:

- Split the text up into smaller chunks.
- Start combining these small chunks into a larger chunk until you reach a certain size.
- Once you reach that size, make that chunk its own piece of text and then start creating a new chunk of text with some overlap.

To customize your text splitter, you need to know:

- How the text is split
- How the chunk size is measured

**RecursiveCharacterTextSplitter** - It tries to split on ( "\n\n", "\n", " ", "" ) in order until the chunks are small enough. This has the effect of trying to keep all paragraphs (and then sentences, and then words) together as long as possible, as those would generically seem to be the strongest semantically related pieces of text.

In [5]:
# Initialise the splitter: chunk size 2500 with an overlap of 250 between
# consecutive chunks (sizes are measured by the splitter's length function,
# presumably characters by default — confirm against the LangChain docs).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=250,
    chunk_size=2500,
)

In [6]:
# Split the loaded documents into chunks, then display how many chunks
# were produced.

splits = text_splitter.split_documents(docs)
len(splits)

187

# Embeddings

Convert your text into a vector (a series of numbers that hold the semantic 'meaning' of your text).

Word vectors are used to assess text similarity and fetch relevant content.

In [7]:
# Embedding client for the model named in MODEL. The API key is read from the
# OPENAI_API_KEY environment variable (a KeyError is raised if it is not set).
embeddings = OpenAIEmbeddings(
    model=MODEL,
    chunk_size=16,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

# Store in Vector DB

A vector store takes care of storing embedded data and performing vector search for you.

In [8]:
# Build the index only once. Chroma.from_documents() adds to whatever already
# lives in persist_directory, so re-running this cell would otherwise store
# every chunk again and retrieval would start returning duplicate passages.
if os.path.isdir(VECDB_DIR) and os.listdir(VECDB_DIR):
    # Reuse the persisted index. Delete VECDB_DIR to force a rebuild after the
    # source PDFs or the splitter settings change.
    vectordb = Chroma(
        persist_directory=VECDB_DIR,
        embedding_function=embeddings,
    )
else:
    # First run: embed all chunks and persist them to VECDB_DIR.
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=VECDB_DIR,
    )

In [9]:
# Reloading
# vectordb = Chroma(persist_directory=VECDB_DIR, embedding_function=embeddings)

# Chatting with OpenAI

OpenAI Chat large language models API.
To use, you should have the openai python package installed, and the environment variable OPENAI_API_KEY set with your API key.

**Memory**

Help LLMs remember information. Memory is a bit of a loose term. It could be as simple as remembering information you've chatted about in the past or more complicated information retrieval.

**Retriever**

A vector store retriever is a retriever that uses a vector store to retrieve documents.

In [10]:
# Chat model used to answer questions; temperature 0 is requested. The API key
# is read from the OPENAI_API_KEY environment variable.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    temperature=0,
    openai_api_key=os.environ["OPENAI_API_KEY"],
)

In [11]:
# Conversation memory exposed to the chain under the "chat_history" key;
# return_messages=True keeps the history as message objects rather than text.
memory = ConversationBufferMemory(
    return_messages=True,
    memory_key="chat_history",
)

**ConversationalRetrievalChain** 

Chain for having a conversation based on retrieved documents.
This chain takes in chat history (a list of messages) and new questions, and then returns an answer to that question. 

The algorithm for this chain consists of three parts:

1. Use the chat history and the new question to create a “standalone question”. This is done so that this question can be passed into the retrieval step to fetch relevant documents. If only the new question was passed in, then relevant context may be lacking. If the whole conversation was passed into retrieval, there may be unnecessary information there that would distract from retrieval.

2. This new question is passed to the retriever and relevant documents are returned.

3. The retrieved documents are passed to an LLM along with either the new question (default behavior) or the original question and chat history to generate a final response.

In [12]:
# Retriever with default settings; search_type can select similarity or
# maximum marginal relevance search.
retriever = vectordb.as_retriever()

# Conversational chain wiring together the chat model, the retriever and the
# conversation memory.
qa = ConversationalRetrievalChain.from_llm(
    llm,
    memory=memory,
    retriever=retriever,
)

In [14]:
# Ask the chain a question; the result is a dict holding the question, the
# accumulated chat history and the answer.
# NOTE(review): the prompt contains a typo ("docuemnt"); it is left as-is
# because the recorded output below echoes it.
question = "What is this docuemnt about?"
result = qa({"question": question})
result

{'question': 'What is this docuemnt about?',
 'chat_history': [HumanMessage(content='What is this?'),
  AIMessage(content='This is a description of two different courses offered by Trinity College Dublin, The University of Dublin. The first course is in Information Engineering, which focuses on designing computational products and systems. The second course is in International Development, which prepares students for careers in the field of international development. The description includes information about career opportunities and admission requirements for each course.'),
  HumanMessage(content='What is this docuemnt about?'),
  AIMessage(content='The document is about a postgraduate course in Comparative Literature at Trinity College Dublin. It provides information about the course structure, career opportunities for graduates, and admission requirements.')],
 'answer': 'The document is about a postgraduate course in Comparative Literature at Trinity College Dublin. It provides in

In [15]:
# Follow-up question: "this subject" is not named in the prompt, so the chain
# presumably resolves it from the chat history kept in memory.
question = "What are the career opportunities in this subject?"
result = qa({"question": question})
result

{'question': 'What are the career opportunities in this subject?',
 'chat_history': [HumanMessage(content='What is this?'),
  AIMessage(content='This is a description of two different courses offered by Trinity College Dublin, The University of Dublin. The first course is in Information Engineering, which focuses on designing computational products and systems. The second course is in International Development, which prepares students for careers in the field of international development. The description includes information about career opportunities and admission requirements for each course.'),
  HumanMessage(content='What is this docuemnt about?'),
  AIMessage(content='The document is about a postgraduate course in Comparative Literature at Trinity College Dublin. It provides information about the course structure, career opportunities for graduates, and admission requirements.'),
  HumanMessage(content='What are the career opportunities in this subject?'),
  AIMessage(content="The

In [18]:
# Another follow-up that leaves "this subject" to be resolved from the
# conversation so far.
question = "What are the admission requirements for this subject?"
result = qa({"question": question})
result

{'question': 'What are the admission requirements for this subject?',
 'chat_history': [HumanMessage(content='What is this?'),
  AIMessage(content='This is a description of two different courses offered by Trinity College Dublin, The University of Dublin. The first course is in Information Engineering, which focuses on designing computational products and systems. The second course is in International Development, which prepares students for careers in the field of international development. The description includes information about career opportunities and admission requirements for each course.'),
  HumanMessage(content='What is this docuemnt about?'),
  AIMessage(content='The document is about a postgraduate course in Comparative Literature at Trinity College Dublin. It provides information about the course structure, career opportunities for graduates, and admission requirements.'),
  HumanMessage(content='What are the career opportunities in this subject?'),
  AIMessage(content="