In [1]:
from glob import glob
import os

from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain

In [2]:
# Name of the embedding model handed to OpenAIEmbeddings further down.
MODEL = "text-embedding-ada-002"
# Directory used as the Chroma persist_directory for the vector store.
VECDB_DIR = "Vectordb/chroma/"

In [3]:
# Show which PDFs under Data/ will be ingested. glob() returns paths in a
# file-system-dependent order, so sort them to keep the listing stable.
sorted(glob("Data/*.pdf"))

['Data\\TrinityPostgraduateProspectus2024.pdf']

In [4]:
# Load every PDF under Data/ and gather everything the loaders return into a
# single flat list. The paths are sorted because glob() yields them in a
# file-system-dependent order, which would otherwise let the order of `docs`
# (and of anything derived from it) vary between machines and runs.
docs = []
for each_file in sorted(glob("Data/*.pdf")):
    loader = PyPDFLoader(each_file)
    docs.extend(loader.load())

In [5]:
# Splitter used to cut the loaded documents into chunks: chunk_size=2500 with
# chunk_overlap=250, so consecutive chunks share a small overlapping region.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=250)

In [6]:
# Split the loaded documents into chunks, then display how many chunks resulted.
splits = text_splitter.split_documents(documents=docs)
len(splits)

187

In [7]:
# Embedding client. The API key is read from the OPENAI_API_KEY environment
# variable (a KeyError is raised here if it is not set) and the model name
# comes from MODEL.
# NOTE(review): chunk_size=16 presumably caps how many texts go into one
# embedding request - confirm against the OpenAIEmbeddings documentation.
embeddings = OpenAIEmbeddings(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    model=MODEL,
    chunk_size=16,
)

In [8]:
# Build the Chroma vector store, or reuse the one already persisted on disk.
# Running Chroma.from_documents() against a persist_directory that already
# holds a store embeds and adds the chunks again, so every re-run of this cell
# would duplicate entries and repeat the embedding requests. When VECDB_DIR
# already has content, open it instead. Delete VECDB_DIR to force a rebuild
# after the source PDFs or the splitter settings change.
if os.path.isdir(VECDB_DIR) and os.listdir(VECDB_DIR):
    vectordb = Chroma(
        persist_directory=VECDB_DIR,
        embedding_function=embeddings
    )
else:
    vectordb = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=VECDB_DIR
    )

In [9]:
# To reload the persisted vector store in a later session (no re-embedding), uncomment:
# vectordb = Chroma(persist_directory=VECDB_DIR, embedding_function=embeddings)

In [10]:
# Chat model that generates the answers. The API key is read from the
# OPENAI_API_KEY environment variable; temperature is set to 0 (presumably to
# keep answers as repeatable as possible - confirm intent).
llm = ChatOpenAI(
    openai_api_key=os.environ['OPENAI_API_KEY'],
    model_name='gpt-3.5-turbo',
    temperature=0,
)

In [11]:
# Conversation memory handed to the retrieval chain. The history is stored
# under the "chat_history" key, and return_messages=True makes it come back as
# message objects rather than a single string.
memory = ConversationBufferMemory(
    memory_key="chat_history",
    return_messages=True,
)

In [12]:
# Wire the pieces together: a retriever over the vector store, the chat model
# and the conversation memory become one conversational retrieval chain.
retriever = vectordb.as_retriever()
qa = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    memory=memory,
)

In [13]:
# Ask the chain a first question. The chain was built with `memory`, and the
# exchange shows up under 'chat_history' in the returned dict, so re-running
# this cell adds to the conversation history.
question = "What is this?"
result = qa({"question": question})
# Display the whole result dict ('question', 'chat_history', 'answer').
result

{'question': 'What is this?',
 'chat_history': [HumanMessage(content='What is this?'),
  AIMessage(content='This is a description of two different courses offered by Trinity College Dublin, The University of Dublin. The first course is in Information Engineering, which focuses on designing computational products and systems. The second course is in International Development, which prepares students for careers in the field of international development. The descriptions provide information about the career opportunities and admission requirements for each course.')],
 'answer': 'This is a description of two different courses offered by Trinity College Dublin, The University of Dublin. The first course is in Information Engineering, which focuses on designing computational products and systems. The second course is in International Development, which prepares students for careers in the field of international development. The descriptions provide information about the career opportunitie