In [2]:
import os
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from tqdm import tqdm

In [3]:
persist_directory="..//data//document_embeddings"

In [3]:
def create_embeddings():
    DATA_PATH = "../data/documents"

    embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    documents = []
    
    for filename in os.listdir(DATA_PATH):
        if filename.endswith(".pdf"):
            file_path = os.path.join(DATA_PATH, filename)
            raw_documents = PyPDFLoader(file_path).load()
            
            splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=200)
            documents.extend(splitter.split_documents(raw_documents))
    
    db = Chroma.from_documents(documents[:2], embedding_function, persist_directory=persist_directory)
    
    for doc in tqdm(documents[2:], desc="Adding Documents"):
        db.add_documents(documents=[doc])
    
    db.persist()
    db = None

create_embeddings()

  from tqdm.autonotebook import tqdm, trange
Adding Documents: 100%|██████████| 439/439 [00:15<00:00, 29.11it/s]


In [4]:
model_kwargs = {'device': 'cuda'}
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2", model_kwargs=model_kwargs)

vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding_function)
resources = vectordb.similarity_search("Dear Sir, Could you update me on how many credits do I require for my CS graduation.", k=3)

  from tqdm.autonotebook import tqdm, trange


In [5]:
resources

[Document(page_content='194. UNDERGRADUATE DEGREE REQUIREMENTS, REGULATIONS AND PROCEDURES\n4.1 Overall Requirements\n4.1.1 B.Tech.\nThe total credit requirement for the B.Tech. (4-year programme) is 148-158 credits (exact requirement is discipline \nspecific). The minimum and maximum number of registered semesters for graduation requirements are listed in \nTable 8. For B.Tech. programmes, the total credits are distributed over following categories :\n(a) Institute Core (IC):\n • Basic Sciences (BS): Mathematics, Physics, Chemistry and Biology courses\n • Engineering Arts and Science (EAS): Fundamental engineering courses\n • Humanities and Social Sciences (HUSS): At least two courses to be taken in the 200 level  \n and at least one course to be taken in the 300 level. Management Courses (MSL 3XX) are not  \n counted under this category.\n(b) Departmental Core (DC): courses of relevant discipline.\n(c) Departmental Electives (DE): electives related to the parent discipline.\n(d) Prog