In [1]:
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
import torch


In [2]:
llm = Ollama(model="mistral",  callbacks=CallbackManager([StreamingStdOutCallbackHandler()]),num_gpu=1, base_url="http://localhost:11434")

In [3]:
modelPath = "BAAI/bge-large-en-v1.5"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cuda:0'}
encode_kwargs = {'normalize_embeddings': True}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embedding = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
loader = DirectoryLoader("./data", glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()
len(documents)


93

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
texts = text_splitter.split_documents(documents)
len(texts)

522

In [6]:
# import torch
# torch.cuda.empty_cache()
# print(torch.cuda.memory_summary(device=None, abbreviated=False))
import gc
torch.cuda.empty_cache()
gc.collect()

8440

In [7]:
from langchain.vectorstores import Chroma
persist_directory = './db'
vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)
vectordb.persist()

In [8]:
import torch
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
dev

'cuda:0'

In [9]:
retriever = vectordb.as_retriever(search_kwargs={'k': 7})

In [10]:
bm25_retriever = BM25Retriever.from_documents(texts)
bm25_retriever.k =  5

In [11]:
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever],
                                       weights=[0.3, 0.7])

In [12]:
ensemble_retriever.get_relevant_documents("NPTEL is it a Discipline Core or Open Elective or something else")

[Document(page_content='• Balanced curriculum with engineering, science, humanities and management courses.  \n• Project based curriculum which emphasizes learning by \ndoing  \n• Ample opportunities to do inter -disciplinary courses. \n• Soft on slow learners by offering important /common courses in all semesters.  \n• Optional Summer / lnt er-session semester  \n• Opportunity to gain under graduate research experience.  \n• Value addition with double Major / Honours  / Minor \noption.', metadata={'page': 43, 'source': 'data/19.pdf'}),
 Document(page_content='NPTEL\tCOURSE\tREGISTRATION\t\tQ22:\tHow\tmany\tNPTEL\tcourses\tcan\tbe\tregistered\tper\tsemester?\tANS:\tOnly\tone\tcourse\tcan\tbe\tregister\tper\tsemester.\t\tQ23:\tWill\tthe\tNPTEL\tcourse\tcredits\tconsidered\twithin\tthe\tmaximum\tlimit\t27\tcredits?\tANS:\tNo.\tNPTEL\tcourse\tcredit\twon’t\tbe\tconsidered\twith\tin\tmaximum\t27\tcredits.\t\tQ24:\tHow\twill\tI\tupload\tmy\tNPTEL\tcertidicate?\tANS:\tFirst\tyou\tneed\tto\tr

In [13]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(llm=llm, 
                                  retriever=ensemble_retriever,
                                  return_source_documents=True)
     

In [14]:
def process_llm_response(query):
    llm_response = qa_chain(query)
    return llm_response['result']
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [15]:
query = "NPTEL is it a Discipline Core or Open Elective or something else"
process_llm_response(query)
# qa_chain(query)

 NPTEL courses can be registered as Open Electives since they are not specific to any particular discipline. However, they should meet the advanced level requirement from reputed platforms such as NPTEL, Coursera, Khan Academy, and edX.

' NPTEL courses can be registered as Open Electives since they are not specific to any particular discipline. However, they should meet the advanced level requirement from reputed platforms such as NPTEL, Coursera, Khan Academy, and edX.'