In [1]:
import langchain.llms
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.document_loaders import PyPDFLoader
from langchain.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.retrievers import BM25Retriever, EnsembleRetriever
import torch

In [2]:
llm = Ollama(model="gemma:2b",  callbacks=CallbackManager([StreamingStdOutCallbackHandler()]),num_gpu=1, base_url="http://localhost:11434")

In [3]:
modelPath = "BAAI/bge-large-en-v1.5"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cuda:0'}
encode_kwargs = {'normalize_embeddings': True}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embedding = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
loader = DirectoryLoader("./data", glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()
len(documents)

44

In [5]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
len(texts)

291

In [6]:
import gc
torch.cuda.empty_cache()
gc.collect()

4266

In [7]:
from langchain.vectorstores import Chroma
persist_directory = './db'
vectordb = Chroma.from_documents(documents=texts, 
                                 embedding=embedding,
                                 persist_directory=persist_directory)
vectordb.persist()

In [8]:
import torch
dev = "cuda:0" if torch.cuda.is_available() else "cpu"
dev

'cuda:0'

In [9]:
retriever = vectordb.as_retriever(search_kwargs={'k': 7})

In [10]:
bm25_retriever = BM25Retriever.from_documents(texts)
bm25_retriever.k =  5

In [11]:
ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, retriever],
                                       weights=[0.3, 0.7])

In [12]:
ensemble_retriever.get_relevant_documents("course types, 5")

[Document(page_content='5   \n4.6 Course  Types  \n \nCourses may be classified as Theory only  (TH), Lab only (LO), Project only  (PJT) , \nEmbedded Theory and Lab (ETL), Embedded Lab and Project (ELP), Embedded Theory and \nProject (ETP) Embedded Theory , Lab and Project (ET LP). Courses  such as Seminar,  Mini Project/  \nDesign  Project / Summe r Project / Innovation  project, Capstone  project,  Master’s  \nDissertation/Thesis,  Comprehensive  exam,  Industrial  internship,  Co-/ Extra -Curricular, IIP/', metadata={'page': 8, 'source': 'data\\Academic-Regulations.pdf'}),
 Document(page_content='Students  also have  the option of choosing  from  a ‘basket  of courses’  within  each  classification.  \nAmple  options are given  to choos e interdisciplinary courses  from  other  program s which  will \nhelp the student  develop  additional  skills.  Slow  learners  will also benefit  since  important  \ncourses  are offered  in both semesters in any given academic year. This  arrange

In [13]:
from langchain.chains import RetrievalQA

qa_chain = RetrievalQA.from_chain_type(llm=llm, 
                                  retriever=ensemble_retriever,
                                  return_source_documents=True)

In [14]:
def process_llm_response(query):
    llm_response = qa_chain(query)
    return llm_response['result']
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])

In [16]:
query = "What do I need for Semester Abroad?"
process_llm_response(query)
# qa_chain(query)

According to the context, students need to register for additional program elective courses listed in their curriculum to earn the 'Honours' credential.

"According to the context, students need to register for additional program elective courses listed in their curriculum to earn the 'Honours' credential."

In [17]:
query = "What are course types? Explain in detail."
process_llm_response(query)

The context does not specify what course types are, so I cannot answer this question from the context.

'The context does not specify what course types are, so I cannot answer this question from the context.'

# IMPLEMENTING CachedBackEmbeddings on HuggingFaceEmbeddings

In [18]:
from langchain.embeddings import CacheBackedEmbeddings
%pip install --upgrade --quiet  langchain-openai faiss-cpu

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [19]:
from langchain.storage import LocalFileStore
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

# underlying_embeddings = OpenAIEmbeddings()

store = LocalFileStore("./cache/")

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embedding, store
)

In [20]:
list(store.yield_keys())

['03bcea88-ccb1-5f8a-beb1-81fdaa608adc',
 '0559df5c-3a73-511f-84f0-ef3946bedd01',
 '06b24394-2006-5082-8c48-8671e6b11e9a',
 '09d7d7cf-ed06-52a5-b302-b909e8994890',
 '0a1037fc-5859-5e67-b364-e08cf319522f',
 '0abac0db-7db2-5fd5-82dd-a616503495a0',
 '0b655443-1935-5425-bd02-bfdca2365578',
 '0b6e5abc-7173-5a97-967a-28fb97350a33',
 '0be0e6d6-797b-58c5-b53e-b14ab6dbc4d9',
 '0ef87309-11c6-5270-a7fb-858198f2b610',
 '10a32057-64a9-5536-b216-0a3cff8ded0d',
 '10faea12-8729-5fcd-97b4-b6060d620c3d',
 '1203035f-4815-562a-95fc-3931c59ad4d6',
 '13b3ebb7-3219-545b-a1f9-37c11e513460',
 '16ed01d0-1a2b-5699-a815-36b31e34fa1e',
 '17a0efe9-12fe-58e2-9682-6462fa4b1df4',
 '1b51fd55-7ff8-574b-a083-dd9b8c04286f',
 '1eba8b7b-43da-5d1c-b4f8-dbc6031f5778',
 '1ffd27b5-5702-58dd-8205-a68c452a6985',
 '2185bae5-c62d-5651-a171-ac35341d974e',
 '24228d25-9dd9-5a8a-8e80-d7f4b97e576e',
 '247d6cee-0023-56a2-8f96-a13cd10cf5d4',
 '28034214-1488-5ff3-833c-5ca29f2d0b74',
 '280d73ee-f28c-5225-b624-e6384cf8b3fe',
 '2a099eef-fea7-

In [21]:
raw_documents = DirectoryLoader("./data", glob="*.pdf", loader_cls=PyPDFLoader).load()
text_splitter = CharacterTextSplitter(chunk_size=1500, chunk_overlap=0)
documents = text_splitter.split_documents(raw_documents)

In [22]:
%%time
db = FAISS.from_documents(documents, cached_embedder)

CPU times: total: 250 ms
Wall time: 2.72 s


In [23]:
list(store.yield_keys())[:5]

['03bcea88-ccb1-5f8a-beb1-81fdaa608adc',
 '0559df5c-3a73-511f-84f0-ef3946bedd01',
 '06b24394-2006-5082-8c48-8671e6b11e9a',
 '09d7d7cf-ed06-52a5-b302-b909e8994890',
 '0a1037fc-5859-5e67-b364-e08cf319522f']

In [24]:
from langchain.embeddings import CacheBackedEmbeddings
from langchain.storage import InMemoryByteStore

store = InMemoryByteStore()

cached_embedder = CacheBackedEmbeddings.from_bytes_store(
    embedding, store
)

# Implementing MilvusVectorDatabase

In [25]:
%pip install --upgrade --quiet  pymilvus

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [26]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import Milvus
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter

In [None]:
from pymilvus import connections, db

conn = connections.connect(host="127.0.0.1", port=19530)

database = db.create_database("book")


In [None]:
vector_db = Milvus.from_documents(
    raw_documents,
    embedding,
    connection_args={"host": "127.0.0.1", "port": "19530"},
)

# Implementing OCR 

In [1]:
!pip install pytesseract
from PIL import Image
import pytesseract
import numpy as np
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'
filename = './data/vitfees.jpeg'
img1 = np.array(Image.open(filename))
text = pytesseract.image_to_string(img1)




[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
print(text)

Scholarship Group A (INR) Group B (INR)
ae Total

Fee fee Fee Fee fee Fee

Advance Balance Total Advance Balance

1 1,76,000 1,50,000 26,000 1,98,000 | 1,98,000 0

" 2,35,000 1,50,000 85,000 3,07,000 | 2,00,000 1,07,000
ml 3,43,000 2,00,000 1,43,000 4,05,000 | 2,50,000 1,55,000
Vv 3,68,000 2,00,000 1,68,000 4,48,000 3,00,000 1,48,000

Vv 3,98,000 2,50,000 1,48,000 4,93,000 3,00,000 1,93,000




In [3]:
filename = './data/vittt.png'
img1 = np.array(Image.open(filename))
text = pytesseract.image_to_string(img1)

In [4]:
print(text)

The Final Assessment Test (FAT) Theory Schedule (Closed Book Exam) for all courses offered during the
Winter Semester 2023-24 for is available in V-Top. The same is presented below for your reference.

FAT Schedule - Winter Semester 2023-24

Exam Session 1 (09:30 AM - 12:30 PM)
Exam Day,
siete Schedule
‘A, AISTAISTAAL AIATAL, IL14MI+R12,_ MA2+UIL, 13eNSsMa,ThaR@¥U3,
06-May-24 | Monday | AL+TALTAAL+V1, I3+14+U3, 13+U3, 17418, I74R74UT, I7U4, 1B4M7, M24R7, M34N3+U9,
N7+NB+N9+N1O & R34R4
O7-May-24 | Tuesday | A2, A2+TA2, AZ+TA2+TAA2, A2¥TA2FTAA2WV3 & A2TAZ*VL
Bi, BIVTBL, 14+I5+N2, 12+I4+i5, 174R4, IBsM9, 11J6MI3¥NI2, NIGeRIIVU3, I5¥ND,
08 May-24 | Wednesday 18+U1, 19+110+M2, 19+N2, I9+U7, M2+N3, M3+N3, M44R9+U7 & R2+R3
09-May-24 | Thursday | 62, 82+7B2, B2+TB2+TBB2 & B2+TB2+TBB2+VA
Di, DIFTDI, DIFTDIAVA, IOeR3eR4, IOFRS, MasM5, MasN2+R1, MS+RS¢R4, M7ANEFR7,
10-May-24 Friday M7485, MB+N9 & N7+R9
11-May-24 | Saturday | 02, 02+7D2, D2+TO2+V6 & D2+TOZ+TDDZ
Taaay-24 | Monday | Ch CHTCH CLTCIST