In [1]:
import chromadb
from chromadb.config import Settings

# 1. We are running chroma db in a client server mode
# 2. To run a Sever execute the below code. This runs a server in the docker container. This can be on different machine
#       docker pull ghcr.io/chroma-core/chroma:0.4.22
#       docker run --detach -v C:\Users\shridhar\Documents\Shridhar\Projects\Langchain\chromadb_data:/chroma/.chroma/index -p 8000:8000 ghcr.io/chroma-core/chroma:0.4.22
#

In [2]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain import HuggingFacePipeline
from langchain.chains import ConversationalRetrievalChain
from langchain.memory import ConversationBufferMemory

from langchain.vectorstores import Chroma
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline
from huggingface_hub import notebook_login
import torch

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [3]:
print("Torch version: ", torch.__version__)
print("Chroma DB Version: ", chromadb.__version__)

Torch version:  2.2.1+cpu
Chroma DB Version:  0.4.18


In [4]:
# Writing a function for creating embeddings

# hugging_face_token = "
# notebook_login(hugging_face_token)
model_name = './models/sentence-transformers/all-MiniLM-L12-v2'

def get_embeddings(docs):
    embedding_model = HuggingFaceEmbeddings(model_name=model_name)
    embeddings = embedding_model.embed_documents(docs)
    return embeddings

In [5]:
# 3. Run the client - This can be on another computer

settings = Settings(anonymized_telemetry=False, 
                    allow_reset = False
                    # ,chroma_db_impl="duckdb+parquet"
                     ,is_persistent=True
                    )

# chroma_client = chromadb.HttpClient(host="localhost", port = 8000, settings = settings)
chroma_client = chromadb.PersistentClient(path="C:\\Users\\shridhar\\Documents\\Shridhar\\Projects\\Langchain\\chromadb_data\\")

# anonymized_telemetry It collects usage information and sends to chroma. Setting it false doesnt send any usage information
# allow_reset resets the whole database. Default value is false. But making it explicitly False for information purpose

chroma_client.list_collections()

[Collection(name=case_file_collection)]

In [6]:
chroma_client.delete_collection(name="case_file_collection")

In [7]:
# create a new collection or get collection if it is already created
# case_file_collection = chroma_client.get_or_create_collection(name="case_file_collection")
case_file_collection = chroma_client.create_collection(name="case_file_collection",
                                                                metadata={"hnsw:space": "cosine"})

In [8]:
chroma_client.max_batch_size

5461

In [9]:
from functools import reduce

list_file_name = ['./case_files./case_file.txt', 
                  './case_files./H1B-Process.txt', 
                  "./case_files./5022_2016_Judgement_06-Sep-2017.txt",
                  './case_files/41044_2011_Judgement_06-Sep-2017.txt',
                  './case_files/39830_2016_Judgement_07-Sep-2017.txt'
                  ]
list_case_file_metadata = [{'case_no':"139/2007", 'case_type':"Criminal"}, 
                           {'case_no':"None", 'case_type':"Immigration"},
                           {'case_no':"None", 'case_type':"Immigration"},
                           {'case_no':"None", 'case_type':"Immigration"},
                           {'case_no':"None", 'case_type':"Immigration"},
                           
                           ]
list_case_files = []
list_ids = reduce(lambda a, b: a+[str(b)], list(range(len(list_file_name))), [])




for i in range(len(list_file_name)):
    path = list_file_name[i]
    file = open(path, mode = 'r')
    case_file_data = file.read()
    list_case_files.append(case_file_data)
    file.close()

In [10]:
embeddings = get_embeddings(list_case_files)

  _torch_pytree._register_pytree_node(


In [11]:
no_of_documents = len(embeddings)
max_batch_size = chroma_client.max_batch_size
print("Number of documents = ", no_of_documents)
print("Embedding Dimension = ", len(embeddings[1]))
print("Max batch Size = ", max_batch_size)

Number of documents =  5
Embedding Dimension =  384
Max batch Size =  5461


In [12]:
for i in range(0, no_of_documents, max_batch_size):
    case_file_collection.add(documents = list_case_files[i:i+max_batch_size], 
                             ids = list_ids[i:i+max_batch_size],
                             embeddings = embeddings[i:i+max_batch_size],
                             metadatas = list_case_file_metadata[i:i+max_batch_size],
                             )
# By default uses "all-MiniLM-L6-v2 onnx" this embedding model. Onnx is a common format for using models between different format

In [13]:
question = "Get a case from Bombay High Court?"
answers = case_file_collection.query(query_texts = question, n_results=3, where={"case_no": "139/2007"})
answers

{'ids': [['0']],
 'distances': [[0.5615436099163783]],
 'metadatas': [[{'case_no': '139/2007', 'case_type': 'Criminal'}]],
 'embeddings': None,
 'documents': [[' \n\n\n\n\n             (APPELLANT IS IN JAIL CUSTODY) \n         IN THE HIGH COURT OF JHARKHAND AT RANCHI \n            (CRIMINAL APPELLATE JURISDICTION ) \n              Criminal Appeal No. 139/2007. \n   \n                             IN THE MATTER OF: \n                             An application under \n                             section 374(2) and 389(1) \n                             of the Code of Criminal \n                             Procedure\n                                        AND \n                             IN THE MATTER OF: \n\nRamesh Oraon, Son of- Late Mahto Oraon, \nResident of Village- Murgu, Chaili toli, Police Station-\n                                                Sisai, \nDistrict- Gumla, Jharkhand.               Appellant. \n                        VERSUS \nThe State of Jharkhand.            

In [16]:
def get_context(query):
    settings = Settings(anonymized_telemetry=False, 
                    allow_reset = False
                    # ,chroma_db_impl="duckdb+parquet"
                        ,is_persistent=True
                    )

    # chroma_client = chromadb.HttpClient(host="localhost", port = 8000, settings = settings)
    chroma_client = chromadb.PersistentClient(path="C:\\Users\\shridhar\\Documents\\Shridhar\\Projects\\Langchain\\chromadb_data\\")
    case_file_collection = chroma_client.get_collection("case_file_collection")
    query_embeddings = get_embeddings(query)
    result = case_file_collection.query(query_embeddings = query_embeddings, n_results = 1)
    return result

In [19]:
query = ["Am I elligible for H1B?"]
get_context(query)

{'ids': [['1']],
 'distances': [[0.5985772903915532]],
 'metadatas': [[{'case_no': 'None', 'case_type': 'Immigration'}]],
 'embeddings': None,
 'documents': [['The H1B Visa Explained\nUnderstanding the H1B Visa for working in the United States\nH1B Visa Sample\nWhat Is the H1B visa\nThe H1B visa is a nonimmigrant work visa that allows US employers to hire foreign workers with specialized skills to work in the United States for a specific period of time Typically the roles require a bachelors degree or equivalent Occupations that qualify for the H1B visa are typically in fields such as technology finance engineering architecture or more\n\nImportant\nUS Citizenship and Immigration Services USCIS announced it would hold a second lottery for the H1B visa program for FY 2024 Learn more here\n\nAccess our unified solutions online anytime anywhere Schedule a demo\nGET STARTED\nLearn More\nH1B Visa Eligibility\nIn order to be eligible for the H1B visa you will need\n\nA valid job offer from a

In [None]:
case_file_collection.query()