In [1]:
from langchain_community.document_loaders import JSONLoader
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain_community.vectorstores import Chroma
import json
import os
from langchain_core.documents import Document
from sentence_transformers import SentenceTransformer

In [3]:
# Load the sentence-transformer model that will be used to embed the legal texts
from sentence_transformers import SentenceTransformer

# Name of the pretrained model, kept in one place so it is easy to swap
EMBED_MODEL_NAME = 'all-MiniLM-L6-v2'
embed_model = SentenceTransformer(EMBED_MODEL_NAME)

In [None]:
# Merge every JSON file in `input_folder` into one JSON array written to `output_file`.
# Each record is tagged with a "source" field derived from the file it came from
# (file name without the .json extension, underscores replaced by spaces).

input_folder = 'legal_docs'
output_file = 'combined_legal_docs.json'

combined_data = []

# sorted(): os.listdir returns names in arbitrary order; a fixed order keeps the
# combined file (and anything indexed by position downstream) reproducible.
for filename in sorted(os.listdir(input_folder)):
    if not filename.endswith('.json'):
        continue
    filepath = os.path.join(input_folder, filename)

    with open(filepath, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error in {filename}: {e}")
            continue

    # The merge expects each file to hold a JSON array of objects; report and
    # skip anything else instead of failing on the item assignment below.
    if not isinstance(data, list):
        print(f"Error in {filename}: expected a JSON array, got {type(data).__name__}")
        continue

    # Strip only the trailing extension (str.replace would also remove ".json"
    # occurring in the middle of a name).
    source = os.path.splitext(filename)[0].replace("_", " ")

    for obj in data:
        if not isinstance(obj, dict):
            print(f"Skipping non-object entry in {filename}: {obj!r}")
            continue
        obj['source'] = source
        combined_data.append(obj)

# Write combined JSON
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(combined_data, f, indent=2, ensure_ascii=False)

print(f"Combined {len(combined_data)} entries into {output_file}")

In [2]:
# Helper functions for the JSON loader: metadata_func copies selected record fields into the document metadata, get_content returns the record's text

def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy selected fields of `record` into `metadata` and return `metadata`.

    For each target metadata key, the first truthy value found under the listed
    record keys is used; if none is truthy, the fallback string is stored.
    `metadata` is updated in place and the same dict object is returned.
    """
    # (metadata key, record keys to try in priority order, fallback value)
    field_table = (
        ("Article", ("article",), "none"),
        ("Title", ("title",), "none"),
        ("Section_Title", ("section_title",), "none"),
        ("Chapter_Title", ("chapter_title",), "none"),
        ("Chapter", ("chapter",), "none"),
        ("Section", ("Section", "section"), "none"),
        ("source", ("source",), "unknown"),
    )

    for target_key, candidate_keys, fallback in field_table:
        chosen = fallback
        for candidate in candidate_keys:
            value = record.get(candidate)
            if value:
                chosen = value
                break
        metadata[target_key] = chosen

    return metadata
    
def get_content(record):
    """Return the record's text.

    The value under "description" is returned when it is truthy; otherwise the
    value under "section_desc" when that is truthy; otherwise an empty string.
    """
    for text_key in ("description", "section_desc"):
        text = record.get(text_key)
        if text:
            return text
    return ""
    

In [None]:
# Load the combined legal-docs file with JSONLoader: the jq expression ".[]?"
# selects each element of the top-level JSON value, and metadata_func fills in
# the metadata for every selected record.

loader_options = {
    "file_path": "./combined_legal_docs.json",
    "jq_schema": ".[]?",
    "content_key": None,     # no single content field is selected here
    "text_content": False,   # selected content is not required to be a plain string
    "metadata_func": metadata_func,
}
loader = JSONLoader(**loader_options)

documents_raw = loader.load()

In [None]:
# Build the final list of Documents: page_content is the law's text and
# metadata is carried over unchanged from the raw loaded document.

document = []
for doc in documents_raw:
    # page_content of a raw document is JSON text; decode it and pull out the text field
    content = get_content(json.loads(doc.page_content))

    # keep only records whose text is more than whitespace
    if content.strip():
        document.append(Document(page_content=content, metadata=doc.metadata))

In [None]:
from langchain_chroma import Chroma
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions

In [None]:
# Create a local ChromaDB instance (persisted on disk under ./chroma_legal_db)
chroma_client = chromadb.PersistentClient(path="./chroma_legal_db")

collection = chroma_client.get_or_create_collection(
    name="legal_documents"
)

# Embed every document's text with the sentence-transformer model loaded earlier.
# Previously `embeddings` was referenced without ever being defined in the
# notebook, so this cell failed with NameError on a fresh kernel.
texts = [doc.page_content for doc in document]
# encode() returns a numpy array by default; .tolist() yields plain lists of floats
embeddings = embed_model.encode(texts).tolist()

# Number of records sent to Chroma per call: far fewer round-trips than one
# record per call, while staying small enough to avoid oversized requests.
CHROMA_BATCH_SIZE = 256

# Store documents. Ids are the positional index of each document (as a string).
# upsert() is used instead of add() so that re-running the cell refreshes rows
# whose id already exists rather than leaving stale content in the collection.
for start in range(0, len(document), CHROMA_BATCH_SIZE):
    stop = min(start + CHROMA_BATCH_SIZE, len(document))
    collection.upsert(
        ids=[str(i) for i in range(start, stop)],
        embeddings=embeddings[start:stop],
        metadatas=[doc.metadata for doc in document[start:stop]],
        documents=texts[start:stop],
    )