In [None]:
# Create the folder structure for all four legal acts
!mkdir /content/divorce
!mkdir /content/copyright
!mkdir /content/consumer_protection
!mkdir /content/inheritance

In [None]:
# installing all necessary packages
%pip install llama-index
%pip install llama-index-core
%pip install llama-index-embeddings-huggingface
%pip install llama-index-readers-file
%pip install transformers accelerate bitsandbytes
%pip install --upgrade --quiet llama-index-llms-nvidia llama-index-embeddings-nvidia llama-index-readers-file

Collecting llama-index
  Downloading llama_index-0.12.20-py3-none-any.whl.metadata (12 kB)
Collecting llama-index-agent-openai<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_agent_openai-0.4.6-py3-none-any.whl.metadata (727 bytes)
Collecting llama-index-cli<0.5.0,>=0.4.0 (from llama-index)
  Downloading llama_index_cli-0.4.0-py3-none-any.whl.metadata (1.5 kB)
Collecting llama-index-core<0.13.0,>=0.12.20 (from llama-index)
  Downloading llama_index_core-0.12.20-py3-none-any.whl.metadata (2.6 kB)
Collecting llama-index-embeddings-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_embeddings_openai-0.3.1-py3-none-any.whl.metadata (684 bytes)
Collecting llama-index-indices-managed-llama-cloud>=0.4.0 (from llama-index)
  Downloading llama_index_indices_managed_llama_cloud-0.6.8-py3-none-any.whl.metadata (3.6 kB)
Collecting llama-index-llms-openai<0.4.0,>=0.3.0 (from llama-index)
  Downloading llama_index_llms_openai-0.3.22-py3-none-any.whl.metadata (3.3 kB)
Collec

In [None]:
# Importing necessary packages
import pandas as pd
import numpy as np

from llama_index.core import Settings
from llama_index.core import Document
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline

import logging
import sys
import os
import getpass
from IPython.display import Markdown, display
from llama_index.core import (
    SimpleDirectoryReader,
    StorageContext,
    VectorStoreIndex,
    Response
)
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.llms.nvidia import NVIDIA
from llama_index.core.evaluation import RelevancyEvaluator
from llama_index.core.evaluation import RetrieverEvaluator
import nest_asyncio

In [None]:
# To force a fresh prompt, clear the stored key first:
# del os.environ['NVIDIA_API_KEY']
if not os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
    # No usable key in the environment yet: prompt for one without echoing it,
    # check the expected prefix, then store it in this process's environment.
    nvapi_key = getpass.getpass("NVAPI Key (starts with nvapi-): ")
    assert nvapi_key.startswith("nvapi-"), f"{nvapi_key[:5]}... is not a valid key"
    os.environ["NVIDIA_API_KEY"] = nvapi_key
else:
    print("Valid NVIDIA_API_KEY already in environment. Delete to reset")

NVAPI Key (starts with nvapi-): ··········


In [None]:
# Patch asyncio so event loops can be nested.
# NOTE(review): presumably needed because the notebook kernel already runs an
# event loop and later llama-index calls start their own — confirm.
nest_asyncio.apply()

In [None]:
# Generation LLM. The NVIDIA connector picks the hosted model through its
# `model` argument; passing the name as `llm=` does not select it, so the
# connector would fall back to its default model instead of DeepSeek-R1.
# NOTE(review): reasoning models may emit their reasoning ahead of the final
# answer — confirm that downstream consumers of `llm` cope with that.
llm = NVIDIA(model="deepseek-ai/deepseek-r1")

# Separate LLM instance used for evaluation. No model is named here, so the
# connector's default model is used.
evaluation_llm = NVIDIA()

In [None]:
# Relevancy evaluator backed by the dedicated evaluation LLM (not the
# generation LLM).
evaluator = RelevancyEvaluator(llm=evaluation_llm)

In [None]:
# Load the BAAI/bge-small-en-v1.5 embedding model through the HuggingFace
# integration. No access token is passed here; the recorded output shows the
# model files being downloaded when this cell runs.
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Register the LLM and the embedding model on llama-index's global Settings
# object.
# NOTE(review): presumably components created later without an explicit
# llm/embed_model pick these up as defaults — confirm.
Settings.llm = llm
Settings.embed_model = embed_model

In [None]:
def _read_act(folder):
    """Build a directory reader for `folder` and load its documents.

    Returns the reader together with the loaded documents so that both stay
    available under their own names.
    """
    reader = SimpleDirectoryReader(folder)
    return reader, reader.load_data()


def _chunk_and_report(documents, message):
    """Run the shared ingestion pipeline on `documents` and print the node count.

    `message` is printed in front of the number of nodes produced.
    """
    produced = pipeline.run(documents=documents)
    print(message, len(produced))
    return produced


# Load the documents of all four legal acts, one folder per act.
doc_loader1, documents1 = _read_act("/content/copyright")
doc_loader2, documents2 = _read_act("/content/divorce")
doc_loader3, documents3 = _read_act("/content/consumer_protection")
doc_loader4, documents4 = _read_act("/content/inheritance")

# Sentence-based splitter: chunk size 80 with an overlap of 20.
text_splitter = SentenceSplitter(separator=" ", chunk_size=80, chunk_overlap=20)

# The pipeline splits the documents and then applies the embedding model.
pipeline = IngestionPipeline(transformations=[text_splitter, embed_model])

# Chunk each act's documents into nodes.
nodes1 = _chunk_and_report(documents1, "Number of nodes for copyright:")
nodes2 = _chunk_and_report(documents2, "Number of nodes for divorce:")
nodes3 = _chunk_and_report(documents3, "Number of nodes for consumer protection:")
nodes4 = _chunk_and_report(documents4, "Number of nodes for inheritance:")

# Build one in-memory vector index per act from its nodes.
index_copyright, index_divorce, index_consumer_protection, index_inheritance = (
    VectorStoreIndex(act_nodes) for act_nodes in (nodes1, nodes2, nodes3, nodes4)
)

Number of nodes for copyright: 18
Number of nodes for divorce: 72
Number of nodes for consumer protection: 16
Number of nodes for inheritance: 47


In [None]:
from llama_index.core.evaluation import (
    generate_question_context_pairs,
    EmbeddingQAFinetuneDataset,
)

In [None]:
# Generate two evaluation questions per copyright chunk.
# With the default prompt, the recorded run stored preamble lines such as
# 'Here are two questions based on the context information:' and 'Question 1:'
# as queries. The stricter prompt below asks for the questions only, one per
# line, so the first lines of the completion are real questions.
qa_dataset = generate_question_context_pairs(
    nodes1,
    llm=llm,
    qa_generate_prompt_tmpl=(
        "Context information is below.\n\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n\n"
        "Given the context information and no prior knowledge, write exactly "
        "{num_questions_per_chunk} questions that can be answered from the "
        "context alone.\n"
        "Output only the questions, one per line. Do not add an introduction, "
        "headings, numbering, labels such as 'Question 1:', or blank lines."
    ),
    num_questions_per_chunk=2,
)

100%|██████████| 18/18 [00:14<00:00,  1.28it/s]


In [None]:
# Generate two evaluation questions per divorce chunk.
# The default prompt let preamble lines ('Here are two questions ...',
# 'Question 1:') through as queries in the recorded copyright dataset, so a
# stricter prompt is used here: questions only, one per line.
qa_dataset2 = generate_question_context_pairs(
    nodes2,
    llm=llm,
    qa_generate_prompt_tmpl=(
        "Context information is below.\n\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n\n"
        "Given the context information and no prior knowledge, write exactly "
        "{num_questions_per_chunk} questions that can be answered from the "
        "context alone.\n"
        "Output only the questions, one per line. Do not add an introduction, "
        "headings, numbering, labels such as 'Question 1:', or blank lines."
    ),
    num_questions_per_chunk=2,
)

100%|██████████| 72/72 [00:56<00:00,  1.27it/s]


In [None]:
# Generate two evaluation questions per consumer-protection chunk.
# The default prompt let preamble lines ('Here are two questions ...',
# 'Question 1:') through as queries in the recorded copyright dataset, so a
# stricter prompt is used here: questions only, one per line.
qa_dataset3 = generate_question_context_pairs(
    nodes3,
    llm=llm,
    qa_generate_prompt_tmpl=(
        "Context information is below.\n\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n\n"
        "Given the context information and no prior knowledge, write exactly "
        "{num_questions_per_chunk} questions that can be answered from the "
        "context alone.\n"
        "Output only the questions, one per line. Do not add an introduction, "
        "headings, numbering, labels such as 'Question 1:', or blank lines."
    ),
    num_questions_per_chunk=2,
)

100%|██████████| 16/16 [00:13<00:00,  1.21it/s]


In [None]:
# Generate two evaluation questions per inheritance chunk.
# The default prompt let preamble lines ('Here are two questions ...',
# 'Question 1:') through as queries in the recorded copyright dataset, so a
# stricter prompt is used here: questions only, one per line.
qa_dataset4 = generate_question_context_pairs(
    nodes4,
    llm=llm,
    qa_generate_prompt_tmpl=(
        "Context information is below.\n\n"
        "---------------------\n"
        "{context_str}\n"
        "---------------------\n\n"
        "Given the context information and no prior knowledge, write exactly "
        "{num_questions_per_chunk} questions that can be answered from the "
        "context alone.\n"
        "Output only the questions, one per line. Do not add an introduction, "
        "headings, numbering, labels such as 'Question 1:', or blank lines."
    ),
    num_questions_per_chunk=2,
)

100%|██████████| 47/47 [00:35<00:00,  1.31it/s]


In [None]:
# Write each act's question/context dataset to its own JSON file in /content.
for dataset, filename in (
    (qa_dataset, "copyright_eval_dataset.json"),
    (qa_dataset2, "divorce_eval_dataset.json"),
    (qa_dataset3, "cp_eval_dataset.json"),
    (qa_dataset4, "inheritance_eval_dataset.json"),
):
    dataset.save_json("/content/" + filename)

In [None]:
# Reload the copyright dataset from the JSON file written above; this rebinds
# `qa_dataset` to the copy read back from disk.
qa_dataset = EmbeddingQAFinetuneDataset.from_json("/content/" + "copyright_eval_dataset.json")

In [None]:
# Show the generated queries for a manual sanity check (the recorded output is
# a mapping of query id -> query text).
# NOTE(review): the recorded output contains entries that are not questions,
# e.g. 'Here are two questions based on the context information:' and
# 'Question 1:' — clean or regenerate the dataset before using it for
# evaluation.
qa_dataset.queries

{'e78189d6-996d-4ba4-bc57-01dae0339004': 'Here are two questions based on the context information:',
 'f0212b35-85d2-461e-95c1-6dfd50d6aebd': 'Question 1:',
 '94d96c34-8314-4c84-a797-16ca848def5c': 'Here are two questions based on the context information:',
 '186b829c-00cd-4b30-9bf0-f5c0f1093f47': 'Question 1:',
 '4c428897-c77d-40e8-a26c-a3191090f092': 'Here are two questions based on the context information:',
 'd1b5f490-d96f-4e7a-adcd-0d26945ade38': 'Question 1:',
 '9bc7eaf3-b5f9-40f8-91d0-649518e80a27': 'Here are two questions based on the context information:',
 '7ed261b1-985d-47df-bcd2-e8783ed70e3a': '**Question 1:** What is the primary purpose of copyright, according to the given context?',
 'edc6dbea-7058-4661-b228-603d0556ae80': 'Here are two questions based on the context information:',
 '2cb24f8b-cefc-43f5-8bfc-989ac15a4c00': '**Question 1**',
 '13c00bf8-6e99-40be-a73b-34803faaf7d7': 'Here are two questions based on the context information:',
 '35bd1fd1-ae42-4bf2-b8d1-f153a22