In [None]:
%pip install -qU langchain-groq
%pip install langchain pinecone-client python-dotenv streamlit
%pip install -U langchain-community
%pip install sentence-transformers
%pip install pinecone-client
%pip install pinecone-client[grpc]
%pip install --upgrade langchain-pinecone
%pip install pymupdf pdfplumber
%pip install xmltodict

In [None]:
%pip install nltk
%pip install scikit-learn
%pip install beautifulsoup4
%pip install lxml

Project Description: Legal Document Analysis and Summarization Tool using RAG

In [7]:
import getpass
import os

os.environ["GROQ_API_KEY"] = getpass.getpass()

# Groq is used as the LLM
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192")

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import string
from bs4 import BeautifulSoup
import contractions

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

stemmer = PorterStemmer()


def separate_numbers_and_text(text):
    separated_text = re.sub(r"(\d+)([a-zA-Z]+)|([a-zA-Z]+)(\d+)", r"\1 \2 \3 \4", text)
    return separated_text


def preprocess_text(text):
    text = BeautifulSoup(text, "html.parser").text
    text = text.lower()

    text = text.translate(str.maketrans("", "", string.punctuation))
    text = " ".join(word for word in text.split() if word not in stop_words)
    text = separate_numbers_and_text(text)
    text = " ".join(text.split())
    text = re.sub(r"\s+", " ", text)  # removing extra white space
    text = contractions.fix(
        text
    )  # fixing contractions (ex: can't gets converted to cannot)
    text = re.sub(r"http\S+|www\.\S+|https\S+", "<URL>", text)  # removing URLs
    text = re.sub(r"\S+@\S+", "<EMAIL>", text)  # removing email IDs
    text = re.sub(
        r"\b(not)\s+(\w+)\b", r"\1_\2", text
    )  # handling negations (ex: "not good" becomes "not_good")

    text = " ".join(stemmer.stem(word) for word in text.split())  # stemming

    return text


def read_and_preprocess_data(folder_path):
    judgement_path = os.path.join(folder_path, "judgement")
    summary_path = os.path.join(folder_path, "summary")

    judgement_files = sorted(os.listdir(judgement_path))
    summary_files = sorted(os.listdir(summary_path))

    judgements = []
    summaries = []

    for judgement_file, summary_file in zip(judgement_files, summary_files):
        with open(
            os.path.join(judgement_path, judgement_file), "r", encoding="utf-8"
        ) as jf:
            judgement_text = jf.read()

            judgement_text = preprocess_text(judgement_text)
            judgements.append(judgement_text)

        with open(
            os.path.join(summary_path, summary_file), "r", encoding="utf-8"
        ) as sf:
            summary_text = sf.read()

            summary_text = preprocess_text(summary_text)
            summaries.append(summary_text)

    data = pd.DataFrame({"Judgement": judgements, "Summary": summaries})
    return data


# Paths to train and test folders
train_folder_path = r"D:\Rohan\ML\Datasets\legal_dataset\IN-Abs\train-data-small"
test_folder_path = r"D:\Rohan\ML\Datasets\legal_dataset\IN-Abs\test-data"

# Reading and preprocessing train and test data
train_data = read_and_preprocess_data(train_folder_path)
test_data = read_and_preprocess_data(test_folder_path)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train_data["Judgement"][0]

'appeal lxvi 1949 appeal high court judicatur bombay refer section 66 indian incom tax act 1022 km munshi n p nathvani appel lant mc setalvad attorney gener india h j umrigar respond 1950 may 26 judgment court deliv mehr chand mahajan j appeal judgment high court judicatur bombay incom tax matter rais question whether munici pal properti tax urban immov properti tax payabl relev bombay act allow deduct section 9 1 iv indian incom tax act assesse compani invest compani deriv incom properti citi bombay assess year 1940 41 net incom assesse head properti comput incom tax offic sum rs 621764 deduct gross rent certain payment compani paid relev year rs 122675 municip properti tax rs 32760 urban properti tax deduct two sum claim provis section 9 act first item deduct sum rs 48572 allow ground item repres tenant burden paid assesse otherwis claim disal low appeal assesse appel sistant commission incom tax appel tribu nal unsuccess tribun howev agre refer two question law high court judicatur 

In [4]:
train_data.head()

Unnamed: 0,Judgement,Summary
0,appeal lxvi 1949 appeal high court judicatur b...,charg creat respect municip properti tax secti...
1,civil appeal no 94 1949 107 834 appeal judgmen...,agreement leas leas indian declar includ must ...
2,imin appeal 40 1951 127 appeal judgment order ...,question whether magistr person interest eas w...
3,appeal 388 1960 appeal special leav judgment o...,appel member joint hindu famili carri busi gov...
4,appeal 198 1954 appeal judgment order date oct...,appel ruler state baster later integr state ma...


In [5]:
test_data.head()

Unnamed: 0,Judgement,Summary
0,appeal 101 1959 appeal special leav judgment o...,appel displac person west pakistan grant quasi...
1,appeal 52 1957 appeal judgment decre date apri...,appel respond owner adjoin collieri suit prese...
2,appeal no 45 46 1959 appeal special leav judgm...,respond firm claim exempt sale tax articl 286 ...
3,ion crimin appeal 89 1961 appeal special leav ...,appel tri murder fact establish quarrel appel ...
4,civil appeal 50 1961 appeal special leav award...,employ appel cross cutter saw mill ask show be...


In [21]:
# vectorization using TfIdf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import torch


tfidf_vectorizer = TfidfVectorizer()

train_tfidf = tfidf_vectorizer.fit_transform(train_data["Judgement"])
test_tfidf = tfidf_vectorizer.transform(test_data["Judgement"])

train_tfidf_dense = torch.tensor(train_tfidf.toarray(), dtype=torch.float32)
test_tfidf_dense = torch.tensor(test_tfidf.toarray(), dtype=torch.float32)

In [33]:
# downloading the RAG model and initialising
from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [34]:
train_labels = [
    tokenizer(
        summary, return_tensors="pt", padding=True, truncation=True
    ).input_ids.squeeze(0)
    for summary in train_data["Summary"]
]
test_labels = [
    tokenizer(
        summary, return_tensors="pt", padding=True, truncation=True
    ).input_ids.squeeze(0)
    for summary in test_data["Summary"]
]

In [36]:
from torch.utils.data import Dataset, DataLoader


class TfidfDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        return self.data[idx], self.labels[idx]


from torch.nn.utils.rnn import pad_sequence


def collate_fn(batch):
    inputs, labels = zip(*batch)

    # Pad inputs and labels to the same length
    inputs_padded = pad_sequence(inputs, batch_first=True)
    labels_padded = pad_sequence(labels, batch_first=True)

    return inputs_padded, labels_padded


# Prepare Datasets
train_dataset = TfidfDataset(train_tfidf_dense, train_labels)
test_dataset = TfidfDataset(test_tfidf_dense, test_labels)

# Prepare DataLoaders
train_loader = DataLoader(
    train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn
)
test_loader = DataLoader(
    test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn
)

In [38]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

import torch.optim as optim

# Initialize optimizer
optimizer = optim.AdamW(model.parameters(), lr=5e-5)

# Training loop
for epoch in range(3):  # Adjust number of epochs as needed
    model.train()
    for batch in train_loader:
        inputs, labels = batch
        inputs = inputs.to(device).long()
        labels = labels.to(device).long()

        outputs = model(input_ids=inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch + 1} completed")

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
