In [1]:
# Install dependencies
!pip install transformers datasets torch pdfplumber

import torch
import pdfplumber
import re
from transformers import T5Tokenizer, T5ForConditionalGeneration
from datasets import load_dataset
from google.colab import files
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

# Load the dataset
dataset = load_dataset("SHASWATSINGH3101/THE_BHARATIYA_NYAYA_SANHITA2023_summarize", split="train")

# Load the tokenizer and model
model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Convert dataset to PyTorch format
class LegalDataset(Dataset):
    def __init__(self, dataset):
        self.texts = dataset["text"]
        self.summaries = dataset["summary"]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        inputs = tokenizer("summarize: " + self.texts[idx], max_length=512, truncation=True, padding="max_length", return_tensors="pt")
        labels = tokenizer(self.summaries[idx], max_length=150, truncation=True, padding="max_length", return_tensors="pt")

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": labels["input_ids"].squeeze(0)
        }

# Convert dataset
tokenized_dataset = LegalDataset(dataset)

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_dataloader = DataLoader(tokenized_dataset, shuffle=True, batch_size=4)

optimizer = AdamW(model.parameters(), lr=5e-5)

# Fine-tuning loop (optional)
epochs = 2
model.train()
for epoch in range(epochs):
    total_loss = 0
    for batch in train_dataloader:
        batch = {key: val.to(device) for key, val in batch.items()}
        optimizer.zero_grad()
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} completed with loss: {total_loss / len(train_dataloader)}")

# Save trained model
model.save_pretrained("legal_summarizer")
tokenizer.save_pretrained("legal_summarizer")

# Function to summarize input text
def summarize_text(text):
    inputs = tokenizer("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    summary_ids = model.generate(
        **inputs,
        max_length=150,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Function to read text from PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = "".join([page.extract_text() for page in pdf.pages if page.extract_text()])
    return text

# Function to check errors in legal documents
def check_legal_errors(text):
    errors = []
    if len(text) < 50:
        errors.append("Document is too short to be a valid legal text.")
    if not re.search(r"Section\\s\\d+", text, re.IGNORECASE):
        errors.append("No legal sections found in the document.")
    if not re.search(r"Act\\s\\d+", text, re.IGNORECASE):
        errors.append("No legal acts referenced in the document.")
    return errors

# Upload file
uploaded = files.upload()
file_name = list(uploaded.keys())[0]

# Process the document
if file_name.endswith(".pdf"):
    document_text = extract_text_from_pdf(file_name)
else:
    with open(file_name, "r", encoding="utf-8") as f:
        document_text = f.read()

# Generate summary
summary = summarize_text(document_text)
print("\n=== Legal Document Summary ===\n")
print(summary)

# Check for legal errors
errors = check_legal_errors(document_text)
if errors:
    print("\n=== Legal Document Errors ===\n")
    for error in errors:
        print("-", error)
else:
    print("\nNo major legal errors detected in the document.")



Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/33.0 [00:00<?, ?B/s]

text_and_summary.json:   0%|          | 0.00/916k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1432 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1 completed with loss: 2.880219395600218
Epoch 2 completed with loss: 2.253319601439897


Saving vidit-mehra-updated.pdf to vidit-mehra-updated.pdf

=== Legal Document Summary ===

114 CASE COMMENT: ADM JABALPUR v. SHIVKANT SHUKLA Written by Vidit Mehra 2nd Year of 3 Year LL. Student, Symbiosis Law School, Pune Citation: (1976) 2 SCC 521; AIR 1976 SC 1207 Bench: Ray, A.N. (Cj), Khanna, Hans Raj, Beg, Y. Hameedullah, Chandrachud, P.N., Bhagwat

=== Legal Document Errors ===

- No legal sections found in the document.
- No legal acts referenced in the document.
