In [None]:
import pandas as pd

# Read the laws/constitution instruction table from the Colab workspace.
df = pd.read_csv("/content/Laws and Constitution of India.csv")

# Plain-text preview of the first five rows to confirm the column layout.
print(df.head(5))


     instruction          input  \
0  IPC Chapter 1  IPC Section 1   
1  IPC Chapter 1  IPC Section 2   
2  IPC Chapter 1  IPC Section 3   
3  IPC Chapter 1  IPC Section 4   
4  IPC Chapter 1  IPC Section 5   

                                              output  
0  Deals with the title and extent of the Indian ...  
1  States that every person committing an offense...  
2  Extends IPC to crimes committed by Indians out...  
3  Covers extraterritorial jurisdiction, applying...  
4  States that certain laws (military, navy, spec...  


In [None]:
import torch
from transformers import T5ForConditionalGeneration, T5Tokenizer, TrainingArguments, Trainer, DataCollatorForSeq2Seq
from datasets import load_dataset
import pandas as pd
import datasets
from datasets import DatasetDict
# **Step 1: Load Dataset**
# Read the CSV with pandas, then wrap the frame as a Hugging Face Dataset so
# it can be mapped and split in the later steps.
df = pd.read_csv("/content/Laws and Constitution of India.csv")
dataset = datasets.Dataset.from_pandas(df)

# Sanity check: show the first record as a dict.
first_example = dataset[0]
print(first_example)

# **Step 2: Load T5-base Model & Tokenizer**
# Both objects are loaded from the same checkpoint name so the vocabulary
# matches the model's embeddings.
model_name = "t5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = T5Tokenizer.from_pretrained(model_name)

# **Step 3: Tokenize Data for T5**
def tokenize_function(example, max_input_length=512, max_target_length=128):
    """Convert one dataset row into T5 encoder inputs and decoder labels.

    The prompt fed to the encoder is
    ``"Instruction: <instruction> Input: <input>"``; any text sent to the
    model at inference time must use the same template.

    Args:
        example: Mapping with the string fields ``instruction``, ``input``
            and ``output``.
        max_input_length: Prompt token sequences longer than this are truncated.
        max_target_length: Target token sequences longer than this are truncated.

    Returns:
        dict with ``input_ids`` and ``attention_mask`` for the prompt and
        ``labels`` holding the token ids of ``output``.
    """
    input_text = f"Instruction: {example['instruction']} Input: {example['input']}"

    # Tokenize the prompt once and reuse the result for both fields. No padding
    # is requested here: padding a single example to "longest" is a no-op, and
    # per-batch padding is left to the data collator.
    model_inputs = tokenizer(input_text, truncation=True, max_length=max_input_length)
    target = tokenizer(example["output"], truncation=True, max_length=max_target_length)

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": target["input_ids"],
    }

# Apply tokenization row by row. The original text columns are kept alongside
# the new input_ids / attention_mask / labels columns.
tokenized_datasets = dataset.map(tokenize_function)
print(tokenized_datasets)

# Split into train (90%) and test (10%). train_test_split already returns a
# DatasetDict with "train" and "test" keys, so no re-wrapping is needed. The
# fixed seed keeps the split, and therefore the reported validation loss,
# repeatable across kernel restarts.
split_dataset = tokenized_datasets.train_test_split(test_size=0.1, seed=42)

# **Step 4: Training Arguments**
# NOTE(review): with wandb installed, the Trainer prompts for an API key when
# training starts (see this cell's output). Pass report_to="none" to run
# unattended; confirm whether W&B logging is actually wanted.
# NOTE(review): T5 checkpoints are reported to be numerically fragile under
# fp16; if the loss becomes NaN, prefer bf16 on supported GPUs or disable
# mixed precision. TODO confirm on the target hardware.
training_args = TrainingArguments(
    output_dir="./fine_tuned_t5",
    eval_strategy="epoch",  # Current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    per_device_train_batch_size=2,  # Small batches to keep memory usage low
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    save_total_limit=2,  # Cap the number of checkpoints kept in output_dir
    logging_dir="./logs",
    push_to_hub=False,
    fp16=torch.cuda.is_available(),  # Mixed precision only when a CUDA GPU is present
    bf16=False,  # Set to True for newer GPUs (A100, H100)
)

# **Step 5: Define Trainer**
# Collator that assembles the variable-length tokenized examples into batches;
# it is built from the same tokenizer and model that are being trained.
seq2seq_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Train on the 90% split and evaluate on the held-out 10% split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=split_dataset["train"],
    eval_dataset=split_dataset["test"],
    tokenizer=tokenizer,
    data_collator=seq2seq_collator,
)

# **Step 6: Train the Model**
trainer.train()

# **Step 7: Save Model**
# Write the model weights first and the tokenizer files second, both into the
# same directory, so the pair can be reloaded from a single path.
for artifact in (model, tokenizer):
    artifact.save_pretrained("fine_tuned_t5")

print("Fine-tuning complete! Model saved in 'fine_tuned_t5'.")


{'instruction': 'IPC Chapter 1', 'input': 'IPC Section 1', 'output': 'Deals with the title and extent of the Indian Penal Code, applicable to the entire country.'}


Map:   0%|          | 0/1796 [00:00<?, ? examples/s]

Dataset({
    features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1796
})


  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msatyamtripathi038[0m ([33msatyamtripathi038-jodhpur-instititute-of-engineering-and[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss
1,2.8397,2.281305
2,2.2765,2.155046
3,2.2036,2.123345


Fine-tuning complete! Model saved in 'fine_tuned_t5'.


In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = "fine_tuned_t5"  # Path to your saved model

# Load the fine-tuned model and its tokenizer from the directory written by the training cell
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# Test the fine-tuned model with a sample input.
# The prompt must follow the exact template used during fine-tuning
# ("Instruction: <instruction> Input: <input>"); a differently shaped prompt
# is out of distribution for the fine-tuned model.
instruction = "IPC Chapter 1"
section = "IPC Section 1"
input_text = f"Instruction: {instruction} Input: {section}"

encoded = tokenizer(input_text, return_tensors="pt")
input_ids = encoded.input_ids

# Pass the attention mask explicitly so generation does not have to infer it.
output_ids = model.generate(
    input_ids,
    attention_mask=encoded.attention_mask,
    max_length=100,
)
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print("Generated Response:", output_text)


Generated Response: IPC Chapter 1: Inputs to the court for a violation of the Constitution.


In [None]:
# Write the output of `pip freeze` (the runtime's installed packages) to requirements.txt.
pip freeze > requirements.txt