In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import pandas as pd

# Load the updated IPC dataset
df = pd.read_csv('/kaggle/input/dataset/Balanced_IPC_Sections_409_Cleaned.csv')

# Check the first few rows to understand the structure
print(df.head())


                                    Case_Description IPC_section
0  Several individuals conspired to kidnap a busi...     IPC 120
1  A group of officials conspired to manipulate t...     IPC 120
2  Smugglers conspired to illegally transport end...     IPC 120
3  Two employees of a company conspired to leak c...     IPC 120
4  A terrorist group not only conspired but also ...     IPC 120


In [2]:
!pip install datasets



In [3]:
from transformers import DistilBertTokenizer
from datasets import Dataset

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the text
def tokenize_function(examples):
    return tokenizer(examples['Case_Description'], padding=True, truncation=True)

# Convert the dataframe into a HuggingFace Dataset
dataset = Dataset.from_pandas(df[['Case_Description', 'IPC_section']])

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Show tokenized data
print(tokenized_datasets)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Map:   0%|          | 0/22495 [00:00<?, ? examples/s]

Dataset({
    features: ['Case_Description', 'IPC_section', 'input_ids', 'attention_mask'],
    num_rows: 22495
})


In [4]:
from transformers import DistilBertForSequenceClassification

# Define the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df['IPC_section'].unique()))

# Check the model architecture
print(model)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [10]:
import os
import pandas as pd
from transformers import Trainer, TrainingArguments, DistilBertForSequenceClassification, DistilBertTokenizer, DataCollatorWithPadding
from datasets import Dataset

# Disable WandB logging
os.environ["WANDB_DISABLED"] = "true"

# Load the dataset
df = pd.read_csv('/kaggle/input/dataset/Balanced_IPC_Sections_409_Cleaned.csv')

# Tokenizer for DistilBERT
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['Case_Description'], padding='max_length', truncation=True) # Pad to max_length

# Convert to HuggingFace Dataset
dataset = Dataset.from_pandas(df[['Case_Description', 'IPC_section']])
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Map IPC sections to numeric labels
df['label'] = df['IPC_section'].astype('category').cat.codes

# Ensure that labels are present
tokenized_datasets = tokenized_datasets.add_column('labels', df['label'].values)

# Initialize the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(df['IPC_section'].unique()))

# Define training arguments with increased epochs and checkpoints
training_args = TrainingArguments(
    output_dir='./results',
    per_device_train_batch_size=8,
    num_train_epochs=10,  # Increased epochs to 20
    logging_dir='./logs',
    logging_steps=200,
    evaluation_strategy="epoch",  # Logging after each epoch
    save_strategy="steps",  # Save model every specified steps
    save_steps=500,  # Save model every 500 steps (you can adjust this as per your requirement)
    save_total_limit=3,  # Limit the number of saved checkpoints to 3
    report_to="none",  # Disable WandB reporting
)

# Use DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    eval_dataset=tokenized_datasets,
    data_collator=data_collator # Add data_collator
)

# Start training
trainer.train()

Map:   0%|          | 0/22495 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,2.0289,1.593679
2,0.7575,0.504533
3,0.3345,0.196863
4,0.1571,0.06372
5,0.056,0.019653
6,0.0247,0.008767
7,0.0124,0.008568
8,0.0056,0.00556
9,0.0105,0.005012
10,0.0044,0.00598




TrainOutput(global_step=14060, training_loss=0.5667580621849592, metrics={'train_runtime': 8768.9148, 'train_samples_per_second': 25.653, 'train_steps_per_second': 1.603, 'total_flos': 3.00148271030784e+16, 'train_loss': 0.5667580621849592, 'epoch': 10.0})

In [11]:
# Evaluate the model
eval_results = trainer.evaluate()
print(eval_results)


{'eval_loss': 0.005979881156235933, 'eval_runtime': 219.205, 'eval_samples_per_second': 102.621, 'eval_steps_per_second': 6.414, 'epoch': 10.0}


In [16]:
import torch

In [18]:
# Example complaint text
complaint_text = ["The victim was assaulted by a known individual with intent to har."]

# Tokenize the input text and move to the GPU
inputs = tokenizer(complaint_text, return_tensors="pt", truncation=True, padding=True).to(model.device) # Move inputs to the same device as the model

# Make a prediction
outputs = model(**inputs)
predicted_ipc = torch.argmax(outputs.logits).item()

# Map the numeric prediction back to the IPC section
predicted_ipc_section = df['IPC_section'].astype('category').cat.categories[predicted_ipc]
print(f"Predicted IPC Section: {predicted_ipc_section}")

Predicted IPC Section: IPC 376
