In [None]:
# -------------------------------
# 0. Mount Google Drive (COLAB)
# -------------------------------
# Colab-only step: PROJECT_ROOT (configured in the next cell) lives under
# /content/drive, so this mount has to succeed before any file is read or written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder (on the mounted Drive) holding every input and output artefact.
# The value keeps its trailing "/" so existing users of PROJECT_ROOT are
# unaffected; file names are therefore appended directly, without adding a
# second "/" (the previous f-strings produced ".../model//<file>" paths).
PROJECT_ROOT = "/content/drive/MyDrive/firmanagementsystem/model/"

# Inputs: BNS section catalogue and the two labelled complaint splits.
BNS_PATH = f"{PROJECT_ROOT}bns_final.json"
TRAIN_PATH = f"{PROJECT_ROOT}train_10000.json"
EVAL_PATH = f"{PROJECT_ROOT}eval_10000.json"

# Outputs: trained weights, tokenizer directory, metadata and inference script.
MODEL_PATH = f"{PROJECT_ROOT}bert_bns_model.pth"
TOKENIZER_PATH = f"{PROJECT_ROOT}bert_bns_tokenizer"
METADATA_PATH = f"{PROJECT_ROOT}bert_bns_metadata.json"
INFERENCE_PATH = f"{PROJECT_ROOT}bert_inference.py"

In [None]:
# -------------------------------
# 1. Imports
# -------------------------------
import json
import warnings
warnings.filterwarnings("ignore")

import torch
import numpy as np
import pandas as pd

from torch import nn
from torch.utils.data import Dataset, DataLoader

from transformers import (
    BertTokenizer,
    BertModel,
    get_linear_schedule_with_warmup
)

from torch.optim import AdamW


from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import (
    hamming_loss,
    precision_score,
    recall_score,
    f1_score
)

import matplotlib.pyplot as plt
from tqdm import tqdm


In [None]:
import os
# Sets the WANDB_DISABLED environment variable for this kernel process.
# NOTE(review): presumably meant to stop Weights & Biases integrations from
# prompting for a login; the training loop in this notebook is hand-written
# (no Trainer), so confirm this flag is still needed.
os.environ["WANDB_DISABLED"] = "true"

In [None]:
# -------------------------------
# 2. Device
# -------------------------------
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")


Using device: cuda


In [None]:
# Load the BNS section catalogue. No earlier cell defines `bns_data`, so
# without this load the inspection below only works when the variable happens
# to be left over in the kernel from a previous session.
with open(BNS_PATH, "r", encoding="utf-8") as f:
    bns_data = json.load(f)

print(type(bns_data))        # should be dict
print(list(bns_data.keys())[:5])


<class 'dict'>
['3', '4', '5', '6', '7']


In [None]:
# Slim copy of the catalogue: for every section keep only the four fields that
# are later written into the metadata file, substituting an empty value when a
# field is absent.
bns_sections = {
    sec_no: {
        "title": section.get("title", ""),
        "description": section.get("description", ""),
        "punishment": section.get("punishment", ""),
        "procedural": section.get("procedural", {}),
    }
    for sec_no, section in bns_data.items()
}



In [None]:
# Shape check of the catalogue: container type, then the type of the first
# key and of the first value.
_bns_keys = list(bns_data.keys())
_bns_values = list(bns_data.values())
print(type(bns_data))        # must be dict
print(type(_bns_keys[0]))    # must be str
print(type(_bns_values[0]))  # must be dict


<class 'dict'>
<class 'str'>
<class 'dict'>


In [None]:
# Each split file is a JSON object whose "data" entry holds the list of records.
with open(TRAIN_PATH, "r", encoding="utf-8") as train_file:
    train_json = json.load(train_file)
train_data = train_json["data"]

with open(EVAL_PATH, "r", encoding="utf-8") as eval_file:
    eval_json = json.load(eval_file)
eval_data = eval_json["data"]

# Quick look at what was loaded.
print(f"Training samples: {len(train_data)}")
print(f"Evaluation samples: {len(eval_data)}")
print(type(train_data))     # must be list
print(train_data[0].keys()) # must contain complaint & section_numbers




Training samples: 10000
Evaluation samples: 10000
<class 'list'>
dict_keys(['id', 'complaint', 'section_numbers', 'incident_type', 'keywords'])


In [None]:
# -------------------------------
# 5. Prepare Texts & Labels
# -------------------------------
def extract_texts_and_labels(data):
    """Collect (text, section labels) pairs from raw dataset records.

    Args:
        data: iterable of dict records.

    Returns:
        A ``(texts, labels)`` tuple of parallel lists. ``texts[i]`` is the
        first non-empty value among the record's ``description``,
        ``complaint`` and ``text`` fields; ``labels[i]`` is the list of
        section identifiers for that record, each converted to ``str``.
        Records with no text or no sections are skipped.

    Section identifiers are looked up, in order, under ``bns_sections``
    (list), ``sections`` (list), ``section_number`` (single value) and
    ``section_numbers`` (list). The last key is the one the train/eval files
    loaded by this notebook actually use; without it every record of those
    files was silently dropped and two empty lists were returned.
    """
    texts, labels = [], []

    for item in data:
        text = (
            item.get("description")
            or item.get("complaint")
            or item.get("text")
            or ""
        )

        if isinstance(item.get("bns_sections"), list):
            sections = item["bns_sections"]
        elif isinstance(item.get("sections"), list):
            sections = item["sections"]
        elif item.get("section_number"):
            sections = [item["section_number"]]
        elif isinstance(item.get("section_numbers"), list):
            # Checked last so records that matched an earlier key keep
            # resolving exactly as before.
            sections = item["section_numbers"]
        else:
            sections = []

        if text and sections:
            texts.append(text)
            labels.append([str(s) for s in sections])

    return texts, labels


# Run the extractor over both splits.
# NOTE: all four names assigned here are reassigned by the next cell
# ("6. MultiLabel Binarizer"), which rebuilds them directly from the
# `complaint` / `section_numbers` fields of each record.
train_texts, train_labels = extract_texts_and_labels(train_data)
eval_texts, eval_labels = extract_texts_and_labels(eval_data)

print("Text & labels prepared")


Text & labels prepared


In [None]:
# -------------------------------
# 6. MultiLabel Binarizer
# -------------------------------
def _collect_complaints(records):
    """Return parallel lists of complaint texts and their section-number lists.

    A record is kept only when its ``complaint`` is non-blank after stripping
    and its ``section_numbers`` value is a non-empty list. A missing or null
    ``complaint`` is treated as blank instead of raising.
    """
    texts, label_lists = [], []
    for record in records:
        complaint = (record.get("complaint") or "").strip()
        section_numbers = record.get("section_numbers", [])
        if complaint and isinstance(section_numbers, list) and len(section_numbers) > 0:
            texts.append(complaint)
            label_lists.append(section_numbers)
    return texts, label_lists


# One helper call per split replaces the two copy-pasted loops.
train_texts, train_labels = _collect_complaints(train_data)

print("Training samples:", len(train_texts))
print("Train labels sample:", train_labels[:3])
print("Type of first label:", type(train_labels[0]))

eval_texts, eval_labels = _collect_complaints(eval_data)

print("Evaluation samples:", len(eval_texts))

# MultiLabelBinarizer is already imported in the imports cell.
# The label space is fitted on train + eval so a section that occurs only in
# the eval split still gets its own output column.
mlb = MultiLabelBinarizer()
mlb.fit(train_labels + eval_labels)

# Multi-hot matrices, one row per sample and one column per known section.
train_labels_encoded = mlb.transform(train_labels)
eval_labels_encoded = mlb.transform(eval_labels)

num_classes = len(mlb.classes_)
print("Total unique BNS sections:", num_classes)
print("Sample classes:", mlb.classes_[:10])



Training samples: 10000
Train labels sample: [['468'], ['322', '289'], ['329']]
Type of first label: <class 'list'>
Evaluation samples: 10000
Total unique BNS sections: 60
Sample classes: ['103' '105' '120' '122' '131' '145' '147' '148' '149' '152']


In [None]:
# -------------------------------
# 7. Dataset Class
# -------------------------------
class BNSSectionDataset(Dataset):
    """Map-style dataset that tokenizes one complaint per lookup.

    Args:
        texts: sequence of complaint strings.
        labels: per-sample label vectors, index-aligned with ``texts``.
        tokenizer: callable invoked with the text and the keyword arguments
            ``max_length``, ``padding``, ``truncation`` and ``return_tensors``;
            its result must expose ``input_ids`` and ``attention_mask``.
        max_length: length every sample is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading batch axis of size 1; drop it
        # so the DataLoader can stack samples itself.
        sample = {
            field: tokens[field].squeeze(0)
            for field in ("input_ids", "attention_mask")
        }
        sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample


In [None]:
# -------------------------------
# 8. Model
# -------------------------------
class BERTMultiLabelClassifier(nn.Module):
    """``bert-base-uncased`` encoder followed by a two-layer classification head.

    ``forward`` returns raw logits (no sigmoid), one per class, computed from
    the encoder's ``pooler_output``.
    """

    def __init__(self, num_classes):
        super().__init__()
        # Attribute names are part of the saved state_dict keys, so they are
        # kept exactly as they were.
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = nn.Dropout(0.3)
        self.fc1 = nn.Linear(768, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, input_ids, attention_mask):
        pooled = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).pooler_output
        # dropout -> fc1 -> ReLU -> dropout -> fc2
        hidden = self.relu(self.fc1(self.dropout(pooled)))
        return self.fc2(self.dropout(hidden))

In [None]:

# -------------------------------
# 9. Tokenizer & Loaders
# -------------------------------
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Wrap each split (texts + multi-hot label matrix) in the dataset class.
train_dataset = BNSSectionDataset(
    texts=train_texts,
    labels=train_labels_encoded,
    tokenizer=tokenizer,
)
eval_dataset = BNSSectionDataset(
    texts=eval_texts,
    labels=eval_labels_encoded,
    tokenizer=tokenizer,
)

# Only the training loader shuffles; evaluation keeps dataset order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(dataset=eval_dataset, batch_size=16)

model = BERTMultiLabelClassifier(num_classes)
model = model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# -------------------------------
# 10. Training Setup
# -------------------------------
epochs = 5

# Multi-label objective: an independent sigmoid + binary cross-entropy per class.
criterion = nn.BCEWithLogitsLoss()
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Linear schedule over every optimisation step of the run, with the first 10%
# of the steps used for warm-up.
total_steps = epochs * len(train_loader)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [None]:
# -------------------------------
# 11. Training Loop
# -------------------------------
# One pass over train_loader per epoch, followed by a full pass over
# eval_loader that reports micro-averaged F1.
for epoch in range(epochs):
    print(f"\nEpoch {epoch + 1}/{epochs}")
    model.train()
    # Running sum of per-batch losses; averaged over the batch count below.
    total_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # The model returns raw logits; `criterion` applies the sigmoid itself.
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)

        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The scheduler is stepped once per batch, matching total_steps
        # = len(train_loader) * epochs used to build it.
        scheduler.step()

        total_loss += loss.item()

    print(f"Train loss: {total_loss / len(train_loader):.4f}")

    # Evaluation
    model.eval()
    # Per-sample prediction / ground-truth rows accumulated across batches.
    preds, trues = [], []

    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            logits = model(input_ids, attention_mask)
            probs = torch.sigmoid(logits)

            # Fixed 0.5 decision cutoff.
            # NOTE(review): the recorded run printed "Eval F1: 0.0000" after
            # every epoch with this cutoff, while the later evaluation cell
            # (cutoff 0.3 plus a top-1 fallback) reports a non-zero F1 —
            # confirm whether any probability ever exceeds 0.5.
            preds.extend((probs > 0.5).cpu().numpy())
            trues.extend(labels.cpu().numpy())

    f1 = f1_score(trues, preds, average="micro", zero_division=0)
    print(f"Eval F1: {f1:.4f}")



Epoch 1/5


Training: 100%|██████████| 625/625 [15:22<00:00,  1.48s/it]


Train loss: 0.2892
Eval F1: 0.0000

Epoch 2/5


Training: 100%|██████████| 625/625 [15:22<00:00,  1.48s/it]


Train loss: 0.1333
Eval F1: 0.0000

Epoch 3/5


Training: 100%|██████████| 625/625 [15:21<00:00,  1.47s/it]


Train loss: 0.1312
Eval F1: 0.0000

Epoch 4/5


Training: 100%|██████████| 625/625 [15:20<00:00,  1.47s/it]


Train loss: 0.1234
Eval F1: 0.0000

Epoch 5/5


Training: 100%|██████████| 625/625 [15:22<00:00,  1.48s/it]


Train loss: 0.1168
Eval F1: 0.0000


In [None]:
import torch
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, hamming_loss

# Decision cutoff applied to the sigmoid probabilities in this evaluation pass.
threshold_eval = 0.3
model.eval()

pred_batches, true_batches = [], []

with torch.no_grad():
    for batch in eval_loader:
        labels = batch["labels"].to(device)
        probs = torch.sigmoid(
            model(
                batch["input_ids"].to(device),
                batch["attention_mask"].to(device),
            )
        )

        # Rows in which no class clears the cutoff, and each row's best class.
        fallback_mask = (probs <= threshold_eval).all(dim=1)
        max_indices = probs.argmax(dim=1)

        # Top-1 fallback: such rows are rewritten as a one-hot on their best
        # class so every sample ends up with at least one predicted section.
        if fallback_mask.any():
            probs[fallback_mask] = 0
            probs[fallback_mask, max_indices[fallback_mask]] = 1.0

        pred_batches.append((probs > threshold_eval).int().cpu())
        true_batches.append(labels.cpu())

# Stack the per-batch tensors into (num_samples, num_classes) arrays.
all_preds = torch.cat(pred_batches).numpy()
all_trues = torch.cat(true_batches).numpy()

# Micro-averaged scores plus exact-match and Hamming accuracy.
micro_options = {"average": "micro", "zero_division": 0}
micro_f1 = f1_score(all_trues, all_preds, **micro_options)
micro_precision = precision_score(all_trues, all_preds, **micro_options)
micro_recall = recall_score(all_trues, all_preds, **micro_options)
exact_accuracy = accuracy_score(all_trues, all_preds)
hamming_acc = 1 - hamming_loss(all_trues, all_preds)

print(f"Eval Micro F1: {micro_f1:.4f}")
print(f"Eval Micro Precision: {micro_precision:.4f}")
print(f"Eval Micro Recall: {micro_recall:.4f}")
print(f"Exact Match Accuracy: {exact_accuracy:.4f}")
print(f"Hamming Accuracy: {hamming_acc:.4f}")


Eval Micro F1: 0.4224
Eval Micro Precision: 0.5838
Eval Micro Recall: 0.3309
Exact Match Accuracy: 0.2762
Hamming Accuracy: 0.9734


In [None]:
# -------------------------------
# 12. Save Model & Metadata
# -------------------------------
# Persist the fine-tuned weights and the tokenizer files.
torch.save(model.state_dict(), MODEL_PATH)
tokenizer.save_pretrained(TOKENIZER_PATH)

metadata = {
    "num_classes": num_classes,
    # Column order of the model's output logits.
    "classes": mlb.classes_.tolist(),
    # Record the cutoff the reported metrics were actually computed with
    # (`threshold_eval` from the evaluation cell). The previously hard-coded
    # 0.5 disagreed with both that evaluation and the 0.3 used at inference,
    # and 0.5 yielded an F1 of 0.0 in the recorded training run.
    "threshold": threshold_eval,
    "bns_sections": bns_sections
}

with open(METADATA_PATH, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print("Model, tokenizer, metadata saved")


Model, tokenizer, metadata saved


In [None]:
import joblib

# Persist the fitted MultiLabelBinarizer so inference can map output columns
# back to section numbers.
MLB_PATH = "/content/drive/MyDrive/firmanagementsystem/model/mlb.pkl"
joblib.dump(mlb, MLB_PATH)

KEYWORD_RULES_PATH = "/content/drive/MyDrive/firmanagementsystem/model/keyword_rules.json"

# NOTE(review): no earlier cell defines KEYWORD_RULES; its definition sits in
# a later cell of this notebook, so on a fresh top-to-bottom run this write
# raises NameError unless that cell has been executed first.
with open(KEYWORD_RULES_PATH, "w", encoding="utf-8") as f:
    json.dump(KEYWORD_RULES, f, indent=2, ensure_ascii=False)





In [2]:
import json

import joblib
import torch
import torch.nn as nn
# BertForSequenceClassification is no longer used in this cell; the import is
# kept only so any other cell that references the name keeps working.
from transformers import BertForSequenceClassification, BertModel, BertTokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Paths (all on the mounted Google Drive; the Drive must be mounted first,
# otherwise the opens below fail with FileNotFoundError).
MODEL_PATH = "/content/drive/MyDrive/firmanagementsystem/model/bert_bns_model.pth"
TOKENIZER_PATH = "/content/drive/MyDrive/firmanagementsystem/model/bert_bns_tokenizer"
METADATA_PATH = "/content/drive/MyDrive/firmanagementsystem/model/bert_bns_metadata.json"
MLB_PATH = "/content/drive/MyDrive/firmanagementsystem/model/mlb.pkl"
BNS_JSON_PATH = "/content/drive/MyDrive/firmanagementsystem/model/bns_final.json"
KEYWORD_RULES_PATH = "/content/drive/MyDrive/firmanagementsystem/model/keyword_rules.json"

# Load metadata. It is written as UTF-8 with ensure_ascii=False, so it is read
# back with an explicit encoding rather than the platform default.
with open(METADATA_PATH, "r", encoding="utf-8") as f:
    metadata = json.load(f)


class CustomBERTClassifier(nn.Module):
    """Inference-time twin of the training classifier.

    Uses the same parameter-carrying attributes as the training model
    (``bert``, ``fc1``, ``fc2``), so the saved state dict loads key-for-key.
    The training model's dropout layers are omitted: they hold no parameters
    and are inactive in eval mode.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.fc1 = nn.Linear(768, 512)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(512, num_classes)

    def forward(self, input_ids, attention_mask=None):
        """Return raw logits, one per class, for the given token batch."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.fc1(pooled_output)
        x = self.relu(x)
        x = self.fc2(x)
        return x


# Recreate the architecture and load the trained weights. The previous version
# first built a BertForSequenceClassification model and immediately replaced
# it; that throw-away instance was dead work and has been removed.
model = CustomBERTClassifier(num_classes=metadata["num_classes"])
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
model.to(device)
model.eval()

# Load tokenizer
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_PATH)

# Load mlb.
# NOTE: joblib.load unpickles the file and can execute arbitrary code; only
# load a file this project wrote itself.
mlb = joblib.load(MLB_PATH)

# Load BNS data
with open(BNS_JSON_PATH, "r", encoding="utf-8") as f:
    bns_data = json.load(f)

# Load keyword rules
with open(KEYWORD_RULES_PATH, "r", encoding="utf-8") as f:
    KEYWORD_RULES = json.load(f)





FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/firmanagementsystem/model/bert_bns_metadata.json'

In [1]:
# @title
# Trigger phrase (lower-case) -> section number (string).
#
# Every phrase is listed exactly once. The earlier version repeated about 25
# keys in a second half of the literal; Python silently keeps only the last
# value for a duplicated key, so the repeats added nothing and hid the real
# size of the table. The resulting mapping and its key order are unchanged.
#
# NOTE(review): the "<number> – <name>" labels in the comments are the
# author's own and are not checked against bns_final.json. The recorded run
# of this notebook prints the catalogue title of section 82 as "Marrying again
# during lifetime of husband or wife.", which does not match the
# "Child marriage" label below — verify every number against the catalogue.
KEYWORD_RULES = {

# ────────────────
# OFFENCES AGAINST HUMAN BODY
# ────────────────

# 103 – Murder
"killed him": "103",
"murdered": "103",
"intentionally caused death": "103",
"attacked and killed": "103",

# 105 – Culpable homicide
"caused death without intention": "105",
"death due to assault": "105",
"act likely to cause death": "105",

# 63 – Rape
"rape": "63",
"forcibly had sexual intercourse": "63",
"without her consent": "63",
"against her will": "63",

# 70 – Gang rape
"gang raped": "70",
"raped by multiple persons": "70",

# 68 – Sexual intercourse by person in authority
"sexual exploitation by authority": "68",
"sexual intercourse by superior": "68",

# 75 – Sexual harassment
"unwelcome sexual advances": "75",
"sexual harassment": "75",

# 76 – Disrobing woman
"attempted to disrobe": "76",
"forced her to remove clothes": "76",

# 64 – Grievous hurt
"caused grievous injuries": "64",
"fractured bones": "64",

# 65 – Voluntarily causing hurt
"caused bodily pain": "65",
"physically assaulted": "65",

# 66 – Causing death by negligence
"death due to negligence": "66",
"rash and negligent act caused death": "66",

# 67 – Sexual intercourse by husband during separation
"husband forced sexual intercourse during separation": "67",

# ────────────────
# CRIMINAL INTIMIDATION / EXTORTION
# ────────────────

# 503 – Criminal intimidation
"threatened me with harm": "503",
"threatened to kill": "503",

# 506 – Punishment for criminal intimidation
"criminal intimidation": "506",

# 350 – Extortion
"demanded money by threat": "350",
"extorted money": "350",


# 420 – Cheating
"cheated me of money": "420",
"dishonestly induced": "420",

# 322 – Dishonest transfer of property
"fraudulent transfer of property": "322",

# 329 – Cheating by personation
"impersonated an officer": "329",
"cheating by pretending": "329",

# 468 – Forgery
"fake records": "468",
"forged documents": "468",
"fake signature": "468",

# 335 – Making false document
"created false document": "335",


# 145 – Unlawful assembly
"unlawful assembly": "145",
"gathered unlawfully": "145",

# 147 – Rioting
"riot broke out": "147",
"violent mob": "147",

# 148 – Rioting with deadly weapons
"riot armed with weapons": "148",

# 149 – Waging war against Government
"collecting arms to wage war": "149",

"assaulted public servant": "160",

# 161 – Abetment of assault on superior officer
"abetment of assault on superior officer": "161",

# 172 – False evidence
"gave false evidence": "172",
"fabricated evidence": "172",

# 173 – False police report
"false complaint to police": "173",

# 175 – Obstructing public servant
"obstructed police officer": "175",

# 176 – Concealing offence
"concealed evidence": "176",

# 188 – Disobedience to order
"disobeyed lawful order": "188",

# 190 – Illegal complaint
"illegal complaint filed": "190",

# 200 – Public servant misconduct
"misuse of official position": "200",


# 361 – Kidnapping
"kidnapped a minor": "361",

# 369 – Abduction of child
"abducted a child": "369",

# 370 – Human trafficking
"human trafficking": "370",
"trafficked for exploitation": "370",

# 371 – Buying or selling person
"sold a person": "371",

# 372 – Exploitation of trafficked person
"exploited trafficked minor": "372",

# ────────────────
# MISC
# ────────────────

# 275 – Sale of noxious food
"sold adulterated food": "275",

# 289 – Negligent conduct
"negligent handling of machinery": "289",

# 312 – Attempted robbery with weapon
"attempted robbery with knife": "312",

# 318 – Causing death by negligence
"negligent act caused death": "318",

# 341 – Wrongful restraint
"wrongfully restrained": "341",

# 347 – Kidnapping to extort
"kidnapped for ransom": "347",

# 356 – Criminal breach by public servant
"abuse of official power": "356",

# 357 – Assault to deter public servant
"assaulted to deter public servant": "357",

# 72 – Disclosure of identity of victim
"revealed identity of rape victim": "72",

# 97 – General punishment
"punishable offence": "97",

# 82 – Child marriage
"child marriage": "82",
"married a minor girl": "82",
"minor girl was married": "82",
"forced marriage of minor": "82",

# ────────────────
# MEDIUM PRIORITY
# ────────────────

# 503 – Criminal intimidation (looser trigger than the phrases listed above)
"threatened me": "503",

}


In [None]:
def _match_keyword_sections(text, rules):
    """Return the set of section numbers whose trigger phrase occurs in ``text``."""
    import re  # local import: the notebook's import cell does not load `re`

    lowered = text.lower()
    matched = set()
    for phrase, section in rules.items():
        # The phrase must begin at the start of a word. Plain substring
        # matching made "rape" fire on "grape" and "therapeutic"; inflected
        # forms such as "raped" still match because only the left edge is
        # anchored.
        if re.search(r"(?<!\w)" + re.escape(phrase.lower()), lowered):
            matched.add(section)
    return matched


def _build_section_result(section_number, bns_data, confidence, source):
    """Build one result record for a section using its catalogue entry."""
    info = bns_data.get(section_number, {})
    return {
        "section_number": section_number,
        "title": info.get("title", "N/A"),
        "description": info.get("description", "N/A"),
        "punishment": ", ".join(
            [sub.get("punishment", "N/A") for sub in info.get("sub_sections", [])]
        ),
        "confidence": confidence,
        "source": source,
    }


def predict_bns_hybrid(complaint, model, tokenizer, mlb, bns_data, top_k=3,
                       keyword_rules=None, min_model_confidence=0.3):
    """Suggest sections for a complaint using keyword rules plus the classifier.

    Args:
        complaint: free-text complaint.
        model: callable classifier; invoked as ``model(**inputs)`` with the
            tokenizer output (minus ``token_type_ids``) and expected to return
            logits with one row per input text.
        tokenizer: tokenizer whose output supports ``.to(device)``.
        mlb: fitted binarizer; ``mlb.classes_[i]`` names logit column ``i``.
        bns_data: dict mapping section number to its catalogue entry.
        top_k: maximum number of model predictions considered; values <= 0
            disable model predictions (previously ``top_k=0`` returned every
            class because of the ``[-0:]`` slice).
        keyword_rules: phrase -> section mapping; defaults to the notebook's
            global ``KEYWORD_RULES``.
        min_model_confidence: a model prediction is kept only when its
            probability is strictly greater than this value.

    Returns:
        List of result dicts (``section_number``, ``title``, ``description``,
        ``punishment``, ``confidence``, ``source``) sorted by confidence,
        highest first. Keyword hits carry confidence 1.0 and source
        ``"keyword"``; model hits carry the sigmoid probability and source
        ``"model"``. A section found by keyword is never repeated as a model
        hit.

    Relies on the notebook-level ``device`` for tensor placement.
    """
    model.eval()

    rules = KEYWORD_RULES if keyword_rules is None else keyword_rules

    # ==========================
    # 1. KEYWORD MATCHING FIRST
    # ==========================
    keyword_sections = _match_keyword_sections(complaint, rules)

    # Sorted so the order of equally-confident keyword hits is deterministic.
    results = [
        _build_section_result(sec, bns_data, 1.0, "keyword")
        for sec in sorted(keyword_sections)
    ]

    # ==========================
    # 2. BERT PREDICTION
    # ==========================
    inputs = tokenizer(
        complaint,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512
    ).to(device)

    # The classifier's forward() takes only input_ids and attention_mask.
    if "token_type_ids" in inputs:
        inputs.pop("token_type_ids")

    with torch.no_grad():
        logits = model(**inputs)
        probs = torch.sigmoid(logits).cpu().numpy()[0]

    # Indices of the top_k most probable classes, most probable first.
    top_indices = probs.argsort()[::-1][:max(top_k, 0)]

    for idx in top_indices:
        sec = str(mlb.classes_[idx])
        score = float(probs[idx])
        if sec not in keyword_sections and score > min_model_confidence:
            results.append(_build_section_result(sec, bns_data, score, "model"))

    # ==========================
    # 3. SORT: KEYWORDS FIRST
    # ==========================
    # Keyword hits have confidence 1.0 and the sort is stable, so they stay
    # ahead of model hits. No separate de-duplication pass is needed: keyword
    # sections come from a set and model hits skip those sections.
    results.sort(key=lambda x: x["confidence"], reverse=True)

    return results


In [None]:
# Smoke test: run the hybrid predictor on a single short complaint.
test_complaint = "Child marriage"

predictions = predict_bns_hybrid(
    test_complaint,
    model,
    tokenizer,
    mlb,
    bns_data,
    top_k=3,
)

# Print the headline fields of each suggestion, separated by a rule line.
for prediction in predictions:
    print("Section:", prediction["section_number"])
    print("Title:", prediction["title"])
    print("Confidence:", prediction["confidence"])
    print("=" * 50)


Section: 82
Title: Marrying again during lifetime of husband or wife.
Confidence: 1.0
