In [2]:
# Import Libraries
import inspect
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments




In [None]:
# Locations of the CSV files inside the Colab runtime.
train_path = '/content/train.csv'
test_path = '/content/test.csv'

# Read both splits into DataFrames, train first and test second.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)



In [None]:
# Preview the head of the training frame under a heading.
print("Training Data Overview:", train_data.head(), sep="\n")

# Replace missing cells: empty string for the complaint text,
# "Unknown" for the target category.
missing_value_fill = {
    'crimeaditionalinfo': "",
    'sub_category': "Unknown",
}
for column_name, fill_value in missing_value_fill.items():
    train_data[column_name] = train_data[column_name].fillna(fill_value)



In [None]:
# Fold every sub-category with fewer than `min_class_threshold` rows into 'Other'.
min_class_threshold = 5  # a class needs at least this many samples to be kept
class_counts = train_data['sub_category'].value_counts()
rare_classes = class_counts.loc[class_counts.lt(min_class_threshold)].index
train_data['sub_category'] = train_data['sub_category'].replace(
    to_replace=rare_classes,
    value='Other',
)

# Features are the complaint texts; labels are the (merged) sub-categories.
X, y = (train_data[column] for column in ('crimeaditionalinfo', 'sub_category'))



In [None]:
# Turn the string labels into integer class ids.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
)



In [None]:
# Tokenizer used by the dataset wrapper defined below.
bert_checkpoint = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)

class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Raw texts. May be a pandas Series (any index) or any
            positionally indexable sequence such as a list or NumPy array.
        labels: Integer class ids aligned position-by-position with ``texts``.
            Accepts the same container types as ``texts``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length each sample is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` do not have the same length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of samples."""
        return len(self.texts)

    @staticmethod
    def _item_at(container, position):
        """Return the element at integer ``position``, ignoring any pandas index labels."""
        # A pandas object indexed with [] looks up by *label*; after a shuffled
        # train/test split that is either a KeyError or the wrong row.
        if hasattr(container, 'iloc'):
            return container.iloc[position]
        return container[position]

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return the tensors a classification model consumes.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (leading batch
            dimension of the tokenizer output removed) and ``labels`` (scalar
            ``torch.long`` tensor).
        """
        text = self._item_at(self.texts, idx)
        label = self._item_at(self.labels, idx)

        # The tokenizer only accepts strings: treat missing values (None/NaN)
        # as empty text and stringify anything else.
        if not isinstance(text, str):
            text = "" if pd.isna(text) else str(text)

        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # return_tensors='pt' yields a leading batch dimension of 1; drop it
            # so that a DataLoader can stack samples into a batch itself.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(int(label), dtype=torch.long)
        }

# Wrap the train and evaluation splits in the tokenizing dataset.
train_dataset = CustomDataset(texts=X_train, labels=y_train, tokenizer=tokenizer)
test_dataset = CustomDataset(texts=X_test, labels=y_test, tokenizer=tokenizer)

# Pretrained BERT with a classification head sized to the number of encoded classes.
num_labels = len(label_encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)



In [None]:
# Define Trainer Arguments
# transformers renamed `evaluation_strategy` to `eval_strategy` and later dropped
# the old spelling; pick whichever keyword the installed version accepts so the
# cell runs on both old and new releases.
eval_strategy_kwarg = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy="epoch",  # must match the evaluation strategy for load_best_model_at_end
    save_total_limit=2,
    load_best_model_at_end=True,
    **{eval_strategy_kwarg: "epoch"},
)


def compute_metrics(eval_pred):
    """Compute accuracy and support-weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Object exposing ``predictions`` (per-class scores, one row
            per sample) and ``label_ids`` (true class ids).

    Returns:
        dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    # Take the arg-max once instead of once per metric.
    predicted_ids = np.argmax(eval_pred.predictions, axis=1)
    true_ids = eval_pred.label_ids
    # zero_division=0 keeps the score at 0 for classes that are never
    # predicted (the value scikit-learn already uses) without a warning per call.
    return {
        'accuracy': accuracy_score(true_ids, predicted_ids),
        'precision': precision_score(true_ids, predicted_ids, average='weighted', zero_division=0),
        'recall': recall_score(true_ids, predicted_ids, average='weighted', zero_division=0),
        'f1': f1_score(true_ids, predicted_ids, average='weighted', zero_division=0),
    }


# Define Trainer
# Newer transformers releases take the tokenizer as `processing_class`; older
# ones only know `tokenizer`. Choose by inspecting the installed signature.
tokenizer_kwarg = (
    'processing_class'
    if 'processing_class' in inspect.signature(Trainer.__init__).parameters
    else 'tokenizer'
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    **{tokenizer_kwarg: tokenizer},
)



In [None]:
# Fine-tune the model.
trainer.train()

# Score the evaluation split and report the four headline metrics.
results = trainer.evaluate()
metric_report = [
    "\nEvaluation Metrics:",
    f"Accuracy: {results['eval_accuracy']:.4f}",
    f"Precision: {results['eval_precision']:.4f}",
    f"Recall: {results['eval_recall']:.4f}",
    f"F1-Score: {results['eval_f1']:.4f}",
]
print("\n".join(metric_report))

# Persist the model first, then the tokenizer, into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('/content/bert_model')

print("Model and tokenizer saved to /content/bert_model")



In [None]:
# Load and Predict
def predict_complaint(text, model, tokenizer, label_encoder, max_len=128):
    """Classify a single complaint text and return its decoded label.

    Args:
        text: Complaint text to classify.
        model: Sequence-classification model whose output exposes ``.logits``.
            It is switched to eval mode as a side effect.
        tokenizer: Tokenizer exposing ``encode_plus``.
        label_encoder: Object whose ``inverse_transform`` maps class ids back
            to label names.
        max_len: Length the input is padded or truncated to.

    Returns:
        The label that ``label_encoder`` associates with the highest-scoring class.
    """
    model.eval()
    encoding = tokenizer.encode_plus(
        text,
        max_length=max_len,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # The tokenizer returns CPU tensors. Move them to wherever the model's
    # weights live, otherwise a model sitting on GPU fails with a
    # device-mismatch error.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits

    prediction = torch.argmax(logits, dim=1).item()
    return label_encoder.inverse_transform([prediction])[0]



In [None]:
# Smoke test: reload the saved artifacts from disk and classify one example.
sample_complaint = "I applied for a loan online but the lender is blackmailing me for more money."
loaded_model = BertForSequenceClassification.from_pretrained('/content/bert_model')
loaded_tokenizer = BertTokenizer.from_pretrained('/content/bert_model')
predicted_category = predict_complaint(
    text=sample_complaint,
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    label_encoder=label_encoder,
)
print(f"Predicted Sub-Category for the Complaint: {predicted_category}")