In [1]:
import os, json, wandb
import torch
import numpy as np 
import pandas as pd
import polars as pl
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

In [2]:
# Read the train and test splits from the working directory as Polars frames.
train, test = (pl.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [3]:
def normalize_complaints(frame):
    """Lower-case the label/text columns and add the combined ``output`` target.

    Parameters
    ----------
    frame : pl.DataFrame
        Frame with ``category``, ``sub_category`` and ``crimeaditionalinfo``
        string columns.

    Returns
    -------
    pl.DataFrame
        A new frame where ``category`` and ``sub_category`` are lower-cased
        (missing sub-categories become the literal ``"NULL"``), the complaint
        text is lower-cased and stripped, and ``output`` holds
        ``"<category> - <sub_category>"``.
    """
    category = pl.col("category").str.to_lowercase()
    sub_category = pl.col("sub_category").str.to_lowercase().fill_null("NULL")
    return frame.with_columns(
        category,
        sub_category,
        pl.col("crimeaditionalinfo").str.to_lowercase().str.strip_chars(),
        # Native string concatenation instead of a per-row Python lambda:
        # faster, and a null category yields a null output rather than raising.
        pl.concat_str([category, sub_category], separator=" - ").alias("output"),
    )


# Apply the exact same normalisation to both splits.
train = normalize_complaints(train)
test = normalize_complaints(test)

### For null text, use category = "online financial fraud", sub_category = "upi related frauds"

In [4]:
# train.filter(pl.col("len").is_null()).group_by(["category", "sub_category"]).len()

In [5]:
# Discard complaints that have no free-text description, in both splits.
train, test = (
    split.filter(pl.col("crimeaditionalinfo").is_not_null())
    for split in (train, test)
)

In [6]:
# Fit the encoder on the training targets only and attach the integer labels.
le = LabelEncoder()
train_labels = le.fit_transform(train["output"])
train = train.insert_column(4, pl.Series(name="label", values=train_labels))

# Reserve one extra class id ("NULL") for test targets never seen in training,
# so that transforming the test split cannot fail on unknown labels.
le.classes_ = np.append(le.classes_, "NULL")

# A set gives constant-time membership checks instead of scanning the class
# array once per test row.
known_outputs = set(le.classes_)
test_labels = le.transform(
    [target if target in known_outputs else "NULL" for target in test["output"]]
)
test = test.insert_column(4, pl.Series(name="label", values=test_labels))

In [7]:
# Prefer the GPU when one is visible; otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

train_dataset = Dataset.from_polars(train)
# Evaluate on a third of the test split during training. The seed pins the
# subset so evaluation losses are comparable across notebook runs.
test_dataset = Dataset.from_polars(test.sample(fraction=0.33, seed=42))

def tokenize_data(examples):
    """Tokenize a batch of complaint texts.

    ``examples`` is the batch passed in by ``Dataset.map(..., batched=True)``;
    its ``crimeaditionalinfo`` entry is run through the tokenizer with
    truncation enabled. Padding is not applied here.
    """
    return tokenizer(examples["crimeaditionalinfo"], truncation=True)

tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)



Map:   0%|          | 0/93665 [00:00<?, ? examples/s]

Map:   0%|          | 0/10303 [00:00<?, ? examples/s]

In [8]:
# Open a Weights & Biases run for this training session; anonymous logging is
# permitted when no account is configured.
run = wandb.init(
    anonymous="allow",
    job_type="training",
    project="Govt Hackathon",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112998899999284, max=1.0…

In [9]:
# Classification head sized to the label encoder (training classes + "NULL").
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(le.classes_)).to(device)

# Pads each batch dynamically to its longest sequence.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./results",
    # NOTE(review): 2e-4 is on the high side for fine-tuning a BERT-style
    # encoder (2e-5 to 5e-5 is the commonly recommended range) — confirm.
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    # Mixed precision needs an accelerator; hard-coding True makes
    # TrainingArguments raise on the CPU fallback path.
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    eval_strategy="steps",
    # A float below 1 is interpreted as a fraction of total training steps.
    eval_steps=0.1,
    logging_strategy="steps",
    logging_steps=1,
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# Run fine-tuning with the Trainer configured above; the returned TrainOutput
# is displayed as the cell result.
trainer.train()



Step,Training Loss,Validation Loss
586,1.2428,1.680815
1172,2.2733,1.67786
1758,2.0238,1.598853
2344,1.2744,1.591804
2930,2.1478,1.601974
3516,1.3411,1.596113
4102,1.1855,1.507481
4688,1.3665,1.494886
5274,1.6267,1.481625


TrainOutput(global_step=5854, training_loss=1.6473084893351249, metrics={'train_runtime': 19904.5481, 'train_samples_per_second': 4.706, 'train_steps_per_second': 0.294, 'total_flos': 5243773357673088.0, 'train_loss': 1.6473084893351249, 'epoch': 0.9999145956102143})

In [11]:
# Persist the fine-tuned model to disk, then close the W&B run opened earlier
# with wandb.init().
trainer.save_model('model/distillbert_finetuned_1')
wandb.finish()

VBox(children=(Label(value='0.016 MB of 0.016 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,██▅▅▅▅▂▁▁
eval/runtime,█▄▅▁▁▅▁▁▁
eval/samples_per_second,▁▅▄██▄███
eval/steps_per_second,▁▅▄██▄███
train/epoch,▁▁▁▁▂▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇█████
train/global_step,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇████
train/grad_norm,▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▂▁▃▁▁▁▂▁▂▁▂▂▂█▁▁▁▁▄▁▂▁▁▁▁
train/learning_rate,█▇▇▇▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▄▄▄▄▃▃▃▃▃▂▂▁▁▁▁▁▁▁▁▁
train/loss,▇▃█▅▆▄▃█▅▂▆▅▄▂▂▃▃▂▆▆▄▄▅▄▁▃▄▄▆▄▄▄▅▅▄▃▄▆▄▄

0,1
eval/loss,1.48163
eval/runtime,540.765
eval/samples_per_second,19.053
eval/steps_per_second,2.382
total_flos,5243773357673088.0
train/epoch,0.99991
train/global_step,5854.0
train/grad_norm,5.32764
train/learning_rate,0.0
train/loss,0.6912


In [12]:
# NOTE(review): this saves to the same path as the previous cell, so the call
# looks redundant — confirm before removing.
trainer.save_model('model/distillbert_finetuned_1')

In [13]:
# Reload the tokenizer and the fine-tuned classifier from the saved directory,
# placing the model on the selected device.
tokenizer = AutoTokenizer.from_pretrained("model/distillbert_finetuned_1")
model = AutoModelForSequenceClassification.from_pretrained(
    "model/distillbert_finetuned_1",
    num_labels=len(le.classes_),
).to(device)

In [14]:
def predict_text(text, model, tokenizer):
    """
    Process single text input and return predicted class

    Args:
        text: The text to classify.
        model: A torch module whose forward pass returns an object exposing
            ``.logits``.
        tokenizer: Callable that turns ``text`` into model inputs and whose
            result supports ``.to(device)``.

    Returns:
        int: Index of the largest logit (taken over the flattened logits, so
        this is the class id only for a single input text).
    """
    # Put the inputs on whatever device the given model lives on, instead of
    # relying on a notebook-level `device` global that may not match it.
    model_device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(model_device)

    # Inference only: no gradients needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    return logits.argmax().item()

def evaluate_model(test_df, model, tokenizer):
    """
    Score every complaint text in ``test_df`` and measure accuracy.

    Each entry of ``test_df['crimeaditionalinfo']`` is classified one at a
    time with ``predict_text``; the predictions are then compared against
    ``test_df['label']``.

    Returns a dict with the keys ``'accuracy'`` and ``'predictions'``.
    """
    # Wrap the text column in a progress bar while predicting
    texts = tqdm(test_df['crimeaditionalinfo'], desc="Making predictions")
    predicted_labels = [predict_text(entry, model, tokenizer) for entry in texts]

    return {
        'accuracy': accuracy_score(test_df['label'], predicted_labels),
        'predictions': predicted_labels,
    }

# Score the full test split with the reloaded model.
results = evaluate_model(test.to_pandas(), model, tokenizer)

# with_columns returns a new frame rather than modifying `test`, so the result
# must be kept — otherwise result.csv is written without the predictions.
test = test.with_columns(pl.Series("result", results["predictions"]))

test.write_csv("result.csv")
print(f"Model Accuracy: {results['accuracy']:.4f}")


Making predictions:   0%|          | 0/31222 [00:00<?, ?it/s]

Model Accuracy: 0.5348
