In [None]:
%pip install -r requirements.txt

In [None]:
import os, json, wandb
import numpy as np 
import pandas as pd
import polars as pl
from tqdm.auto import tqdm

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer, CountVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder

import spacy
from sentence_transformers import SentenceTransformer
from lightgbm import LGBMClassifier

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from datasets import Dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

2024-11-07 22:17:04.500263: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1730998024.557158   32082 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1730998024.572913   32082 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-07 22:17:04.822018: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Importing Dataset and Analysing Data

In [2]:
train = pl.read_csv("train.csv")
test = pl.read_csv("test.csv")

In [3]:


# Create histograms
hist1 = go.Histogram(x=train["crimeaditionalinfo"].str.len_chars().to_numpy(), name='Train')
hist2 = go.Histogram(x=test["crimeaditionalinfo"].str.len_chars().to_numpy(), name='Test')

# Combine histograms into a single figure
fig = go.Figure(data=[hist1, hist2])

# Update layout
fig.update_layout(title="Length of Additional Info", barmode='overlay')

# Show plot
fig.show()

### The maximum length of the text provided is 1500

In [4]:
train.null_count(), test.null_count()

(shape: (1, 3)
 ┌──────────┬──────────────┬────────────────────┐
 │ category ┆ sub_category ┆ crimeaditionalinfo │
 │ ---      ┆ ---          ┆ ---                │
 │ u32      ┆ u32          ┆ u32                │
 ╞══════════╪══════════════╪════════════════════╡
 │ 0        ┆ 6591         ┆ 21                 │
 └──────────┴──────────────┴────────────────────┘,
 shape: (1, 3)
 ┌──────────┬──────────────┬────────────────────┐
 │ category ┆ sub_category ┆ crimeaditionalinfo │
 │ ---      ┆ ---          ┆ ---                │
 │ u32      ┆ u32          ┆ u32                │
 ╞══════════╪══════════════╪════════════════════╡
 │ 0        ┆ 2236         ┆ 7                  │
 └──────────┴──────────────┴────────────────────┘)

### Some text is null  and many sub_category are also null

In [5]:
#  train set with sub categories of null value
train.filter(pl.col("crimeaditionalinfo").is_null()).group_by(["category", "sub_category"]).len()

category,sub_category,len
str,str,u32
"""Online Financial Fraud""","""Internet Banking Related Fraud""",1
"""Online Financial Fraud""","""DebitCredit Card FraudSim Swap…",3
"""Online Financial Fraud""","""UPI Related Frauds""",13
"""Any Other Cyber Crime""","""Other""",1
"""Online and Social Media Relate…","""Profile Hacking Identity Theft""",1
"""Online Financial Fraud""","""Fraud CallVishing""",1
"""Online and Social Media Relate…","""Cheating by Impersonation""",1


In [6]:
#  test set with sub categories of null value
test.filter(pl.col("crimeaditionalinfo").is_null()).group_by(["category", "sub_category"]).len()

category,sub_category,len
str,str,u32
"""Online Financial Fraud""","""UPI Related Frauds""",4
"""Sexually Obscene material""",,1
"""Online Financial Fraud""","""DebitCredit Card FraudSim Swap…",1
"""Online Financial Fraud""","""Fraud CallVishing""",1


### As Most NULL text field have "Online Financial Fraud" as category and "UPI Related Frauds" as sub_category So we should default this behaviour according to data distribution since no information can be extracted from NULL value

In [7]:
#  Drop null values
train = train.drop_nulls("crimeaditionalinfo")
test = test.drop_nulls("crimeaditionalinfo")

In [8]:
#  Fill null values with NULL string
train = train.fill_null("NULL")
test = test.fill_null("NULL")

In [9]:
#  converting all strings to lowercase and stripping whitespaces
train = train.with_columns(pl.col("crimeaditionalinfo").str.to_lowercase().str.strip_chars(),
                           pl.col("category").str.to_lowercase().str.strip_chars(),
                           pl.col("sub_category").str.to_lowercase().str.strip_chars())
test = test.with_columns(pl.col("crimeaditionalinfo").str.to_lowercase().str.strip_chars(),
                         pl.col("category").str.to_lowercase().str.strip_chars(),
                         pl.col("sub_category").str.to_lowercase().str.strip_chars())

In [10]:
# Same number of categories
train["category"].n_unique(), test["category"].n_unique()

(15, 15)

In [11]:
# test set has more sub categories
train["sub_category"].n_unique(), test["sub_category"].n_unique()

(36, 38)

In [12]:
#  train set with sub categories of null value
train.filter(pl.col("sub_category").is_null()).group_by(["category", "sub_category"]).len()

category,sub_category,len
str,str,u32


In [13]:
#  test set with sub categories of null value
test.filter(pl.col("sub_category").is_null()).group_by(["category", "sub_category"]).len()

category,sub_category,len
str,str,u32


### Since the sub categories of NULL values have these category we can just group them

In [14]:
#  Number of unique groups of categories and sub categories in train
train.group_by(["category", "sub_category"]).len().n_unique()

40

In [15]:
#  Number of unique groups of categories and sub categories in test
test.group_by(["category", "sub_category"]).len().n_unique()

42

In [16]:
#  Combining category and sub category

train = train.with_columns(pl.Series("joined", train["category"] + " - " + train["sub_category"]))
test = test.with_columns(pl.Series("joined", test["category"] + " - " + test["sub_category"]))

In [17]:
#  Since Test has more category and sub categories combination then we can discard them as they are not present in train
set(test["joined"].unique()) - set(train["joined"].unique())

{'crime against women & children - computer generated csam/csem',
 'crime against women & children - cyber blackmailing & threatening',
 'crime against women & children - sexual harassment'}

## Preprocessing text

In [None]:
#  Load spacy model
spacy.prefer_gpu()
spacy_nlp = spacy.load("en_core_web_md")

In [21]:
#  lemmatizing and removing punctuations
train_text = []
for text in tqdm(train["crimeaditionalinfo"]):
    train_text.append(" ".join([d.lemma_ for d in spacy_nlp(text) if not d.is_punct]))
test_text = []
for text in tqdm(test["crimeaditionalinfo"]):
    test_text.append(" ".join([d.lemma_ for d in spacy_nlp(text) if not d.is_punct]))

  0%|          | 0/93665 [00:00<?, ?it/s]

  0%|          | 0/31222 [00:00<?, ?it/s]

converting all strings to lowercase and stripping whitespaces and added lemmaized text to train and test

In [22]:
train = train.with_columns(pl.Series("text", train_text).str.to_lowercase().str.strip_chars())
test = test.with_columns(pl.Series("text", test_text).str.to_lowercase().str.strip_chars())

label encoding the combined category and sub category

In [24]:
le = LabelEncoder()
x = le.fit_transform(train["joined"])
train = train.with_columns(pl.Series("label", x))
le.classes_ = np.append(le.classes_, "null")

x = le.transform(["null" if x not in le.classes_ else x for x in test["joined"]])
test = test.with_columns(pl.Series("label", x))

In [None]:
#  Saving cleaned train and test to csv
train.write_csv("train_cleaned.csv")
test.write_csv("test_cleaned.csv")

## Model training and Evaluating

### TFIDF Classifier

In [25]:
#  TF-IDF vectorizer
tf_idf = TfidfVectorizer()
train_tf = tf_idf.fit_transform(train["text"])
test_tf = tf_idf.transform(test["text"])

In [26]:
# LGBM Classifier 
naive_bayes_classifier = LGBMClassifier()
naive_bayes_classifier.fit(train_tf, train["label"])

y_pred = naive_bayes_classifier.predict(test_tf)
accuracy_score(test["label"], y_pred), confusion_matrix(test["label"], y_pred)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 3.191275 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 369040
[LightGBM] [Info] Number of data points in the train set: 93665, number of used features: 5779
[LightGBM] [Info] Start training from score -2.153074
[LightGBM] [Info] Start training from score -5.509944
[LightGBM] [Info] Start training from score -5.273694
[LightGBM] [Info] Start training from score -5.265395
[LightGBM] [Info] Start training from score -5.224904
[LightGBM] [Info] Start training from score -5.155911
[LightGBM] [Info] Start training from score -5.191730
[LightGBM] [Info] Start training from score -5.167084
[LightGBM] [Info] Start training from score -5.216998
[LightGBM] [Info] Start training from score -5.199437
[LightGBM] [Info] Start training from score -6.366076
[LightGBM] [Info] Start training from score -6.765349
[LightGBM] [Info] Start training from score -5.592408
[Ligh

(0.36935494202805713,
 array([[613,  45,  60, ...,  52,  66,   0],
        [ 18,  23,   2, ...,   3,   4,   0],
        [ 15,   2,  39, ...,   0,   2,   0],
        ...,
        [ 80,  15,   4, ...,  37,  35,   0],
        [ 86,  27,   5, ...,  38,  57,   0],
        [  0,   0,   0, ...,   0,   0,   0]]))

#### TFIDF classifier is getting only ~36% accuracy

### Embedding based classifier

In [None]:
#  Sentence Transformer
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
#  Encode train and test text
train_emb = model.encode(train["text"].to_list(), batch_size=128, show_progress_bar=True, device="cuda")
test_emb = model.encode(test["text"].to_list(), batch_size=128, show_progress_bar=True, device="cuda")

In [None]:
# LGBM Classifier 
naive_bayes_classifier = LGBMClassifier()
naive_bayes_classifier.fit(train_emb, train["label"])

y_pred = naive_bayes_classifier.predict(test_emb)
accuracy_score(test["label"], y_pred), confusion_matrix(test["label"], y_pred)

#### Embedding based classifier is getting only ~36% accuracy

### Neural Network based classifier

In [None]:
#  Sentence Transformer
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
#  Encode train and test text
train_emb = model.encode(train["text"].to_list(), batch_size=128, show_progress_bar=True, device="cuda")
test_emb = model.encode(test["text"].to_list(), batch_size=128, show_progress_bar=True, device="cuda")

In [None]:
#  the NN model architecture
class TextClassifier(nn.Module):
    def __init__(self, input_dim, num_classes):
        super(TextClassifier, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.layer2 = nn.Sequential(
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.layer3 = nn.Sequential(
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Dropout(0.2)
        )
        self.output = nn.Linear(128, num_classes)
        
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        return self.output(x)


In [None]:
# Convert data to PyTorch tensors
X_train = torch.FloatTensor(train_emb)
y_train = torch.LongTensor(train["label"].to_numpy())
X_test = torch.FloatTensor(test_emb)
y_test = torch.LongTensor(test["label"].to_numpy())

# Create data loaders
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataset = TensorDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=64)

In [None]:
# Initialize model, loss function and optimizer
model = TextClassifier(train_emb.shape[1], train["label"].n_unique())
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [None]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
    
    # Evaluation
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_X, batch_y in test_loader:
            outputs = model(batch_X)
            _, predicted = torch.max(outputs.data, 1)
            total += batch_y.size(0)
            correct += (predicted == batch_y).sum().item()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Test Accuracy: {100 * correct / total:.2f}%')

#### Neural network based classifier is getting only ~54% accuracy

### BERT based classifier

In [None]:
#  set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#  Load pre-trained DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

#  Initialize data collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#  Load pre-trained DistilBERT model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=train["label"].n_unique())
model.to(device)

In [None]:
#  Load train and test data
train_dataset = Dataset.from_polars(train)
test_dataset = Dataset.from_polars(test)

def tokenize_data(examples):
    return tokenizer(examples["crimeaditionalinfo"], truncation=True)

#  Tokenize train and test data
tokenized_train = train_dataset.map(tokenize_data, batched=True)
tokenized_test = test_dataset.map(tokenize_data, batched=True)

In [None]:
#  Initialize wandb for logging
wandb.init(project='Govt Hackathon', job_type="training", anonymous="allow")

In [None]:
#  Initialize training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=2,
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.1,
    logging_strategy="steps",
    logging_steps=1,
    report_to="wandb",
)

#  Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

In [None]:
#  Train the model
trainer.train()

In [None]:
#  Save the model and finish wandb
trainer.save_model('model/distillbert_finetuned_1')
wandb.finish()

In [None]:
#  Load Finetuned DistilBERT model
model = AutoModelForSequenceClassification.from_pretrained("model/distillbert_finetuned_1", num_labels=len(le.classes_))
model.to(device)
tokenizer = AutoTokenizer.from_pretrained("model/distillbert_finetuned_1")

In [None]:
#  Predict function to make predictions on test data
def predict_text(text, model, tokenizer):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    return logits.argmax().item()

#  Evaluate model on test dataset
def evaluate_model(test_df, model, tokenizer):
    predictions = []
    # Use tqdm progress bar for iteration
    for text in tqdm(test_df['crimeaditionalinfo'], desc="Making predictions"):
        pred = predict_text(text, model, tokenizer)
        predictions.append(pred)
    
    # Calculate accuracy
    accuracy = accuracy_score(test_df['label'], predictions)
    
    return {
        'accuracy': accuracy,
        'predictions': predictions
    }


In [None]:
#  Evaluate model and get the accuracy score
results = evaluate_model(test.to_pandas(), model, tokenizer)
test.with_columns(pl.Series("result", results["predictions"]))

test.write_csv("result.csv")
print(f"Model Accuracy: {results['accuracy']:.4f}")

#### BERT based classifier is getting only ~55% accuracy

### Finetuned LLM model

In [None]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from trl import SFTTrainer, setup_chat_format

In [None]:
base_model = "HuggingFaceTB/SmolLM2-135M-Instruct"
torch_dtype = torch.float16
attn_implementation = "eager"

In [None]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

In [None]:
instruction = "Analyze the provided text from a Cybercrime Prevention Assisatant perspective. Identify the category and sub_category of the complaint reported. Answer in json format only. The Format must follow like this {'category': 'category', 'sub_category': 'sub_category'}"
def format_chat_template(row):
    
    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["crimeaditionalinfo"]},
               {"role": "assistant", "content": row["output"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = Dataset.from_pandas(train.to_pandas()).map(
    format_chat_template,
    num_proc= 6,
)
dataset = dataset.train_test_split(0.2)

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [None]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [None]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir="output/smolm2-finetune-1",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.1,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb",
    resume_from_checkpoint=True,
)

In [None]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length= 1500,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)

In [None]:
trainer.train()

In [None]:
wandb.finish()
trainer.save_model("output/smolm2-finetune-1")

In [None]:
def generate_predictions(data, model, tokenizer, instruction, num_samples=100):
    predictions = []
    ground_truth = []
    
    for i in range(num_samples):
        # Create messages for each sample
        messages = [
            {"role": "system", "content": instruction},
            {"role": "user", "content": data[i]["crimeaditionalinfo"]}
        ]
        
        # Generate prediction
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")
        outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)
        pred_text = tokenizer.decode(outputs[0], skip_special_tokens=True).split("assistant")[1].strip()
        
        # Store prediction and ground truth
        predictions.append(pred_text)
        ground_truth.append(data[i]["joined"])  # Assuming 'joined' is the ground truth column
        
    return predictions, ground_truth

In [None]:
# Generate predictions
predictions, ground_truth = generate_predictions(test.to_pandas(), model, tokenizer, instruction)

# Calculate metrics
accuracy = accuracy_score(ground_truth, predictions)

print(f"Accuracy: {accuracy:.4f}")

#### Due to compute, time contraints the accuracy of finetuned LLM model was not able to complete

## Conclusion

The dataset with complex text data and classification tags presented a unique challenge of classifiying the crime info
As I have tested with many models and architecture I was unable to achieve more than 60% classification accuracy on any of the models So, the classification task was indeed very diverse and difficult to predict.

| Model | Accuracy |
|--------|-----------|
| TFIDF & LGBClassifier | 36% |
| SentenceTransformer Embedding & LGBClassifier | 36% |
| Neural Network & LGBClassifier | 54% |
| DistilBERT finetuned | 55% |