In [1]:
import os
import json
import string
import torch, wandb
import numpy as np 
import pandas as pd
import polars as pl
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import matplotlib.pyplot as plt
from datasets import Dataset
import bitsandbytes as bnb
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
from trl import SFTTrainer, setup_chat_format

In [2]:
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mrony00013[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
train = pl.read_csv("train.csv")
test = pl.read_csv("test.csv")

In [4]:
train = train.with_columns(pl.col("category").str.to_lowercase(), 
                    pl.col("sub_category").str.to_lowercase().fill_null("NULL"), 
                    pl.col("crimeaditionalinfo").str.to_lowercase(),
                    (pl.struct(["category", "sub_category"])
                    .map_elements(lambda e: json.dumps({"category": e["category"], "sub_category": e["sub_category"]}), return_dtype=pl.String))
                    .alias("output"))
test = test.with_columns(pl.col("category").str.to_lowercase(), 
                    pl.col("sub_category").str.to_lowercase().fill_null("NULL"), 
                    pl.col("crimeaditionalinfo").str.to_lowercase(),
                    (pl.struct(["category", "sub_category"])
                    .map_elements(lambda e: json.dumps({"category": e["category"], "sub_category": e["sub_category"]}), return_dtype=pl.String))
                    .alias("output"))

### For Null text use category = "online financial fraud",  sub_category = "upi related frauds"

In [5]:
# train.filter(pl.col("len").is_null()).group_by(["category", "sub_category"]).len()

In [6]:
train = train.filter(pl.col("crimeaditionalinfo").is_not_null())
test = test.filter(pl.col("crimeaditionalinfo").is_not_null())

In [7]:
run = wandb.init(
    project='Fine-tune SmolLM2', 
    job_type="training", 
    anonymous="allow"
)

In [8]:
base_model = "HuggingFaceTB/SmolLM2-135M-Instruct"
torch_dtype = torch.float16
attn_implementation = "eager"

In [9]:
# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)

In [10]:
instruction = "Analyze the provided text from a Cybercrime Prevention Assisatant perspective. Identify the category and sub_category of the complaint reported. Answer in json format only. The Format must follow like this {'category': 'category', 'sub_category': 'sub_category'}"
def format_chat_template(row):
    
    row_json = [{"role": "system", "content": instruction },
               {"role": "user", "content": row["crimeaditionalinfo"]},
               {"role": "assistant", "content": row["output"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = Dataset.from_pandas(train.to_pandas()).map(
    format_chat_template,
    num_proc= 6,
)
dataset = dataset.train_test_split(0.2)

Map (num_proc=6):   0%|          | 0/93665 [00:00<?, ? examples/s]

In [11]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

In [12]:
# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config)

In [None]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir="output/smolm2-finetune-1",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.1,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to="wandb",
    resume_from_checkpoint=True,
)

In [14]:
# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length= 1500,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing= False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/74932 [00:00<?, ? examples/s]

Map:   0%|          | 0/18733 [00:00<?, ? examples/s]

In [15]:
trainer.train()



Step,Training Loss,Validation Loss
3747,0.5114,1.438532
7494,0.4059,1.381283
11241,0.3133,1.342507
14988,0.6398,1.320972
18735,1.219,1.30311
22482,1.4739,1.291177
26229,0.7237,1.281178


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
wandb.finish()

VBox(children=(Label(value='0.005 MB of 0.005 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/loss,█▅▃▁▁
eval/runtime,█▁▁▅▇
eval/samples_per_second,▁▇█▃▂
eval/steps_per_second,▁▇█▃▂
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█
train/global_step,▁▁▁▁▂▂▂▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▅▅▆▆▇▇▇▇▇▇▇▇██████
train/grad_norm,▁▃▃▂▄▃▄▃▄▃▃▃▁▄▃▄▁▂▅▂▅▅▄▄▄▄▅▆▄▅▃█▅▃▄▄▅▅▃▄
train/learning_rate,██▇▇▇▆▆▆▆▅▅▅▅▅▅▅▅▄▄▄▃▃▃▃▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▁
train/loss,▇▃▄▃▅▅▅▆█▆▇▆▆▄▁▅▄▄▅▅▄▄▁▆▂▆▃▂▆▄▅▄▄▅▂▆▃▂▂▄

0,1
eval/loss,1.51403
eval/runtime,355.1252
eval/samples_per_second,2.816
eval/steps_per_second,2.816
total_flos,507128203161216.0
train/epoch,1.0
train/global_step,2000.0
train/grad_norm,0.59873
train/learning_rate,0.0
train/loss,0.1488


In [None]:
data = Dataset.from_pandas(train.to_pandas()).map(
    format_chat_template,
    num_proc= 6,
)

data[7000]["output"]


Map (num_proc=6):   0%|          | 0/93665 [00:00<?, ? examples/s]

'{"category": "Online Financial Fraud", "sub_category": "UPI Related Frauds"}'

In [16]:
trainer.save_model("output/smolm2-finetune-1")

In [None]:
messages = [{"role": "system", "content": instruction},
    {"role": "user", "content": data[7000]["crimeaditionalinfo"]}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])


{"category": "Online Financial Fraud", "sub_category": "UPI Related Frauds"}
{"category": "UPI Related Frauds", "sub_category": "UPI Related Frauds"}}
{"category": "UPI Related Frauds", "sub_category": "UPI Related Frauds"}}
{"category": "UPI Related Frauds", "sub_category": "UPI Related Frauds"}}
{"category": "UPI Related Frauds", "sub_category": "UPI Related Fraudes"}}
{"category": "UPI Related Fraudes", "sub_category": "UPI Related Fraudes"}}
{"category": "UPI Related Fra
