# 🚀 Supervised Fine-Tuning on Network Flow Dataset
This notebook converts each network flow record into a text description, tokenizes it, and fine-tunes a 4-bit-quantized Mistral-7B with LoRA adapters as a binary sequence classifier (Normal vs. Attack).

In [1]:
import os
from getpass import getpass

from huggingface_hub import login

# SECURITY: never store an access token as a literal in the notebook -- it ends
# up in version control and in every shared copy. Any token that has ever been
# committed in this cell must be treated as leaked and revoked at
# https://huggingface.co/settings/tokens.
#
# The token is read from the HF_TOKEN environment variable; when that is unset
# or empty, the user is prompted for it (input is not echoed).
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_token)

In [None]:
import inspect

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    MistralConfig,
    Trainer,
    TrainingArguments,
    set_seed,
)
from transformers.utils import is_flash_attn_2_available

# --- Load tokenizer ---
# `trust_remote_code` is intentionally left at its default (False): this
# notebook uses the Mistral classes that ship with transformers, so there is no
# reason to allow Python code from a Hub repository to run locally.
model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# --- Load dataset ---
# Training split of the network-flow dataset; each record carries a "Label" column.
dataset = load_dataset("Suvo696/InSDNN", split="train")

# --- Create text inputs ---
def convert_example(example):
    """Turn one raw flow record into a text description plus a binary label.

    Every column except ``"Label"`` contributes a ``"<column> is <value>"``
    phrase; the phrases are joined with single spaces in column order.

    Args:
        example: Mapping of column name to value. Must contain a string
            ``"Label"`` entry.

    Returns:
        dict with ``"text"`` (the joined description) and ``"label"``
        (0 when the label, ignoring case and surrounding whitespace, equals
        ``NORMAL``; 1 for anything else).
    """
    phrases = []
    for column, value in example.items():
        if column == "Label":
            continue
        phrases.append(f"{column} is {value}")

    is_normal = example["Label"].strip().upper() == "NORMAL"
    return {"text": " ".join(phrases), "label": 0 if is_normal else 1}

# Reuse the EOS token as the padding token so that the padded tokenizer call
# used later in this cell has a pad token to work with.
tokenizer.pad_token = tokenizer.eos_token

# Add the "text" and "label" columns to every record.
dataset = dataset.map(function=convert_example)

# Tokenize function
def tokenize(example):
    """Encode the ``"text"`` field of ``example`` with the notebook-level tokenizer.

    Inputs longer than 1152 tokens are truncated and shorter ones are padded up
    to 1152, so every encoding has the same length.

    Args:
        example: Mapping with a ``"text"`` entry.

    Returns:
        The tokenizer's output for that text.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 1152,
    }
    return tokenizer(example["text"], **encode_options)

# Tokenize dataset
# `batched=True` hands many rows to the tokenizer per call instead of one row
# at a time. Because `tokenize` pads/truncates every row to the same fixed
# length, the resulting encodings are the same as with row-by-row mapping.
tokenized_dataset = dataset.map(tokenize, batched=True)

# The Trainer is configured with label_names=["labels"], so rename the column.
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

# Expose only the model inputs and the target as torch tensors.
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


# --- Reproducibility ---
# The classification head is newly initialised when the checkpoint is loaded
# and the LoRA adapters are created afterwards, so seed the RNGs before the
# model is built to make that initialisation repeatable across kernel restarts.
set_seed(42)

# --- Quantization Config ---
# 4-bit NF4 weights with double quantization. The compute dtype is bfloat16 so
# that it agrees with `bf16=True` in the training arguments (it was float16,
# which mixes two half-precision formats in one training run).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Binary classifier; the pad token id must be known to the model config.
config = MistralConfig.from_pretrained(model_id)
config.pad_token_id = tokenizer.pad_token_id
config.num_labels = 2

# FlashAttention-2 has to be requested through `attn_implementation` when the
# model is loaded. Only request it when the runtime actually supports it;
# otherwise let transformers pick its default attention implementation.
attention_kwargs = {}
if is_flash_attn_2_available():
    attention_kwargs["attn_implementation"] = "flash_attention_2"
else:
    print("FlashAttention-2 is not available; using the default attention implementation.")

# --- Load quantized classification model ---
model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    config=config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    **attention_kwargs,
)
# Kept for safety: make sure the loaded model's own config carries the pad id.
model.config.pad_token_id = tokenizer.pad_token_id

# --- Prepare model for LoRA ---
# First make the quantized model trainable, then wrap it with the adapters.
model = prepare_model_for_kbit_training(model)

# Adapter hyper-parameters, collected in one place.
lora_settings = {
    "task_type": "SEQ_CLS",
    "target_modules": ["q_proj", "v_proj"],
    "r": 8,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "bias": "none",
}
lora_config = LoraConfig(**lora_settings)
model = get_peft_model(model, lora_config)

# --- Training Arguments ---
# Grouped by concern; all groups are merged into a single TrainingArguments.

# Optimisation schedule: 64 samples per device x 8 accumulation steps.
optimisation_options = dict(
    per_device_train_batch_size=64,
    gradient_accumulation_steps=8,
    num_train_epochs=3,
    learning_rate=1e-5,
)

# Numerics / memory: bf16 mixed precision with gradient checkpointing.
precision_options = dict(
    fp16=False,
    bf16=True,
    gradient_checkpointing=True,
)

# Logging and checkpointing: log every step, keep one per-epoch checkpoint.
bookkeeping_options = dict(
    logging_dir="./mistral_attack_classifier/logs",
    logging_steps=1,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none",
)

# Data loading.
dataloader_options = dict(
    dataloader_num_workers=8,
    dataloader_pin_memory=True,
    dataloader_persistent_workers=True,
)

training_args = TrainingArguments(
    output_dir="./mistral_attack_classifier",
    label_names=["labels"],
    do_eval=False,
    **optimisation_options,
    **precision_options,
    **bookkeeping_options,
    **dataloader_options,
)


# --- Trainer ---
# Newer transformers releases deprecate the `tokenizer=` argument of Trainer in
# favour of `processing_class=`. Pick whichever name the installed version
# accepts so the cell runs without a deprecation warning on new releases and
# still works on older ones.
trainer_init_params = inspect.signature(Trainer.__init__).parameters
tokenizer_kwarg = "processing_class" if "processing_class" in trainer_init_params else "tokenizer"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    # NOTE: these 100 rows come from the training data, so any score computed
    # on them is a sanity check only, not a held-out estimate.
    eval_dataset=tokenized_dataset.select(range(100)),
    **{tokenizer_kwarg: tokenizer},
)

# --- Train ---
trainer.train()

# Persist the trained model and its tokenizer side by side in one directory.
final_model_dir = "./mistral_attack_classifier/final_model"
trainer.save_model(final_model_dir)
tokenizer.save_pretrained(final_model_dir)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mistralai/Mistral-7B-v0.1 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss


In [None]:
# Load the held-out split and apply the same preprocessing as for training.
# Tokenization is batched: `tokenize` pads/truncates every row to one fixed
# length, so the encodings match row-by-row mapping while the tokenizer gets
# many rows per call.
test_dataset = (
    load_dataset("Suvo696/InSDNN", split="test")
    .map(convert_example)
    .map(tokenize, batched=True)
    .rename_column("label", "labels")
)
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Evaluate the fine-tuned model on the test split and show the metrics dict.
metrics = trainer.evaluate(test_dataset)
print(metrics)


Map:   0%|          | 0/68778 [00:00<?, ? examples/s]

Map:   0%|          | 0/68778 [00:00<?, ? examples/s]

KeyboardInterrupt: 

In [None]:
import numpy as np

# Run inference on the test split; the predicted class is the index of the
# largest score along axis 1.
predictions = trainer.predict(test_dataset)
preds = np.argmax(predictions.predictions, axis=1)

# Show first 10 examples side by side with their reference labels.
for idx in range(10):
    predicted, actual = preds[idx], predictions.label_ids[idx]
    print(f"Predicted: {predicted}, Actual: {actual}")


In [None]:
from sklearn.metrics import classification_report
# Label ids follow the preprocessing step: 0 = Normal, 1 = Attack.
class_names = ["Normal", "Attack"]
report = classification_report(predictions.label_ids, preds, target_names=class_names)
print(report)
