<a href="https://colab.research.google.com/github/sairambokka/Collab-Notebooks/blob/main/LLM_Based_Firewall.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install colorama
!pip install scapy

Collecting colorama
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Installing collected packages: colorama
Successfully installed colorama-0.4.6
Collecting scapy
  Downloading scapy-2.6.1-py3-none-any.whl.metadata (5.6 kB)
Downloading scapy-2.6.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scapy
Successfully installed scapy-2.6.1


In [None]:
# Mount Google Drive at /content/drive (Colab only). timeout_ms=300000 is 5 minutes.
# NOTE(review): later cells open the UNSW-NB15 CSVs by bare filename rather than a
# /content/drive path — confirm the working directory actually contains them.
from google.colab import drive
drive.mount('/content/drive', force_remount=True, timeout_ms=300000)

Mounted at /content/drive


In [None]:
# packet sniffer
import scapy.all as scapy
import argparse
from scapy.layers import http
import colorama
from colorama import Fore
import csv

# Machine Learning
import math, time, random, datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection
from sklearn import metrics
plt.style.use('dark_background')
import warnings
warnings.filterwarnings('ignore')
import missingno
import pickle

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC

In [None]:
# Template for one captured-packet row. Key order is significant: extract_data()
# returns list(values), and update_csv() appends those lists to the CSV as rows.
# Every feature starts at 0 except 'attack_cat', which starts as "Normal".
default_values = {
    **{
        feature: 0
        for feature in (
            'id', 'dur', 'proto', 'service', 'state', 'spkts',
            'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl', 'dttl',
            'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt',
            'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
            'tcprtt', 'synack', 'ackdat', 'smean', 'dmean',
            'trans_depth', 'response_body_len', 'ct_srv_src',
            'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm',
            'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'is_ftp_login',
            'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm', 'ct_srv_dst',
            'is_sm_ips_ports',
        )
    },
    'attack_cat': "Normal",
    'label': 0,
}

# Module-level buffer of extracted rows: filled while sniffing, read by update_csv().
data = []

def extract_data(packet):
    """Build one feature row for a captured packet.

    Starts from a copy of ``default_values`` and overwrites the few fields that
    can be read from a single packet. Each protocol layer is checked
    independently: the previous ``if/elif`` chain stopped at the first match,
    so the TCP/UDP/ICMP branches never ran for IP packets and HTTP requests
    never received their IP-level fields.

    Args:
        packet: a scapy packet (anything exposing ``haslayer`` and layer indexing).

    Returns:
        list: the row values, in the key order of ``default_values``.
    """
    row = default_values.copy()

    def update_data(key, value):
        # Only keys already present in the template are written, so a typo
        # cannot silently add a column and shift the CSV layout.
        if key in row:
            row[key] = value

    # IP layer: size of the IP datagram and the numeric protocol field.
    if packet.haslayer(scapy.IP):
        ip_layer = packet[scapy.IP]
        update_data('sbytes', len(ip_layer))
        update_data('proto', ip_layer.proto)

    # TCP / UDP / ICMP: a single captured packet counts as one source packet.
    if any(packet.haslayer(layer) for layer in (scapy.TCP, scapy.UDP, scapy.ICMP)):
        update_data('spkts', 1)
        update_data('dpkts', 0)

    # HTTP requests are labelled by application protocol; this runs last so it
    # overrides the numeric IP protocol number written above.
    if packet.haslayer(http.HTTPRequest):
        update_data('proto', 'HTTP')

    return list(row.values())

def update_csv(csv_file_path='UNSW_NB15_training-set.csv', rows=None):
    """Append captured rows to a CSV file, assigning them fresh sequential ids.

    The file is read in full, the largest id in its first column is found, each
    new row's first element is overwritten with the next id, and the file is
    rewritten with the new rows at the end.

    Args:
        csv_file_path: CSV to extend. Its first row is treated as a header and
            its first column as an integer id.
        rows: list of row lists to append. Defaults to the module-level
            ``data`` buffer. The rows are modified in place (element 0 is set).

    Raises:
        FileNotFoundError: if ``csv_file_path`` does not exist.
        ValueError: if a non-blank data row has a non-integer first column.
    """
    new_rows = data if rows is None else rows

    # newline='' is what the csv module expects; it keeps line endings and
    # quoted embedded newlines from being altered on read.
    with open(csv_file_path, mode='r', newline='') as file:
        existing = list(csv.reader(file))

    # Skip the header and any blank rows (csv.reader yields [] for blank lines,
    # which previously raised IndexError on row[0]).
    max_id = max((int(row[0]) for row in existing[1:] if row), default=0)

    for offset, row in enumerate(new_rows, start=1):
        row[0] = max_id + offset

    existing.extend(new_rows)

    with open(csv_file_path, mode='w', newline='') as file:
        csv.writer(file).writerows(existing)

def get_interface():
    """Read the network interface name from the command line.

    Returns:
        str | None: the value passed via ``-i`` / ``--interface``, or ``None``
        when the option is absent.
    """
    parser = argparse.ArgumentParser()
    # Accept and ignore `-f <value>`. Presumably this is the connection-file
    # argument that notebook kernels are launched with — confirm.
    parser.add_argument('-f')
    parser.add_argument("-i", "--interface", dest="interface", help="Specify the network interface")
    # parse_known_args tolerates any other launcher arguments instead of
    # terminating the kernel with SystemExit the way parse_args does.
    arguments, _unknown = parser.parse_known_args()
    return arguments.interface

def sniff(iface, max_packets=10):
    """Capture packets on an interface and buffer their extracted features.

    Each captured packet is printed with ``packet.show()``, converted to a
    feature row by ``extract_data`` and appended to the module-level ``data``
    list.

    Args:
        iface: interface name to sniff on; ``None`` leaves the choice to scapy.
        max_packets: number of packets to capture before returning (default 10,
            the previously hard-coded limit). scapy treats 0 as "no limit".
    """
    packet_count = 0

    def process_packet(packet):
        nonlocal packet_count
        packet_count += 1
        print(f"[+] Packet {packet_count} captured")
        packet.show()
        data.append(extract_data(packet))

    # store=False: packets are handled by the callback only, not kept in memory.
    scapy.sniff(iface=iface, store=False, prn=process_packet, count=max_packets)

# Run the capture pipeline: read the interface from the command line (None when not given),
# sniff packets into the module-level `data` buffer, then append those rows to the training CSV.
# NOTE(review): this writes synthetic, mostly-default rows into the training set file itself.
iface = get_interface()
sniff(iface)
update_csv()

[+] Packet 1 captured
###[ Ethernet ]###
  dst       = 02:42:ac:1c:00:0c
  src       = 02:42:e4:40:48:b1
  type      = IPv4
###[ IP ]###
     version   = 4
     ihl       = 5
     tos       = 0x0
     len       = 77
     id        = 21976
     flags     = DF
     frag      = 0
     ttl       = 64
     proto     = 6
     chksum    = 0x8c8d
     src       = 172.28.0.1
     dst       = 172.28.0.12
     \options   \
###[ TCP ]###
        sport     = 41044
        dport     = 8080
        seq       = 4003877376
        ack       = 264557279
        dataofs   = 8
        reserved  = 0
        flags     = PA
        window    = 249
        chksum    = 0x5885
        urgptr    = 0
        options   = [('NOP', None), ('NOP', None), ('Timestamp', (1418442680, 763892655))]
###[ HTTP 1 ]###
###[ Raw ]###
           load      = b'\xc1\x93\xd4\xd4\x1b]\xf6\x0e\xb8K\x00"\xb3\xfce\xc5\xe5\xa5\x10\x80[\xd4\x93\xd5\x1b'

[+] Packet 2 captured
###[ Ethernet ]###
  dst       = 02:42:e4:40:48:b1
  src     

In [None]:
# Load both UNSW-NB15 splits. The training file is the same one update_csv()
# appends captured rows to, so it may contain sniffed rows as well.
train_data = pd.read_csv('UNSW_NB15_training-set.csv')
test_data = pd.read_csv('UNSW_NB15_testing-set.csv')

# Combine the data to avoid preprocessing twice
combined_data = pd.concat([train_data, test_data]).reset_index(drop=True)

# Column groups by dtype, via the public select_dtypes API rather than the
# private DataFrame._get_numeric_data(). 'bool' is included so the numeric
# group matches what the private helper returned.
categorical_cols = combined_data.select_dtypes('object').columns
numeric_cols = combined_data.select_dtypes(include=['number', 'bool']).columns
print(numeric_cols)

Index(['id', 'dur', 'spkts', 'dpkts', 'sbytes', 'dbytes', 'rate', 'sttl',
       'dttl', 'sload', 'dload', 'sloss', 'dloss', 'sinpkt', 'dinpkt', 'sjit',
       'djit', 'swin', 'stcpb', 'dtcpb', 'dwin', 'tcprtt', 'synack', 'ackdat',
       'smean', 'dmean', 'trans_depth', 'response_body_len', 'ct_srv_src',
       'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm',
       'ct_dst_src_ltm', 'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd',
       'ct_src_ltm', 'ct_srv_dst', 'is_sm_ips_ports', 'label'],
      dtype='object')


In [None]:
# Handle type of service that is '-': replace the placeholder with the string 'None'
combined_data['service'].unique()  # NOTE(review): result is discarded (not the cell's last expression)
combined_data['service'] = np.where(combined_data['service'] == '-', 'None', combined_data['service'])
print(combined_data['service'].unique())

# Automate the process of removing dump values
def remove_dump_values(data, columns):
    """Replace the '-' placeholder with the string 'None' in the given columns.

    Numeric columns are left untouched. They cannot contain '-', and running
    them through ``np.where(..., 'None', column)`` promoted every number to a
    string, which downstream code then had to cast back.

    Args:
        data: DataFrame to clean. It is modified in place.
        columns: iterable of column labels to process.

    Returns:
        The same ``data`` object, for call-chaining convenience.
    """
    for column in columns:
        if pd.api.types.is_numeric_dtype(data[column]):
            continue
        data[column] = np.where(data[column] == '-', 'None', data[column])
    return data

# Apply the '-' -> 'None' replacement across every column of the combined frame.
# remove_dump_values assigns into the frame it is given and returns that same object,
# so processed_data and combined_data refer to the same DataFrame after this call.
columns = combined_data.columns
processed_data = remove_dump_values(combined_data, columns)

['None' 'ftp' 'smtp' 'snmp' 'http' 'ftp-data' 'dns' 'ssh' 'radius' 'pop3'
 'dhcp' 'ssl' 'irc' '0']


In [None]:
# Remove columns that must not be model features: the row id and the attack
# category (only the binary 'label' is kept as the target).
# errors='ignore' keeps the cell re-runnable; the previous version raised
# KeyError on a second execution because the columns were already gone.
processed_data = processed_data.drop(columns=['id', 'attack_cat'], errors='ignore')
categorical_cols = categorical_cols.drop(['attack_cat'], errors='ignore')

# Perform one-hot encoding of the remaining categorical columns. The dummy
# columns are appended after the untouched columns, so 'label' is no longer
# the last column of the encoded frame.
processed_data_encoded = pd.get_dummies(processed_data, columns=categorical_cols)

In [None]:
# Normalization
# Filtering (instead of list.remove) keeps this cell re-runnable: remove()
# raises ValueError once 'label' / 'id' are already gone from the list.
numeric_cols = [col for col in numeric_cols if col not in ('label', 'id')]

numeric_features = processed_data_encoded[numeric_cols].astype('float')

# Per-column statistics with an explicit axis. np.min / np.std on a DataFrame
# forward axis=None to pandas, whose meaning (per column vs. whole-frame scalar)
# differs between pandas versions.
col_min = numeric_features.min(axis=0)
col_std = numeric_features.std(axis=0, ddof=0)  # ddof=0: population std, as np.std computes
col_std = col_std.replace(0, 1.0)  # constant columns: avoid dividing by zero

# NOTE(review): (x - min) / std is neither min-max scaling nor z-scoring; the
# formula is kept as written — confirm this is the intended normalization.
processed_data_encoded[numeric_cols] = (numeric_features - col_min) / col_std

processed_data_encoded.head()

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,state_CLO,state_CON,state_ECO,state_FIN,state_INT,state_PAR,state_REQ,state_RST,state_URN,state_no
0,0.020334,0.044136,0.03572,0.001485,0.001177,0.000462,2.458723,2.252553,7.6e-05,0.003522,...,False,False,False,True,False,False,False,False,False,False
1,0.108785,0.102983,0.339335,0.004224,0.287381,0.000489,0.604924,2.234816,4.5e-05,0.208749,...,False,False,False,True,False,False,False,False,False,False
2,0.271691,0.058848,0.142878,0.002095,0.090194,8.8e-05,0.604924,2.234816,8e-06,0.025257,...,False,False,False,True,False,False,False,False,False,False
3,0.281485,0.088271,0.107159,0.003614,0.005267,8.5e-05,0.604924,2.234816,1.5e-05,0.001392,...,False,False,False,True,False,False,False,False,False,False
4,0.075233,0.07356,0.053579,0.003073,0.001833,0.000208,2.478237,2.234816,4.6e-05,0.001653,...,False,False,False,True,False,False,False,False,False,False


### LLM Setup and Fine-Tuning Using Unsloth

In [None]:
%%capture
# Install Unsloth from its GitHub main branch plus the training stack; %%capture hides the
# pip output. Nothing here is version-pinned.
# NOTE(review): the loader output below reports an xformers / PyTorch version mismatch —
# pinning these packages would presumably avoid it; confirm.
import torch
major_version, minor_version = torch.cuda.get_device_capability()
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
# GPUs reporting CUDA compute capability major version >= 8 additionally get flash-attn
# (with packaging, ninja and einops); other GPUs skip it.
if major_version >= 8:
    !pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes
else:
    !pip install --no-deps xformers trl peft accelerate bitsandbytes
pass

In [None]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Maximum sequence length; passed to the loader here and to the trainer later
dtype = None # NOTE(review): presumably lets Unsloth choose the dtype automatically — confirm
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Names of other pre-quantized 4-bit checkpoints under the unsloth/ namespace.
# NOTE(review): this list is not referenced by any visible cell, and the model
# actually loaded below is not one of its entries.
fourbit_models = [
    "unsloth/mistral-7b-bnb-4bit",
    "unsloth/mistral-7b-instruct-v0.2-bnb-4bit",
    "unsloth/llama-2-7b-bnb-4bit",
    "unsloth/gemma-7b-bnb-4bit",
    "unsloth/gemma-7b-it-bnb-4bit",
    "unsloth/gemma-2b-bnb-4bit",
    "unsloth/gemma-2b-it-bnb-4bit",
    "unsloth/llama-3-8b-bnb-4bit",
]

# Load the 4-bit gpt-oss-20b checkpoint together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gpt-oss-20b-unsloth-bnb-4bit", # change this name to load a different base model
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.7.1+cu126 with CUDA 1208 (you have 2.6.0+cu124)
    Python  3.9.23 (you have 3.11.13)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Unsloth: Your Flash Attention 2 installation seems to be broken?
A possible explanation is you have a new CUDA version which isn't
yet compatible with FA2? Please file a ticket to Unsloth or FA2.
We shall now use Xformers instead, which does not have any performance hits!
We found this negligible impact by benchmarking on 1x A100.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.4: Fast Gpt_Oss patching. Transformers: 4.55.0.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.37G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.16G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/165 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/27.9M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]



---



In [None]:
# Wrap the loaded model with PEFT (LoRA) adapters; `model` is rebound to the wrapped model.
# The training log further down reports roughly 8M trainable parameters out of ~20.9B.
model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    # Projection layers that receive adapters (attention q/k/v/o and MLP gate/up/down names).
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407, # same seed value the TrainingArguments use
    use_rslora = False,
    loftq_config = None,
)

Unsloth: Making `model.base_model.model.model` require gradients


In [None]:
import pandas as pd

dataset_finetune = pd.DataFrame()

# Set a constant instruction
instruction_text = "this is the input of a single packet in the network classify it as safe or not where 0 is safe and 1 is not"

# Using a list filled with the constant instruction repeated for each row in the original DataFrame
dataset_finetune['instruction'] = [instruction_text] * len(processed_data_encoded)

# Feature columns are selected by name, never by position. After one-hot
# encoding, 'label' sits in the middle of the frame (the dummy columns are
# appended after it), so the old `columns[:-1]` slice leaked the label into the
# prompt text and dropped the last one-hot column instead.
feature_columns = [col for col in processed_data_encoded.columns if col != 'label']

# Creating 'input' column by concatenating column names and their values for each row
dataset_finetune['input'] = processed_data_encoded.apply(
    lambda row: ', '.join(f"column {col}: {row[col]}" for col in feature_columns),
    axis=1,
)

# The target is taken from the 'label' column by name
dataset_finetune['output'] = processed_data_encoded['label']

# You now have the new dataset as required
print(dataset_finetune.head())  # Display the first few rows to check

# Persist the prompt dataset; a later cell reloads it from this CSV
dataset_finetune.to_csv("dataset_finetune.csv", index=False)

                                         instruction  \
0  this is the input of a single packet in the ne...   
1  this is the input of a single packet in the ne...   
2  this is the input of a single packet in the ne...   
3  this is the input of a single packet in the ne...   
4  this is the input of a single packet in the ne...   

                                               input output  
0  column dur: 0.02033382661946325, column spkts:...      0  
1  column dur: 0.1087850852635243, column spkts: ...      0  
2  column dur: 0.2716905420489535, column spkts: ...      0  
3  column dur: 0.281484852104969, column spkts: 0...      0  
4  column dur: 0.07523271464318014, column spkts:...      0  


In [None]:
# Alpaca-style prompt template with three positional slots, filled in order with
# the instruction, the input and the response. Used for the training text and,
# with an empty response slot, for inference.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

EOS_TOKEN = tokenizer.eos_token # appended to every training example; do not forget this part!
def formatting_prompts_func(examples):
    """Render a batch of examples into full training prompts.

    Args:
        examples: batch mapping with parallel "instruction", "input" and
            "output" sequences.

    Returns:
        dict: ``{"text": [...]}`` with one formatted prompt per example, each
        ending in ``EOS_TOKEN``.
    """
    columns = zip(examples["instruction"], examples["input"], examples["output"])
    # Without the trailing EOS token, generation goes on forever.
    rendered = [
        alpaca_prompt.format(task, context, answer) + EOS_TOKEN
        for task, context, answer in columns
    ]
    return {"text": rendered}

from datasets import load_dataset

# Reload the prompt dataset written by the previous cell
dataset = load_dataset("csv", data_files="dataset_finetune.csv", split='train')

# Keep 20% of the rows for training (test_size=0.8 sets aside the other 80%).
# The seed makes the split reproducible across kernel restarts; 3407 is the
# seed already used for the LoRA adapters and the trainer.
dataset_train = dataset.train_test_split(test_size=0.8, seed=3407)["train"]

# Add the rendered "text" column the trainer reads
dataset_train = dataset_train.map(formatting_prompts_func, batched = True,)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/51536 [00:00<?, ? examples/s]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments

# Supervised fine-tuning on the rendered "text" column of dataset_train.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # effective batch size: 2 x 4 = 8 on one GPU
        warmup_steps = 5,
        max_steps = 16, # increase this to make the model learn "better"
        # NOTE: max_steps takes precedence — the training log reports
        # "Num Epochs = 1 | Total steps = 16", so num_train_epochs has no effect here.
        num_train_epochs=10,
        learning_rate = 2e-4,
        # Use bf16 when the GPU supports it, otherwise fall back to fp16.
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # NOTE(review): no report_to is set; the run below stops for an interactive
        # wandb API-key prompt — consider setting it explicitly.
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/51536 [00:00<?, ? examples/s]

In [None]:
#@title Show current memory stats
# Report the device name, its total memory and the peak memory reserved so far,
# both converted from bytes to GiB and rounded to 3 decimals.
gpu_stats = torch.cuda.get_device_properties(0)
max_memory = round(gpu_stats.total_memory / 1024 ** 3, 3)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 ** 3, 3)
print(
    f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.",
    f"{start_gpu_memory} GB of memory reserved.",
    sep="\n",
)

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
19.328 GB of memory reserved.


In [None]:
# Run the fine-tuning loop; the returned stats object is kept in trainer_stats.
# The per-step training loss is logged below (logging_steps = 1 in the trainer arguments).
# NOTE(review): in the recorded run this cell paused for an interactive wandb API-key
# prompt, which prevents an unattended "Run all".
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 51,536 | Num Epochs = 1 | Total steps = 16
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 7,962,624 of 20,922,719,808 (0.04% trained)


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msairambokka[0m ([33msairambokka-umbc[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,4.2307
2,4.2073
3,4.0366
4,3.3632
5,2.4137
6,1.938
7,1.8787
8,1.7333
9,1.6466
10,1.7642


In [None]:
# Switch the model to Unsloth's inference mode
FastLanguageModel.for_inference(model)
from random import choice

# Pick one random example. Only a row position is drawn and that single row is
# converted to a dict; the previous version built a list of dicts for the
# entire frame just to select one element.
# NOTE(review): rows are sampled from the full dataset, so the example may be
# one the model was trained on.
random_row = dataset_finetune.iloc[choice(range(len(dataset_finetune)))].to_dict()

inputs = tokenizer(
[
    alpaca_prompt.format(
        random_row['instruction'],  # Use the instruction from the random row
        random_row['input'],  # Use the input from the random row
        "",  # Output is left blank for generation
    )
], return_tensors = "pt").to("cuda")

# Stream generated tokens to the cell output as they are produced
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
this is the input of a single packet in the network classify it as safe or not where 0 is safe and 1 is not

### Input:
column dur: 0.00015717630513900453, column spkts: 0.014711907051050714, column dpkts: 0.01785975255520214, column sbytes: 0.0007481145622005921, column dbytes: 0.0011081000211303341, column rate: 0.01992543306713898, column sttl: 0.3024619984825595, column dttl: 0.25718120748342743, column sload: 0.0029816785479448078, column dload: 0.28607095775264235, column sloss: 0.0, column dloss: 0.0, column sinpkt: 1.44466753783607e-07, column dinpkt: 2.742166397636572e-06, column sjit: 0.0, column djit: 0.0, column swin: 0.0, column stcpb: 0.0, column dtcpb: 0.0, column dwin: 0.0, column tcprtt: 0.0, column synack: 0.0, column ackdat: 0.0, column smean: 0.31568951414512003, column dmean: 0.31885154091148