In [2]:
import os
import re
import math
import joblib
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score


In [3]:
# Default location of the combined capture log. Set the WAF_CAPTURES_CSV
# environment variable to use a copy stored elsewhere instead of editing this
# machine-specific absolute path.
csv_path = os.environ.get(
    "WAF_CAPTURES_CSV",
    "/home/matrix/Documents/vscode/PROJECT/Porject_updated_files/captures_combined.csv",
)

def load_data(path=None):
    """Read a captures CSV into a DataFrame whose cells are all strings.

    Args:
        path: CSV file to read. When omitted (None), the module-level
            ``csv_path`` is used, so existing ``load_data()`` calls behave
            as before.

    Returns:
        pandas.DataFrame: every column is read with ``dtype=str`` and
        ``keep_default_na=False``, so empty cells stay ``""`` and tokens such
        as ``"NA"`` are kept as text instead of becoming NaN.
    """
    source = csv_path if path is None else path
    # quoting=1 is csv.QUOTE_ALL
    return pd.read_csv(source, dtype=str, keep_default_na=False, quoting=1)

# Load the capture log, report how many rows came back, and preview the
# first two of them.
df = load_data()
print("Loaded rows:", df.shape[0])
df.iloc[:2]


Loaded rows: 9006


Unnamed: 0,timestamp,raw_text
0,2025-11-14T09:04:21.994070Z,---- CAPTURE START ----\r\ntimestamp: 2025-11-...
1,timestamp: 2025-11-14T10:41:50.954658Z,


In [4]:
# Regex signatures scanned by auto_label(): a match on any single entry labels
# the text 1. Every pattern starts with (?i), so matching ignores case.
SIGNATURES = [
    r"(?i)' *or *'1' *= *'1",  # quote, "or", then '1'='1 with optional spaces between the parts
    r"(?i)union\s+select",  # "union", whitespace, "select"
    r"(?i)drop\s+table",  # "drop", whitespace, "table"
    r"(?i)<script>",  # NOTE(review): only the bare tag; "<script src=...>" does not match
    r"(?i)<svg",  # any text containing "<svg"
    r"(?i)javascript:",  # "javascript:" scheme prefix
    r"(?i)\.\./",  # literal "../" (forward slash only)
    r"(?i)cat /etc/passwd",  # exactly one space between "cat" and the path
    r"(?i)base64",  # the bare word "base64" anywhere in the text
    r"(?i)sleep\(",  # "sleep" immediately followed by "("
]

def shannon_entropy(s):
    """Return the Shannon entropy of ``s`` in bits per character.

    Args:
        s: Text to measure.

    Returns:
        float: ``sum(p * log2(1 / p))`` over the distinct characters of
        ``s``, where ``p`` is a character's relative frequency. Empty (falsy)
        input gives 0.0. The result is never negative.
    """
    if not s:
        return 0.0
    total = len(s)
    counts = {}
    for ch in s:
        counts[ch] = counts.get(ch, 0) + 1
    # Every count is >= 1, so each probability is strictly positive and log2
    # is always defined. No epsilon is added: it would bias the value and push
    # single-symbol strings slightly below zero.
    return sum((c / total) * math.log2(total / c) for c in counts.values())

def auto_label(text):
    """Heuristically label ``text``: 1 when it looks suspicious, else 0.

    The label is 1 if any regex in SIGNATURES is found in ``text``, or if
    none is found but the text's Shannon entropy is above 4.5. Otherwise
    the label is 0.
    """
    signature_hit = any(re.search(pattern, text) for pattern in SIGNATURES)
    # Entropy is only computed when no signature matched (short-circuit `or`).
    if signature_hit or shannon_entropy(text) > 4.5:
        return 1
    return 0

# Heuristic label for every capture. "label" starts out equal to
# "auto_label" (hybrid mode) and can be overridden by manual review later.
df["auto_label"] = df["raw_text"].map(auto_label)
df["label"] = df["auto_label"]
df.iloc[:2]


Unnamed: 0,timestamp,raw_text,auto_label,label
0,2025-11-14T09:04:21.994070Z,---- CAPTURE START ----\r\ntimestamp: 2025-11-...,1,1
1,timestamp: 2025-11-14T10:41:50.954658Z,,0,0


In [5]:
# Row count after labelling.
print("Total rows:", df.shape[0])


Total rows: 9006


In [6]:
# Structural overview: row index, column labels, then (rows, columns).
for summary in (df.index, df.columns, df.shape):
    print(summary)



RangeIndex(start=0, stop=9006, step=1)
Index(['timestamp', 'raw_text', 'auto_label', 'label'], dtype='object')
(9006, 4)


In [7]:
# Sample rows for manual label review.
# - random_state makes the sample identical on every run.
# - min() keeps sample() from raising when the frame has fewer than 400 rows.
review = df.sample(n=min(400, len(df)), random_state=42)  # adjust sample size

# Write the file only when it is missing: re-running this cell must not wipe
# out labels that were already edited by hand. Delete the file to regenerate.
review_file = "review_for_labeling.csv"
if os.path.exists(review_file):
    print("Review file already exists, not overwritten: review_for_labeling.csv")
else:
    review.to_csv(review_file, index=False)
    print("Review file created: review_for_labeling.csv (Edit labels manually)")


Review file created: review_for_labeling.csv (Edit labels manually)


In [8]:
# Fold manually reviewed labels from the review file back into df["label"].
if os.path.exists("review_for_labeling.csv"):
    edited = pd.read_csv("review_for_labeling.csv", dtype=str).loc[:, ["raw_text", "label"]]

    # raw_text -> label exactly as typed in the review file.
    # NOTE(review): the key is raw_text, so rows sharing the same raw_text
    # share one entry and the last such row in the file wins.
    label_map = dict(zip(edited["raw_text"], edited["label"]))

    def override(row):
        """Return the reviewed label as int when it is "0" or "1"; otherwise keep the row's label."""
        reviewed = label_map.get(row["raw_text"])
        return int(reviewed) if reviewed in ("0", "1") else row["label"]

    df["label"] = df.apply(override, axis=1)

df.head(300)


Unnamed: 0,timestamp,raw_text,auto_label,label
0,2025-11-14T09:04:21.994070Z,---- CAPTURE START ----\r\ntimestamp: 2025-11-...,1,1
1,timestamp: 2025-11-14T10:41:50.954658Z,,0,0
2,client_ip: 127.0.0.1:38166,,0,0
3,method: POST,,0,0
4,url: http://192.168.1.10/products.php,,0,0
...,...,...,...,...
295,Pragma: (none),,0,0
296,Expires: (none),,0,0
297,If-Modified-Since: (none),,0,0
298,If-None-Match: (none),,0,0


In [18]:
def clean_text(s):
    """Normalise ``s`` for vectorising: single-spaced and lower-cased.

    Newlines are turned into spaces, every run of whitespace is collapsed to
    one space, and the result is lower-cased. Leading or trailing whitespace
    is collapsed to a single space, not stripped.
    """
    single_spaced = re.sub(r"\s+", " ", s.replace("\n", " "))
    return single_spaced.lower()

# Normalised copy of every capture; this is the text the models are fed.
df["clean"] = df["raw_text"].map(clean_text)



In [19]:
# Features are the cleaned text; targets are the labels cast to int.
X, y = df["clean"], df["label"].astype(int)

# Hold out 20% for evaluation, stratified on y, with a fixed seed.
split_kwargs = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)



In [20]:
# TF-IDF over character 3- to 5-grams, capped at 40000 features.
tfidf_options = dict(analyzer='char', ngram_range=(3,5), max_features=40000)
vectorizer = TfidfVectorizer(**tfidf_options)

# The vocabulary is fitted on the training split only; the test split is
# transformed with that fitted vectoriser.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [21]:
# Logistic regression with balanced class weights, fitted on the TF-IDF
# training matrix (fit() returns the estimator itself).
clf = LogisticRegression(max_iter=1200, class_weight="balanced").fit(X_train_vec, y_train)

pred = clf.predict(X_test_vec)

# NOTE(review): in the recorded run the test split holds a single sample of
# class 1, so the 1.00 scores say very little about how well attacks are caught.
accuracy = accuracy_score(y_test, pred)
report = classification_report(y_test, pred)
print("Accuracy:", accuracy)
print(report)


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1801
           1       1.00      1.00      1.00         1

    accuracy                           1.00      1802
   macro avg       1.00      1.00      1.00      1802
weighted avg       1.00      1.00      1.00      1802



In [22]:
# Fit the anomaly ("zero-day") detector on captures labelled 0 only.
# .loc selects rows and column in one step instead of chained indexing.
safe = df.loc[df["label"] == 0, "clean"]
safe_vec = vectorizer.transform(safe)

# random_state pins the forest's randomness so that re-running the notebook
# produces the same anomaly model (and the same saved artefact).
iso = IsolationForest(n_estimators=200, contamination=0.02, random_state=42)
iso.fit(safe_vec)

print("Zero-day model trained.")


Zero-day model trained.


In [23]:
# Persist the fitted vectoriser, classifier and anomaly model to waf_model/.
os.makedirs("waf_model", exist_ok=True)

artifacts = (
    (vectorizer, "waf_model/vectorizer.pkl"),
    (clf, "waf_model/best_model.pkl"),
    (iso, "waf_model/anomaly_model.pkl"),
)
for fitted, destination in artifacts:
    joblib.dump(fitted, destination)

print("Models saved in waf_model/")


Models saved in waf_model/


In [24]:
def predict_live(raw_text):
    """Score one raw capture with both the classifier and the anomaly model.

    The text is cleaned with clean_text() and vectorised with the fitted
    ``vectorizer`` before being passed to ``clf`` and ``iso``.

    Returns:
        dict with
        - "clf_label": class predicted by ``clf`` (int)
        - "clf_prob": ``clf`` probability taken from column 1 of predict_proba (float)
        - "anomaly_flag": 1 when ``iso.decision_function`` is below -0.2, else 0
        - "final_label": 1 when clf_label is 1 or anomaly_flag is 1, else 0
    """
    features = vectorizer.transform([clean_text(raw_text)])

    clf_label = clf.predict(features)[0]
    clf_prob = clf.predict_proba(features)[0, 1]

    # Scores below -0.2 are treated as anomalous.
    anomaly_flag = int(iso.decision_function(features)[0] < -0.2)

    # Either detector is enough to flag the capture.
    final = int(clf_label == 1 or anomaly_flag == 1)

    return {
        "clf_label": int(clf_label),
        "clf_prob": float(clf_prob),
        "anomaly_flag": anomaly_flag,
        "final_label": final,
    }

# Smoke-test the combined predictor on the first capture in the frame.
first_capture = df["raw_text"].iloc[0]
predict_live(first_capture)


{'clf_label': 1,
 'clf_prob': 0.9565721326742125,
 'anomaly_flag': 0,
 'final_label': 1}