In [1]:
# Cell 1. Импорты и константы

import pandas as pd
import numpy as np
from datetime import timedelta
# Feature columns grouped by the aspect of a flow they describe.
FEATURE_GROUPS = {
    'network': [
        'Destination IP', 'Destination Port',
        'FIN Flag Count', 'SYN Flag Count',
        'RST Flag Count', 'PSH Flag Count', 'ACK Flag Count',
    ],
    'temporal': ['Flow Duration'],
    'traffic': [
        'Total Fwd Packets', 'Total Backward Packets',
        'Total Length of Fwd Packets', 'Total Length of Bwd Packets',
        'Active Mean', 'Idle Mean',
    ],
}

# Numeric columns that get discretized: the traffic columns followed by the
# temporal one. The order matters because it fixes the key order of
# BINNING_SPEC and the leading part of SELECTED_COLS.
_BINNED_COLS = FEATURE_GROUPS['traffic'] + FEATURE_GROUPS['temporal']

# column -> (binning method, number of bins); every binned column uses
# 5 equal-width ('uniform') bins.
BINNING_SPEC = {col: ('uniform', 5) for col in _BINNED_COLS}

# All columns that take part in feature extraction: binned columns first,
# then the network columns, which are used with their raw values.
SELECTED_COLS = _BINNED_COLS + FEATURE_GROUPS['network']

# Sliding-window parameters: 30-minute windows advanced in 15-minute steps.
window_size = timedelta(minutes=30)
slide_size = timedelta(minutes=15)


In [2]:
# Cell 2. Функции дискретизации и извлечения признаков

def discretize_columns(df_in, spec, cols):
    """
    Return a copy of `df_in` restricted to 'Timestamp' + `cols`, with every
    column named in `spec` replaced by string bin labels.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Must contain a 'Timestamp' column and every column listed in `cols`.
    spec : dict
        Maps column name -> (method, bins). `method` 'quantile' uses
        `pd.qcut`; any other value uses equal-width `pd.cut`. Entries whose
        column is not in `cols` are ignored.
    cols : sequence of str
        Columns to keep (in addition to 'Timestamp').

    Returns
    -------
    pandas.DataFrame
        New frame; binned columns hold strings "bin0", "bin1", ... and the
        string 'nan' where the source value was missing. Columns that are not
        in `spec` are copied unchanged. `df_in` is not modified.
    """
    cols = list(cols)  # accept any sequence, not only a list
    df2 = df_in[['Timestamp'] + cols].copy()
    for col, (method, bins) in spec.items():
        if col not in cols:
            continue
        arr = df2[col].astype(float)
        if method == 'quantile':
            # Ask for integer bin codes instead of passing a fixed label list:
            # with duplicates='drop' pandas may produce fewer than `bins`
            # intervals, and a label list of the original length then raises
            # "Bin labels must be one fewer than the number of bin edges".
            codes = pd.qcut(arr, q=bins, labels=False, duplicates='drop')
            df2[col] = codes.map(
                lambda code: 'nan' if pd.isna(code) else f"bin{int(code)}"
            )
        else:
            labels = [f"bin{i}" for i in range(bins)]
            df2[col] = pd.cut(arr, bins=bins, labels=labels).astype(str)
    return df2

def window_features(raw_win, disc_win):
    """
    Collect the set of "Column=value" strings seen in one window.

    Rows of `raw_win` and `disc_win` are paired positionally (iteration stops
    at the shorter frame). For every column in SELECTED_COLS the value is
    taken from the discretized row when the column is in BINNING_SPEC and from
    the raw row otherwise.

    Returns a set of strings; empty when either frame has no rows.
    """
    collected = set()
    row_pairs = zip(raw_win.iterrows(), disc_win.iterrows())
    for (_, raw_row), (_, disc_row) in row_pairs:
        collected.update(
            f"{col}={disc_row[col]}" if col in BINNING_SPEC else f"{col}={raw_row[col]}"
            for col in SELECTED_COLS
        )
    return collected


In [3]:
# Cell 3. Load the data and discretize it

# NOTE(review): both paths are absolute and specific to one machine; the
# notebook will not run elsewhere without editing them.
LABELED_CSV_PATH = r'C:\Users\Гребенников Матвей\Desktop\Диплом\Диплом\Code\diplom-project\diplom\test\result\Date\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX_sampled_v3.csv'
UNLABELED_CSV_PATH = r'C:\Users\Гребенников Матвей\Desktop\Диплом\Диплом\Code\diplom-project\diplom\test\result\Date\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX_sampled_v4.csv'

# Labelled sample, used for evaluation.
df_labeled = pd.read_csv(LABELED_CSV_PATH, parse_dates=['Timestamp'])

# Unlabelled sample, used for detection.
df_unlabeled = pd.read_csv(UNLABELED_CSV_PATH, parse_dates=['Timestamp'])

# Discretize both frames with the same binning specification.
df_disc_labeled = discretize_columns(
    df_in=df_labeled, spec=BINNING_SPEC, cols=SELECTED_COLS
)
df_disc_unlabeled = discretize_columns(
    df_in=df_unlabeled, spec=BINNING_SPEC, cols=SELECTED_COLS
)


In [4]:
# ── Cell X: feature frequencies in DDoS vs BENIGN rows ───────────────────────
from collections import Counter

# Number of rows of each class.
total_ddos = df_labeled['Label'].eq('DDoS').sum()
total_benign = df_labeled['Label'].eq('BENIGN').sum()

# One Counter per class; rows carrying any other label are not counted.
cnt_ddos = Counter()
cnt_benign = Counter()
counter_for_label = {'DDoS': cnt_ddos, 'BENIGN': cnt_benign}

for idx, disc_row in df_disc_labeled.iterrows():
    raw_label = df_labeled.at[idx, 'Label']
    # Full feature set of the current row as "Feature=value" strings. All
    # values come from the discretized frame: bin labels for binned columns,
    # original values for the rest.
    feats = {f"{col}={disc_row[col]}" for col in SELECTED_COLS}
    class_counter = counter_for_label.get(raw_label)
    if class_counter is not None:
        class_counter.update(feats)

print(f"[Info] Всего DDoS строк: {total_ddos}, BENIGN строк: {total_benign}")
print(f"[Info] Уникальных фич в DDoS: {len(cnt_ddos)}, в BENIGN: {len(cnt_benign)}")


[Info] Всего DDoS строк: 30000, BENIGN строк: 70000
[Info] Уникальных фич в DDoS: 28, в BENIGN: 21893


In [5]:
# Cell 4. Build sliding windows (for the labelled and unlabelled frames)

windows = []  # list of (raw_labeled, disc_unlabeled) tuples
cur = df_labeled['Timestamp'].min()
end = df_labeled['Timestamp'].max()

ts_labeled = df_labeled['Timestamp']
ts_unlabeled = df_disc_unlabeled['Timestamp']

# Window bounds are derived from the labelled frame only and are applied to
# both frames. Each window is the half-open interval [cur, cur + window_size).
# NOTE(review): only windows that fit entirely before `end` are produced, so
# the tail of the data (and all of it, if the capture is shorter than
# window_size) is not covered by any window — confirm this is intended.
while cur + window_size <= end:
    win_end = cur + window_size
    in_window_l = (ts_labeled >= cur) & (ts_labeled < win_end)
    in_window_u = (ts_unlabeled >= cur) & (ts_unlabeled < win_end)
    raw_l = df_labeled.loc[in_window_l]
    disc_u = df_disc_unlabeled.loc[in_window_u]
    windows.append((raw_l, disc_u))
    cur = cur + slide_size


In [6]:
# ─── Cell 5b. Load CAR rules and Extended Top-K templates + detect_attack ─────

import pandas as pd

# 1) Load class-association rules (CAR). Each rule becomes a pair
#    (antecedent, consequent): the antecedent is the set of comma-separated
#    items found between the braces of the 'Antecedent' column, the consequent
#    is the 'Consequent' cell kept exactly as read from the CSV.
#    NOTE(review): the detector matches on the consequent string, so its exact
#    spelling in the CSV matters — confirm the stored format.
car_df = pd.read_csv('association_rules.csv')
rules = [
    (
        {item.strip() for item in r['Antecedent'].strip('{}').split(',')},
        r['Consequent'],
    )
    for _, r in car_df.iterrows()
]
print(f"[Load] Loaded {len(rules)} CAR-rules")

# 2) Load the extended Top-K templates.
tmpl_df = pd.read_csv('attack_templates_topk_extended.csv')
attack_templates = {}   # { attack: { group: set(feat_str) } }
for _, row in tmpl_df.iterrows():
    att = row['Attack']
    grp = row['Group']
    # row['Values'] looks like "Flow Duration=bin0|bin3" or
    # "Destination IP=192.168.10.50": one column name, then '|'-separated
    # alternatives that are expanded into one "column=value" string each.
    colname, bins = row['Values'].split('=', 1)
    group_feats = attack_templates.setdefault(att, {}).setdefault(grp, set())
    group_feats.update(f"{colname}={b}" for b in bins.split('|'))

# 3) Функция детектора по строковым CAR-правилам с учётом всех трёх групп
def detect_attack(feats, min_rules=300, attack='DDoS'):
    """
    Classify one record as `attack` or 'BENIGN' from its feature strings.

    Parameters
    ----------
    feats : set of str
        "Feature=value" strings describing a single row.
    min_rules : int, default 300
        Minimum number of matching rules required to report the attack.
    attack : str, default 'DDoS'
        Attack name; selects the template in `attack_templates` and the rule
        consequents that count as a match.

    The record is reported as `attack` only if both conditions hold:
      1) for every group in FEATURE_GROUPS, at least one feature of the
         attack's template for that group is present in `feats`;
      2) at least `min_rules` rules (A, B) in `rules` have a consequent B
         naming the attack and an antecedent A that is a subset of `feats`.

    A consequent names the attack when it equals `attack` ignoring case,
    surrounding whitespace and an optional leading "Label=" — so 'DDOS',
    'DDoS' and 'Label=DDoS' are all accepted for attack='DDoS'.

    Returns
    -------
    str
        `attack` or 'BENIGN'.
    """
    # 1) Group filter: every feature group must overlap the attack template.
    tmpl = attack_templates.get(attack, {})
    for grp in FEATURE_GROUPS:
        if not (tmpl.get(grp, set()) & feats):
            return 'BENIGN'

    # A non-positive threshold is met without any rule firing.
    if min_rules <= 0:
        return attack

    # 2) Count the rules that fire, stopping as soon as the threshold is hit.
    target = str(attack).strip().upper()
    prefix = 'LABEL='
    consequent_matches = {}  # per-call cache: raw consequent -> names the attack?
    fired = 0
    for A, B in rules:
        is_target = consequent_matches.get(B)
        if is_target is None:
            label = str(B).strip()
            if label.upper().startswith(prefix):
                label = label[len(prefix):]
            is_target = label.strip().upper() == target
            consequent_matches[B] = is_target
        if is_target and A.issubset(feats):
            fired += 1
            if fired >= min_rules:
                return attack

    return 'BENIGN'


[Load] Loaded 2862 CAR-rules


In [7]:
# ─── Cell 6. Row-level Evaluation (без окон) и сохранение результатов ─────────

import pandas as pd

# 6.1 Собираем строковые фичи для одной записи
def row_features(raw, disc):
    """
    Build the set of "Column=value" strings for a single record.

    For each column in SELECTED_COLS the value is read from `disc` (the
    discretized row) when the column is in BINNING_SPEC, otherwise from `raw`
    (the original row). Both arguments only need to support `obj[col]`.
    """
    return {
        f"{col}={disc[col]}" if col in BINNING_SPEC else f"{col}={raw[col]}"
        for col in SELECTED_COLS
    }

# Assumes that:
#  - df_labeled and df_disc_labeled are loaded and discretized
#  - the functions and structures from Cell 5b are already defined

predictions = []
# Confusion counts keyed by outcome name. 'DDoS' is the positive class; a
# prediction is "true" when it equals the row's label exactly, so a row whose
# label is neither 'DDoS' nor 'BENIGN' ends up as FP or FN.
confusion = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}

for idx, raw in df_labeled.iterrows():
    disc = df_disc_labeled.loc[idx]
    feats = row_features(raw, disc)
    pred = detect_attack(feats, min_rules=300)
    true = raw['Label']

    # First letter: was the prediction correct; second: was it positive.
    outcome = ('T' if pred == true else 'F') + ('P' if pred == 'DDoS' else 'N')
    confusion[outcome] += 1

    # Keep the whole original row and append the prediction to it.
    rec = raw.to_dict()
    rec['Predicted'] = pred
    predictions.append(rec)

TP, FP, TN, FN = (confusion[key] for key in ('TP', 'FP', 'TN', 'FN'))

# Precision / recall fall back to 0 when their denominator is zero.
predicted_positive = TP + FP
actual_positive = TP + FN
precision = TP / predicted_positive if predicted_positive else 0
recall = TP / actual_positive if actual_positive else 0
print(f"Row-level Precision: {precision:.2f}, Recall: {recall:.2f}")
print(f"TP={TP}, FP={FP}, TN={TN}, FN={FN}")

# 6.2 Save every original row plus the Predicted column.
pd.DataFrame(predictions).to_csv('row_level_predictions.csv', index=False)
print("Saved row_level_predictions.csv")


Row-level Precision: 0.82, Recall: 0.99
TP=29556, FP=6449, TN=63551, FN=444
Saved row_level_predictions.csv
