In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, f1_score
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

**Этап обучения для загрузки в AI модуль**

In [51]:
data = pd.read_csv("dataset_sdn.csv")
y = data['label']
X = data.drop('label', axis=1)
X = pd.get_dummies(X, columns=['src', 'dst', 'Protocol'], drop_first=True)

In [52]:
top_features = ['bytecount', 'pktcount', 'pktperflow', 'byteperflow', 'pktrate', 'tot_dur', 'dt', 'dur']
dummy_src_columns = [col for col in X.columns if col.startswith('src_')]
selected_features = top_features + dummy_src_columns
X_selected = X[selected_features]

In [53]:
X_selected.head()

Unnamed: 0,bytecount,pktcount,pktperflow,byteperflow,pktrate,tot_dur,dt,dur,src_10.0.0.10,src_10.0.0.11,...,src_10.0.0.18,src_10.0.0.2,src_10.0.0.20,src_10.0.0.3,src_10.0.0.4,src_10.0.0.5,src_10.0.0.6,src_10.0.0.7,src_10.0.0.8,src_10.0.0.9
0,48294064,45304,13535,14428310,451,101000000000.0,11425,100,False,False,...,False,False,False,False,False,False,False,False,False,False
1,134737070,126395,13531,14424046,451,281000000000.0,11605,280,False,False,...,False,False,False,False,False,False,False,False,False,False
2,96294978,90333,13534,14427244,451,201000000000.0,11425,200,False,False,...,False,True,False,False,False,False,False,False,False,False
3,96294978,90333,13534,14427244,451,201000000000.0,11425,200,False,False,...,False,True,False,False,False,False,False,False,False,False
4,96294978,90333,13534,14427244,451,201000000000.0,11425,200,False,False,...,False,True,False,False,False,False,False,False,False,False


In [54]:
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_selected, y, test_size=0.2, random_state=42)

In [55]:
imputer = SimpleImputer(strategy='mean')
X_train_1 = imputer.fit_transform(X_train_1)
X_test_1 = imputer.transform(X_test_1)

In [56]:
knn = KNeighborsClassifier()
decision_tree = DecisionTreeClassifier()
random_forest =  RandomForestClassifier()

In [57]:
def evaluate_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc

In [66]:
models = {
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "K-Nearest Neighbors": knn,
}

scaler = StandardScaler()
X_train_1 = scaler.fit_transform(X_train_1)
X_test_1 = scaler.transform(X_test_1)

results = []
for model_name, model in models.items():
    start_time = time.time()
    
    model.fit(X_train_1, y_train_1)
    y_pred = model.predict(X_test_1)
    
    accuracy = accuracy_score(y_test_1, y_pred)
    precision = precision_score(y_test_1, y_pred, average='binary')
    f1 = f1_score(y_test_1, y_pred, average='binary')
    execution_time = time.time() - start_time
    
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'F1-score': f1,
        'Time (s)': execution_time
    })

results_df = pd.DataFrame(results)

In [34]:
print("\n Таблица результатов: \n")
print(results_df.to_string(index=False))


 Таблица результатов: 

              Model  Accuracy  Precision  F1-score  Time (s)
      Decision Tree  1.000000   1.000000  1.000000  0.256860
      Random Forest  0.999856   0.999632  0.999816  5.391144
K-Nearest Neighbors  0.998515   0.997914  0.998098  2.074784


**Этап предсказания аномальности трафика**

In [59]:
data = pd.read_csv("output_dataset.csv")
y_TEST = data['is_attack']
X_TEST = data.drop(['is_attack', 'attack_type', 'timestamp', 'src_host'], axis=1)
X_TEST['Transfer_MB'] = data['Transfer'].str.extract(r'(\d+\.\d+)').astype(float)
X_TEST['Bandwidth_Mbps'] = data['Bandwidth'].str.extract(r'(\d+\.\d+)').astype(float)
X_TEST['Interval_start'] = data['Interval'].str.split('-').str[0].astype(float)
X_TEST['Interval_end'] = data['Interval'].str.split('-').str[1].astype(float)
X_TEST['packet_freq'] = data['packet_frequency'].str.extract(r'(\d+\.\d+)').astype(float)
X_TEST = pd.get_dummies(X_TEST, columns=['src_ip'], drop_first=True)

In [60]:
X_TEST.rename(columns=lambda x: x.replace('src_ip_', 'src_'), inplace=True)
extra_cols = ['Transfer_MB', 'Bandwidth_Mbps', 'Interval_start', 'Interval_end', 'packet_freq_sec']
X_TEST.rename(columns={'duration': 'dur'}, inplace=True)
X_TEST['dt'] = (pd.to_datetime(data['timestamp']).astype('int64') // 10**9 if 'timestamp' in data else 0)
X_TEST['tot_dur'] = X_TEST['Interval_end']  - X_TEST['Interval_start']
X_TEST.drop(columns=extra_cols, inplace=True, errors='ignore')

X_TEST = X_TEST[X_selected.columns]

In [None]:
results1 = []
reports = []
X_TEST = imputer.transform(X_TEST)
X_TEST = scaler.transform(X_TEST)

for model_name, model in models.items():
    start_time = time.time()
    
    y_pred = model.predict(X_TEST)
    
    # Расчет метрик
    accuracy = accuracy_score(y_TEST, y_pred)
    precision = precision_score(y_TEST, y_pred, average='binary')
    f1 = f1_score(y_TEST, y_pred, average='binary')
    execution_time = time.time() - start_time
    
    # Генерация classification report
    cls_report = classification_report(y_TEST, y_pred)
    
    # Сохранение результатов
    results1.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'F1-score': f1,
        'Time (s)': execution_time
    })
    
    # Сохранение отчета
    reports.append({
        'Model': model_name,
        'Report': cls_report
    })
results1_df = pd.DataFrame(results1)

In [63]:
print("\n Таблица результатов: \n")
print(results1_df.to_string(index=False))


 Таблица результатов: 

              Model  Accuracy  Precision  F1-score  Time (s)
      Decision Tree  0.760688   1.000000  0.832909  0.021002
      Random Forest  0.815488   0.997987  0.876138  0.129006
K-Nearest Neighbors  0.495900   0.843647  0.617625  2.984281


**Этап обучения многоклассовой классификации**

In [71]:
df_normal = pd.read_csv("Normal_data.csv")
df_metasploitable = pd.read_csv("metasploitable-2.csv")
df_OVS = pd.read_csv("OVS.csv")

In [72]:
df_combined = pd.concat([df_normal, df_metasploitable, df_OVS], axis=0, ignore_index=True)

In [73]:
df_combined['Timestamp'] = pd.to_datetime(df_combined['Timestamp'], format='mixed')
df_combined['Year'] = df_combined['Timestamp'].dt.year
df_combined['Month'] = df_combined['Timestamp'].dt.month
df_combined['Day'] = df_combined['Timestamp'].dt.day
df_combined['Hour'] = df_combined['Timestamp'].dt.hour
df_combined['Minute'] = df_combined['Timestamp'].dt.minute
df_combined['Second'] = df_combined['Timestamp'].dt.second
df_combined['DayOfWeek'] = df_combined['Timestamp'].dt.dayofweek

df_combined = df_combined.drop(columns=['Timestamp'])

In [74]:
df_combined['Src ip_part1'] = df_combined['Src IP'].apply(lambda x: int(x.split('.')[0]))
df_combined['Src ip_part2'] = df_combined['Src IP'].apply(lambda x: int(x.split('.')[1]))
df_combined['Src ip_part3'] = df_combined['Src IP'].apply(lambda x: int(x.split('.')[2]))
df_combined['Src ip_part4'] = df_combined['Src IP'].apply(lambda x: int(x.split('.')[3]))

df_combined = df_combined.drop(columns=['Src IP'])

In [75]:
features = ['Active Max', 'Idle Std', 'Fwd Act Data Pkts', 'Active Min', 'Fwd IAT Mean', 'Src ip_part1',
            'Src ip_part2', 'Src ip_part3', 'Src ip_part4', 'Year', 'Month', 'Day', 'Hour', 'Minute', 'Second', 'DayOfWeek']
X = df_combined[features]
y = df_combined["Label"]

In [106]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X, y, test_size=0.2, random_state=42)

In [107]:
scaler = StandardScaler()
X_train_2 = scaler.fit_transform(X_train_2)
X_test_2 = scaler.transform(X_test_2)

In [None]:
decision_tree = DecisionTreeClassifier()
random_forest = RandomForestClassifier()
knn = KNeighborsClassifier()
models = {
    "Decision Tree": decision_tree,
    "Random Forest": random_forest,
    "K-Nearest Neighbors": knn,
}
results = []
for model_name, model in models.items():
    start_time = time.time()
    model.fit(X_train_2, y_train_2)
    y_pred = model.predict(X_test_2)
    
    accuracy = accuracy_score(y_test_2, y_pred)
    precision = precision_score(y_test_2, y_pred, average='weighted')
    f1 = f1_score(y_test_2, y_pred, average='weighted')
    exec_time = time.time() - start_time
    
    results.append({
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'F1-score': f1,
        'Time (s)': exec_time
    })

results_df = pd.DataFrame(results)


In [109]:
print("\n Таблица результатов в наборе данных InSDN:\n")
print(results_df.to_string(index=False))


 Таблица результатов в наборе данных InSDN:

              Model  Accuracy  Precision  F1-score  Time (s)
      Decision Tree  0.999927   0.999870  0.999898  1.494298
      Random Forest  0.999942   0.999884  0.999913 15.920906
K-Nearest Neighbors  0.999113   0.999059  0.999084 17.060842


**Этап определения вида атаки**

In [110]:
data = pd.read_csv("output_dataset.csv")
y_TEST = data['attack_type']
X_TEST = data.drop('is_attack', axis=1).drop("attack_type", axis=1)

In [111]:
X_TEST['Fwd Act Data Pkts'] = X_TEST['pktcount']

active_max = X_TEST.groupby('src_ip')['duration'].max().rename('Active Max') #максимальную длительность потока в группе 
X_TEST = X_TEST.merge(active_max, on='src_ip', how='left')

active_min = X_TEST.groupby('src_ip')['duration'].min().rename('Active Min')
X_TEST = X_TEST.merge(active_min, on='src_ip', how='left')


X_TEST['Interval_start'] = X_TEST['Interval'].str.split('-').str[0].astype(float)
X_TEST['Interval_end'] = X_TEST['Interval'].str.split('-').str[1].astype(float)
X_TEST['Idle Time'] = X_TEST.groupby('src_ip')['Interval_start'].diff()  # время между потоками
idle_std = X_TEST.groupby('src_ip')['Idle Time'].std().rename('Idle Std')
X_TEST = X_TEST.merge(idle_std, on='src_ip', how='left')

X_TEST['Fwd IAT Mean'] = X_TEST['packet_frequency'].str.extract(r'(\d+\.\d+)').astype(float)


In [112]:
X_TEST['timestamp'] = pd.to_datetime(X_TEST['timestamp'], format='mixed')
X_TEST['Year'] = X_TEST['timestamp'].dt.year
X_TEST['Month'] = X_TEST['timestamp'].dt.month
X_TEST['Day'] = X_TEST['timestamp'].dt.day
X_TEST['Hour'] = X_TEST['timestamp'].dt.hour
X_TEST['Minute'] = X_TEST['timestamp'].dt.minute
X_TEST['Second'] = X_TEST['timestamp'].dt.second
X_TEST['DayOfWeek'] = X_TEST['timestamp'].dt.dayofweek

X_TEST['Src ip_part1'] = X_TEST['src_ip'].apply(lambda x: int(x.split('.')[0]))
X_TEST['Src ip_part2'] = X_TEST['src_ip'].apply(lambda x: int(x.split('.')[1]))
X_TEST['Src ip_part3'] = X_TEST['src_ip'].apply(lambda x: int(x.split('.')[2]))
X_TEST['Src ip_part4'] = X_TEST['src_ip'].apply(lambda x: int(x.split('.')[3]))

columns_to_drop = [
    'timestamp', 'src_host', 'src_mac', 'dst_host', 'dst_ip','Interval', 'Transfer', 'Bandwidth', 
    'packet_frequency', 'src_ip', 'Interval_start',
    'Interval_end', 'Idle Time', 'duration', 'attackers_count', 'bytecount', 'pktcount', 'pktperflow',
       'byteperflow', 'pktrate'
]
X_TEST = X_TEST.drop(columns=[col for col in columns_to_drop if col in X_TEST.columns])

In [113]:
imputer = SimpleImputer(strategy='mean')
X_TEST_imputed = imputer.fit_transform(X_TEST)
X_TEST_imputed = scaler.fit_transform(X_TEST_imputed)

In [114]:
label_mapping = {
    'ddos-udp': 'DDoS',
    'ddos-icmp': 'DDoS',
    'ddos-syn': 'DDoS',
    'ddos-http': 'DDoS',
    'normal': 'Normal'
}

y_TEST_mapped = y_TEST.map(label_mapping)

In [None]:
results2 = []
for model_name, model in models.items():
    
    start_time = time.time()
    y_pred = model.predict(X_TEST_imputed)
    accuracy = accuracy_score(y_TEST_mapped, y_pred)
    precision = precision_score(y_TEST_mapped, y_pred, average='weighted')
    f1 = f1_score(y_TEST_mapped, y_pred, average='weighted')
    exec_time = time.time() - start_time

    results2.append({
            'Model': model_name,
            'Accuracy': accuracy,
            'Precision': precision,
            'F1-score': f1,
            'Time (s)': exec_time
        })

results2_df = pd.DataFrame(results2)

In [None]:
print("\n Таблица результатов в наборе данных InSDN:\n")
print(results2_df.to_string(index=False))


 Таблица результатов в наборе данных InSDN:

              Model  Accuracy  Precision  F1-score  Time (s)
      Decision Tree  0.901299   0.901531  0.901316  0.216338
      Random Forest  0.799055   0.799330  0.799080  5.506018
K-Nearest Neighbors  0.477645   0.481441  0.475325  0.219387


***Этап обучения на новых данных***

In [125]:
data = pd.read_csv("output_dataset.csv")
y_TEST = data['is_attack']
X_TEST = data.drop(['is_attack', 'attack_type', 'timestamp', 'src_host'], axis=1)
X_TEST['Transfer_MB'] = data['Transfer'].str.extract(r'(\d+\.\d+)').astype(float)
X_TEST['Bandwidth_Mbps'] = data['Bandwidth'].str.extract(r'(\d+\.\d+)').astype(float)
X_TEST['Interval_start'] = data['Interval'].str.split('-').str[0].astype(float)
X_TEST['Interval_end'] = data['Interval'].str.split('-').str[1].astype(float)
X_TEST['packet_freq'] = data['packet_frequency'].str.extract(r'(\d+\.\d+)').astype(float)
X_TEST = pd.get_dummies(X_TEST, columns=['src_ip'], drop_first=True)
X_TEST.rename(columns=lambda x: x.replace('src_ip_', 'src_'), inplace=True)
extra_cols = ['Transfer_MB', 'Bandwidth_Mbps', 'Interval_start', 'Interval_end', 'packet_freq_sec']
X_TEST.rename(columns={'duration': 'dur'}, inplace=True)
X_TEST['dt'] = (pd.to_datetime(data['timestamp']).astype('int64') // 10**9 if 'timestamp' in data else 0)
X_TEST['tot_dur'] = X_TEST['Interval_end']  - X_TEST['Interval_start']
X_TEST.drop(columns=extra_cols, inplace=True, errors='ignore')

X_TEST = X_TEST[X_selected.columns]


In [127]:
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(X_TEST, y_TEST, test_size=0.2, random_state=42)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [3, 5, 7, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None]
}

dt = DecisionTreeClassifier()
grid_search = GridSearchCV(dt, params, cv=5, scoring='f1_weighted')
grid_search.fit(X_train_3, y_train_3)

best_dt = grid_search.best_estimator_

In [130]:
best_dt

In [131]:
data = pd.read_csv("output_dataset.csv")
y_TEST = data['attack_type']
X_TEST = data.drop('is_attack', axis=1).drop("attack_type", axis=1)
X_TEST['Fwd Act Data Pkts'] = X_TEST['pktcount']

active_max = X_TEST.groupby('src_ip')['duration'].max().rename('Active Max') #максимальную длительность потока в группе 
X_TEST = X_TEST.merge(active_max, on='src_ip', how='left')

active_min = X_TEST.groupby('src_ip')['duration'].min().rename('Active Min')
X_TEST = X_TEST.merge(active_min, on='src_ip', how='left')


X_TEST['Interval_start'] = X_TEST['Interval'].str.split('-').str[0].astype(float)
X_TEST['Interval_end'] = X_TEST['Interval'].str.split('-').str[1].astype(float)
X_TEST['Idle Time'] = X_TEST.groupby('src_ip')['Interval_start'].diff()  # время между потоками
idle_std = X_TEST.groupby('src_ip')['Idle Time'].std().rename('Idle Std')
X_TEST = X_TEST.merge(idle_std, on='src_ip', how='left')

X_TEST['Fwd IAT Mean'] = X_TEST['packet_frequency'].str.extract(r'(\d+\.\d+)').astype(float)

X_TEST['timestamp'] = pd.to_datetime(X_TEST['timestamp'], format='mixed')
X_TEST['Year'] = X_TEST['timestamp'].dt.year
X_TEST['Month'] = X_TEST['timestamp'].dt.month
X_TEST['Day'] = X_TEST['timestamp'].dt.day
X_TEST['Hour'] = X_TEST['timestamp'].dt.hour
X_TEST['Minute'] = X_TEST['timestamp'].dt.minute
X_TEST['Second'] = X_TEST['timestamp'].dt.second
X_TEST['DayOfWeek'] = X_TEST['timestamp'].dt.dayofweek

X_TEST['Src ip_part1'] = X_TEST['src_ip'].apply(lambda x: int(x.split('.')[0]))
X_TEST['Src ip_part2'] = X_TEST['src_ip'].apply(lambda x: int(x.split('.')[1]))
X_TEST['Src ip_part3'] = X_TEST['src_ip'].apply(lambda x: int(x.split('.')[2]))
X_TEST['Src ip_part4'] = X_TEST['src_ip'].apply(lambda x: int(x.split('.')[3]))

columns_to_drop = [
    'timestamp', 'src_host', 'src_mac', 'dst_host', 'dst_ip','Interval', 'Transfer', 'Bandwidth', 
    'packet_frequency', 'src_ip', 'Interval_start',
    'Interval_end', 'Idle Time', 'duration', 'attackers_count', 'bytecount', 'pktcount', 'pktperflow',
       'byteperflow', 'pktrate'
]
X_TEST = X_TEST.drop(columns=[col for col in columns_to_drop if col in X_TEST.columns])

In [132]:
imputer = SimpleImputer(strategy='mean')
X_TEST_imputed = imputer.fit_transform(X_TEST)
X_TEST_imputed = scaler.fit_transform(X_TEST_imputed)

In [133]:
X_train_4, X_test_4, y_train_4, y_test_4 = train_test_split(X_TEST_imputed, y_TEST, test_size=0.2, random_state=42)

In [134]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

params = {
    'max_depth': [3, 5, 7, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy'],
    'max_features': ['sqrt', 'log2', None]
}

dt_mclass = DecisionTreeClassifier()
grid_search_mclass = GridSearchCV(dt_mclass, params, cv=5, scoring='f1_weighted')
grid_search_mclass.fit(X_train_4, y_train_4)

best_dt_mclass = grid_search.best_estimator_

In [135]:
best_dt_mclass