In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
import Preprocessing
import DataProcessing
import model
import numpy as np
import utils
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
# Load the dataset stored under data/rt_iot2022; 'Attack_type' is passed as the target column.
data = Preprocessing.load_data(
    'data/rt_iot2022',
    target_col='Attack_type',
)

In [None]:
# Inspect the "object"-typed columns of the raw frame.
# NOTE(review): presumably these are the columns that still need encoding — confirm what check_types reports.
Preprocessing.check_types(data, "object")

In [None]:
# Drop the columns that are not used as features.
columns_to_drop = ["Unnamed: 0", 'service']
data = Preprocessing.delete_columns(data, columns_to_drop)

# Keep only the columns whose label is not NaN.
named_column_mask = data.columns.notna()
data = data.loc[:, named_column_mask]

# Encode the categorical inputs: one-hot for 'proto', label encoding for the target.
data = Preprocessing.one_hot_encode(data, ['proto'])
data = Preprocessing.label_encode(data, ['Attack_type'])

preview = data.head(5)
print(preview)

In [None]:
# Find the columns that are still non-numeric after encoding.
non_numeric_frame = data.select_dtypes(exclude=["number"])
non_numeric_cols = non_numeric_frame.columns.tolist()
print("Các cột không phải kiểu số:", non_numeric_cols, sep="\n")


In [None]:
# Separate the 'Attack_type' target from the feature columns.
y = data['Attack_type']
X = data.drop(columns='Attack_type')

# NOTE(review): scaling runs on the full dataset before the train/test split. If scale_data
# fits its statistics on X, test rows influence them (possible leakage) — confirm.
X, _ = Preprocessing.scale_data(X)
# X_balanced, y_balanced = Preprocessing.apply_smote(X, y)

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Both variants use the same 80/20 split settings (fixed seed).
split_options = dict(test_size=0.2, random_state=42)

# Variant without SMOTE.
X_train_nsm, X_test_nsm, y_train_nsm, y_test_nsm = train_test_split(X, y, **split_options)

# Variant with SMOTE: only the training portion is resampled, the test portion is left as split.
X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X, y, **split_options)
X_train_sm, y_train_sm = Preprocessing.apply_smote(X_train_sm, y_train_sm)

Train với dữ liệu chưa giảm chiều

LinearSVC

In [None]:
# Train LinearSVC on the SMOTE-resampled training data and predict on the held-out test set.
model_LinearSVC = model.model_LinearSVC(X_train_sm, y_train_sm)
y_pred = model_LinearSVC.predict(X_test_sm)
# Argument order fixed to (y_true, y_pred), matching every other utils.report call in this
# notebook; the original passed the predictions first.
utils.report(y_test_sm, y_pred)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred.
accuracy = accuracy_score(y_test_sm, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm, y_pred, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

xgboost

In [None]:
# Train XGBoost on the SMOTE-resampled training data.
# NOTE(review): the third argument (12) is presumably the number of Attack_type classes — confirm.
model_xgboost = model.model_xgboost(
    X_train_sm,
    y_train_sm,
    12,
)
y_pred = model_xgboost.predict(X_test_sm)
utils.report(y_test_sm, y_pred)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred.
accuracy = accuracy_score(y_test_sm, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm, y_pred, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

logistic_regression

In [None]:
# Train logistic regression on the SMOTE-resampled training data and evaluate on the test set.
model_logistic_regression = model.model_logistic_regression(
    X_train_sm,
    y_train_sm,
)
y_pred = model_logistic_regression.predict(X_test_sm)
utils.report(y_test_sm, y_pred)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred.
accuracy = accuracy_score(y_test_sm, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm, y_pred, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

KNN

In [None]:
# Train KNN on the SMOTE-resampled training data and evaluate on the test set.
model_knn = model.model_knn(
    X_train_sm,
    y_train_sm,
)
y_pred = model_knn.predict(X_test_sm)
utils.report(y_test_sm, y_pred)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred.
accuracy = accuracy_score(y_test_sm, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm, y_pred, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

random_forest

In [None]:
# Train the random forest on the SMOTE-resampled training data and evaluate on the test set.
model_random_forest = model.model_random_forest(
    X_train_sm,
    y_train_sm,
)
y_pred = model_random_forest.predict(X_test_sm)
utils.report(y_test_sm, y_pred)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred.
accuracy = accuracy_score(y_test_sm, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm, y_pred, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

MLP

In [None]:
# Train the MLP on the SMOTE-resampled training data; the input width is taken from the
# training matrix and the three hidden layers are sized 128/64/32.
# NOTE(review): output_size=12 is presumably the number of Attack_type classes — confirm.
model_mlp = model.model_mlp(
    X_train_sm,
    y_train_sm,
    input_size=X_train_sm.shape[1],
    hidden_size1=128,
    hidden_size2=64,
    hidden_size3=32,
    output_size=12,
    epochs=1000,
)
y_pred = model.predict_mlp(model_mlp, X_test_sm)
utils.report(y_test_sm, y_pred)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred.
accuracy = accuracy_score(y_test_sm, y_pred)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm, y_pred, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

Train với dữ liệu đã giảm chiều

In [None]:
# Xử lý dữ liệu
list = DataProcessing.select_feature(X_train_sm, y_train_sm)
print(len(list))

In [None]:
X_train_sm = X_train_sm[list['Feature']]
df_reduced, dropped_columns = DataProcessing.remove_highly_correlated_columns(X_train_sm[list['Feature']], threshold=0.8)

In [None]:
X_train_sm_dr = df_reduced
X_test_sm_dr = X_test_sm[list['Feature']].drop(dropped_columns, axis=1)
y_train_sm_dr = y_train_sm.copy()
y_test_sm_dr = y_test_sm.copy()

LinearSVC

In [None]:
# Train LinearSVC on the reduced feature set and evaluate on the reduced test set.
model_LinearSVC_dr = model.model_LinearSVC(
    X_train_sm_dr,
    y_train_sm_dr,
)
y_pred_dr = model_LinearSVC_dr.predict(X_test_sm_dr)
utils.report(y_test_sm_dr, y_pred_dr)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred_dr.
accuracy = accuracy_score(y_test_sm_dr, y_pred_dr)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm_dr, y_pred_dr, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

xgboost

In [None]:
# Train XGBoost on the reduced feature set.
# NOTE(review): the third argument (12) is presumably the number of Attack_type classes — confirm.
model_xgboost_dr = model.model_xgboost(
    X_train_sm_dr,
    y_train_sm_dr,
    12,
)
y_pred_dr = model_xgboost_dr.predict(X_test_sm_dr)
utils.report(y_test_sm_dr, y_pred_dr)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred_dr.
accuracy = accuracy_score(y_test_sm_dr, y_pred_dr)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm_dr, y_pred_dr, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

logistic_regression

In [None]:
# Train logistic regression on the reduced feature set and evaluate on the reduced test set.
model_logistic_regression_dr = model.model_logistic_regression(
    X_train_sm_dr,
    y_train_sm_dr,
)
y_pred_dr = model_logistic_regression_dr.predict(X_test_sm_dr)
utils.report(y_test_sm_dr, y_pred_dr)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred_dr.
accuracy = accuracy_score(y_test_sm_dr, y_pred_dr)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm_dr, y_pred_dr, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

KNN

In [None]:
# Train KNN on the reduced feature set and evaluate on the reduced test set.
model_knn_dr = model.model_knn(
    X_train_sm_dr,
    y_train_sm_dr,
)
y_pred_dr = model_knn_dr.predict(X_test_sm_dr)
utils.report(y_test_sm_dr, y_pred_dr)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred_dr.
accuracy = accuracy_score(y_test_sm_dr, y_pred_dr)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm_dr, y_pred_dr, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

random_forest

In [None]:
# Train the random forest on the reduced feature set and evaluate on the reduced test set.
model_random_forest_dr = model.model_random_forest(
    X_train_sm_dr,
    y_train_sm_dr,
)
y_pred_dr = model_random_forest_dr.predict(X_test_sm_dr)
utils.report(y_test_sm_dr, y_pred_dr)

In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred_dr.
accuracy = accuracy_score(y_test_sm_dr, y_pred_dr)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm_dr, y_pred_dr, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

MLP

In [None]:
# Train the MLP on the reduced feature set; the input width is taken from the reduced
# training matrix and the three hidden layers are sized 128/64/32.
# NOTE(review): epochs=10000 here versus 1000 for the full-feature MLP — confirm the
# difference is intentional, since it affects how comparable the two runs are.
model_mlp_dr = model.model_mlp(
    X_train_sm_dr,
    y_train_sm_dr,
    input_size=X_train_sm_dr.shape[1],
    hidden_size1=128,
    hidden_size2=64,
    hidden_size3=32,
    output_size=12,
    epochs=10000,
)

In [None]:
# Predict with the reduced-feature MLP and report against the reduced-pipeline labels.
y_pred_dr = model.predict_mlp(model_mlp_dr, X_test_sm_dr)
# Uses y_test_sm_dr (not y_test_sm) so this cell follows the same naming as every other
# reduced-feature evaluation and does not depend on the full-feature variables.
utils.report(y_test_sm_dr, y_pred_dr)



In [None]:
# Accuracy and macro-averaged metrics for the predictions currently held in y_pred_dr.
# Scored against y_test_sm_dr (not y_test_sm) to stay consistent with the other
# reduced-feature metric cells.
accuracy = accuracy_score(y_test_sm_dr, y_pred_dr)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm_dr, y_pred_dr, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

Train mô hình tinh chỉnh.

In [None]:

# Disabled cell: re-evaluation of the full-feature random forest, kept commented out.
# y_pred = model_random_forest.predict(X_test_sm)
# utils.report(y_pred,y_test_sm)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import randint

Random Forest

In [None]:
# Randomized hyper-parameter search for the random forest on the reduced feature set.

# Search space: integer distributions for the number of trees and the tree depth.
param_dist = {
    'n_estimators': randint(50, 200),
    'max_depth': randint(3, 10)
}

# Base estimator
get_random_forest_model_instance = model.get_random_forest_model()

# Randomized search: 20 sampled candidates, 5-fold CV, scored by macro F1.
# random_state makes the sampled candidates reproducible and n_jobs=-1 uses all CPUs,
# matching the other searches in this notebook.
random_search = RandomizedSearchCV(
    estimator=get_random_forest_model_instance,
    param_distributions=param_dist,
    n_iter=20,
    cv=5,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1
)

# Fit
random_search.fit(X_train_sm_dr, y_train_sm_dr)

# Best parameters found
print("Best params:", random_search.best_params_)

# Predict with the refitted best estimator and evaluate.
y_pred_rd = random_search.predict(X_test_sm_dr)
# Header corrected: this cell evaluates the random forest, not XGBoost.
print("Random Forest Classification Report:")
utils.report(y_test_sm_dr, y_pred_rd)

In [None]:
# Accuracy and macro-averaged metrics for the tuned random forest predictions (y_pred_rd).
accuracy = accuracy_score(y_test_sm_dr, y_pred_rd)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm_dr, y_pred_rd, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

XGBoost

In [None]:
# Randomized hyper-parameter search for XGBoost on the reduced feature set.

# Search space.
# `scale_pos_weight` was removed: it is a binary-classification parameter, while this task is
# multi-class, and on the SMOTE-balanced training labels both candidate values were 1 anyway,
# so it only wasted a search dimension.
param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7, 10],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

# Base estimator
xgb_model = model.get_xgboost_model(random_state=42)

# Randomized search
random_search_xgb = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=param_dist,
    n_iter=10,  # number of sampled candidates
    cv=5,       # 5-fold cross-validation
    scoring='f1_macro',  # macro F1 weights every class equally
    random_state=42,
    n_jobs=-1    # use all CPUs
)

# Fit
random_search_xgb.fit(X_train_sm_dr, y_train_sm_dr)

# Best model found by the search
best_xgb_model = random_search_xgb.best_estimator_
print("Best parameters for XGBoost:", random_search_xgb.best_params_)

# Predict and evaluate
y_pred_xgb = best_xgb_model.predict(X_test_sm_dr)
print("XGBoost Classification Report:")
utils.report(y_test_sm_dr, y_pred_xgb)

In [None]:
# Accuracy and macro-averaged metrics for the tuned XGBoost predictions (y_pred_xgb).
accuracy = accuracy_score(y_test_sm_dr, y_pred_xgb)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm_dr, y_pred_xgb, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])

Logistic Regression

In [None]:
# NOTE: this Logistic Regression search is disabled — the whole cell is commented out.
# # Parameter space for Logistic Regression
# param_dist_lr = {
#     'C': [0.001, 0.01, 0.1, 1, 10, 100],
#     'max_iter': [100, 500, 1000],
#     'solver': ['lbfgs', 'saga'],
#     'class_weight': ['balanced', None]  # Handle class imbalance
# }

# # Base estimator
# lr_model = model.get_logistic_regression_model(random_state=42)

# # Randomized search
# random_search_lr = RandomizedSearchCV(
#     estimator=lr_model,
#     param_distributions=param_dist_lr,
#     n_iter=10,
#     cv=5,
#     scoring='f1_macro',  # Suitable for imbalanced data
#     random_state=42,
#     n_jobs=-1
# )

# # Fit
# random_search_lr.fit(X_train_sm_dr, y_train_sm_dr)

# # Best model found by the search
# best_lr_model = random_search_lr.best_estimator_
# print("Best parameters for Logistic Regression:", random_search_lr.best_params_)

# # Predict and evaluate
# y_pred_lr = best_lr_model.predict(X_test_sm_dr)
# print("Logistic Regression Classification Report:")
# utils.report(y_test_sm_dr, y_pred_lr)

KNN

In [None]:
# Randomized hyper-parameter search for KNN on the reduced feature set.

# Search space: neighbourhood size, vote weighting and distance metric.
param_dist_knn = dict(
    n_neighbors=[3, 5, 7, 9, 11],
    weights=['uniform', 'distance'],
    metric=['euclidean', 'manhattan'],
)

# Base estimator
knn_model = model.get_knn_model()

# Randomized search: 10 sampled candidates, 5-fold CV, macro F1, fixed seed, all CPUs.
random_search_knn = RandomizedSearchCV(
    estimator=knn_model,
    param_distributions=param_dist_knn,
    n_iter=10,
    cv=5,
    scoring='f1_macro',
    random_state=42,
    n_jobs=-1,
)
random_search_knn.fit(X_train_sm_dr, y_train_sm_dr)

# Best model found by the search
best_knn_model = random_search_knn.best_estimator_
print("Best parameters for KNN:", random_search_knn.best_params_)

# Predict and evaluate
y_pred_knn = best_knn_model.predict(X_test_sm_dr)
print("KNN Classification Report:")
utils.report(y_test_sm_dr, y_pred_knn)

In [None]:
# Accuracy and macro-averaged metrics for the tuned KNN predictions (y_pred_knn).
accuracy = accuracy_score(y_test_sm_dr, y_pred_knn)
print("Accuracy:", accuracy)

report = classification_report(y_test_sm_dr, y_pred_knn, output_dict=True)
macro_avg = report["macro avg"]
for caption, field in (
    ("Precision (macro):", "precision"),
    ("Recall (macro):", "recall"),
    ("F1-score (macro):", "f1-score"),
    ("Support (macro):", "support"),
):
    print(caption, macro_avg[field])
