# 用 testing data（之前模型都沒有看過的測試資料，資料夾 03-11）來個別測試一般網格搜尋調整過後的 SVM 模型的測試準確率（模型最終的表現），並附上耗時

In [2]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import time

# ===============================
# Load data
# ===============================
# Purpose of this cell: tune an RBF-kernel SVC with an exhaustive grid search
# on the up-sampled training set, then report accuracy, the classification
# report and the confusion matrix on the separate test CSVs.
print("載入資料...")
# All four CSV files live in the same directory; keep the path in one place.
DATA_DIR = '/Users/User/.vscode/thesis_experiment/CIC-DDoS-2019/01-12'
X_train = pd.read_csv(f'{DATA_DIR}/X_train_upsampled.csv')
Y_train = pd.read_csv(f'{DATA_DIR}/Y_train_upsampled.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test_ext.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test_ext.csv')
# pd.read_csv (without chunksize/iterator) always returns a DataFrame, so the
# label column can be taken directly as a 1-D Series.
y_train = Y_train.iloc[:, 0]

# ===============================
# Feature standardisation
# ===============================
print("特徵標準化...")
# Scaling statistics are learned from the training set only and then applied
# unchanged to the test set.
# NOTE(review): the scaler is fit on the full training set *before* the
# cross-validation below, so every validation fold has contributed to the
# scaling statistics. Wrapping scaler + estimator in a sklearn Pipeline would
# avoid that; left as-is here so timings/scores stay comparable.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ===============================
# Plain (exhaustive) GridSearchCV
# ===============================
print("執行一般 GridSearchCV 搜尋...")

# 5 values of C x 4 values of gamma x 1 kernel = 20 candidates.
param_grid = {
    'C': [1, 10, 100, 500, 1000],
    'gamma': [0.0001, 0.001, 0.01, 0.1],
    'kernel': ['rbf']
}

svm = SVC(random_state=42)

# perf_counter() is a monotonic high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments during a long run.
start_time = time.perf_counter()

grid_search = GridSearchCV(
    estimator=svm,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search.fit(X_train_scaled, y_train)

elapsed_time = time.perf_counter() - start_time

print("\n Traditional GridSearchCV search completed.")
print(f"execution time: {elapsed_time:.2f} sec")
print(f"best parameter: {grid_search.best_params_}")
print(f"best cross-validation grade: {grid_search.best_score_:.6f}")

# ===============================
# Test-set evaluation
# ===============================
print("\n the evaluation of the test set...")
best_svm_model = grid_search.best_estimator_
y_pred = best_svm_model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"testing accuracy: {accuracy:.6f}")
print("\nclassification report:")
print(classification_report(y_test, y_pred))
print("\nconfusion matrix:")
print(confusion_matrix(y_test, y_pred))


載入資料...
特徵標準化...
執行一般 GridSearchCV 搜尋...

 Traditional GridSearchCV search completed.
execution time: 4959.80 秒
best parameter: {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
best cross-validation grade: 0.999693

 the evaluation of the test set...
testing accuracy: 0.998647

classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49763
           1       1.00      1.00      1.00     25620

    accuracy                           1.00     75383
   macro avg       1.00      1.00      1.00     75383
weighted avg       1.00      1.00      1.00     75383


confusion matrix:
[[49666    97]
 [    5 25615]]


In [None]:
import pandas as pd
import numpy as np
import time
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load data
# Purpose of this cell: tune an L2-penalised logistic regression with an
# exhaustive grid search over C and print the search summary.
# All four CSV files live in the same directory; keep the path in one place.
DATA_DIR = '/Users/User/.vscode/thesis_experiment/CIC-DDoS-2019/01-12'
X_train = pd.read_csv(f'{DATA_DIR}/X_train_upsampled.csv')
Y_train = pd.read_csv(f'{DATA_DIR}/Y_train_upsampled.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test_ext.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test_ext.csv')
# pd.read_csv (without chunksize/iterator) always returns a DataFrame, so the
# label column can be taken directly as a 1-D Series.
y_train = Y_train.iloc[:, 0]

# Feature standardisation: statistics come from the training set only.
# NOTE(review): the scaler is fit before cross-validation, so validation folds
# influence the scaling statistics; a sklearn Pipeline would avoid that.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Parameter range for the plain grid search (same as the Adaptive search).
param_grid_lr = {
    'C': [1e-5, 1e-4, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'penalty': ['l2'],
    'solver': ['lbfgs']
}

# Build the model and the grid search.
logreg_model = LogisticRegression(max_iter=1000, random_state=42)

# perf_counter() is a monotonic high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments.
start_time = time.perf_counter()

grid_search_lr = GridSearchCV(
    estimator=logreg_model,
    param_grid=param_grid_lr,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search_lr.fit(X_train_scaled, y_train)

elapsed_time = time.perf_counter() - start_time

best_lr_model = grid_search_lr.best_estimator_

# Show the search results.
print("\nTraditional GridSearchCV search completed.")
print(f"execution time: {elapsed_time:.2f} sec")
print(f"best parameter: {grid_search_lr.best_params_}")
print(f"best cross-validation grade: {grid_search_lr.best_score_:.6f}")

# Test-set evaluation
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Print accuracy, classification report and confusion matrix for a model.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.
        model_name: Label prefixed to every printed line.

    Returns:
        None. All results are written to stdout.
    """
    predictions = model.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    print(f"\n{model_name} testing accuracy: {acc:.6f}")

    report = classification_report(y_test, predictions)
    print(f"{model_name} classification report:\n{report}")

    matrix = confusion_matrix(y_test, predictions)
    print(f"{model_name} confusion matrix:\n{matrix}")

# Report test-set metrics for the best logistic-regression model found above.
evaluate_model(
    best_lr_model,
    X_test_scaled,
    y_test,
    model_name="Logistic Regression (GridSearch)",
)



Traditional GridSearchCV search completed.
execution time: 27.38 sec
best parameter: {'C': 0.001, 'penalty': 'l2', 'solver': 'lbfgs'}
best cross-validation grade: 0.990380

Logistic Regression (GridSearch) testing accuracy: 0.996830
Logistic Regression (GridSearch) classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49763
           1       0.99      1.00      1.00     25620

    accuracy                           1.00     75383
   macro avg       1.00      1.00      1.00     75383
weighted avg       1.00      1.00      1.00     75383

Logistic Regression (GridSearch) confusion matrix:
[[49526   237]
 [    2 25618]]


In [2]:
from sklearn.neighbors import KNeighborsClassifier

# Parameter range for the plain grid search (same as the Adaptive search).
# NOTE(review): this cell only imports KNeighborsClassifier; `time`,
# `GridSearchCV`, `X_train_scaled` and `y_train` must already exist from a
# previously executed cell, so results depend on which cell ran before it.
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Build the model and the grid search.
knn_model = KNeighborsClassifier()

# perf_counter() is a monotonic high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments during a long run.
start_time = time.perf_counter()

grid_search_knn = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid_knn,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search_knn.fit(X_train_scaled, y_train)

elapsed_time = time.perf_counter() - start_time

best_knn_model = grid_search_knn.best_estimator_

# Show the search results.
print("\n===== 一般 GridSearchCV (KNN) 結果 =====")
print(f"耗時: {elapsed_time:.2f} 秒")
print(f"最佳參數: {grid_search_knn.best_params_}")
print(f"最佳交叉驗證分數: {grid_search_knn.best_score_:.6f}")

# Test-set evaluation
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Print accuracy, classification report and confusion matrix for a model.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.
        model_name: Label prefixed to every printed line.

    Returns:
        None. All results are written to stdout.
    """
    predictions = model.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    print(f"\n{model_name} 測試準確率: {acc:.6f}")

    report = classification_report(y_test, predictions)
    print(f"{model_name} 分類報告:\n{report}")

    matrix = confusion_matrix(y_test, predictions)
    print(f"{model_name} 混淆矩陣:\n{matrix}")

# Report test-set metrics for the best KNN model found above.
evaluate_model(
    best_knn_model,
    X_test_scaled,
    y_test,
    model_name="KNN (GridSearch)",
)



===== 一般 GridSearchCV (KNN) 結果 =====
耗時: 6571.91 秒
最佳參數: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
最佳交叉驗證分數: 0.999799

KNN (GridSearch) 測試準確率: 0.999680
KNN (GridSearch) 分類報告:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     18807
           1       1.00      1.00      1.00     90714

    accuracy                           1.00    109521
   macro avg       1.00      1.00      1.00    109521
weighted avg       1.00      1.00      1.00    109521

KNN (GridSearch) 混淆矩陣:
[[18799     8]
 [   27 90687]]


## 一般網格搜尋的KNN (最終版)

In [5]:
import pandas as pd
import numpy as np
import time
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load data
# Purpose of this cell: self-contained version of the KNN grid search; it
# reloads and rescales the data itself instead of relying on earlier cells.
print("載入資料...")
# All four CSV files live in the same directory; keep the path in one place.
DATA_DIR = '/Users/User/.vscode/thesis_experiment/CIC-DDoS-2019/01-12'
X_train = pd.read_csv(f'{DATA_DIR}/X_train_upsampled.csv')
Y_train = pd.read_csv(f'{DATA_DIR}/Y_train_upsampled.csv')
X_test = pd.read_csv(f'{DATA_DIR}/X_test_ext.csv')
y_test = pd.read_csv(f'{DATA_DIR}/y_test_ext.csv')
# pd.read_csv (without chunksize/iterator) always returns a DataFrame, so the
# label column can be taken directly as a 1-D Series.
y_train = Y_train.iloc[:, 0]

# Feature standardisation: statistics come from the training set only.
# NOTE(review): the scaler is fit before cross-validation, so validation folds
# influence the scaling statistics; a sklearn Pipeline would avoid that.
print("特徵標準化...")
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Parameter range for the plain grid search (same as the original code).
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# Build the model and the grid search.
print("Executing traditional GridSearchCV (KNN) searching...")
knn_model = KNeighborsClassifier()

# perf_counter() is a monotonic high-resolution clock, so the measured
# duration cannot be distorted by system clock adjustments during a long run.
start_time = time.perf_counter()

grid_search_knn = GridSearchCV(
    estimator=knn_model,
    param_grid=param_grid_knn,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

grid_search_knn.fit(X_train_scaled, y_train)

elapsed_time = time.perf_counter() - start_time

# Show the search results.
print("\n===== Traditional GridSearchCV (KNN) result =====")
print(f"execution time: {elapsed_time:.2f} 秒")
print(f"best parameter: {grid_search_knn.best_params_}")
print(f"best cross-validation grade: {grid_search_knn.best_score_:.6f}")

# Test-set evaluation
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Print accuracy, classification report and confusion matrix for a model.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: True labels compared against the predictions.
        model_name: Label prefixed to every printed line.

    Returns:
        None. All results are written to stdout.
    """
    predictions = model.predict(X_test)

    acc = accuracy_score(y_test, predictions)
    print(f"\n{model_name} testing accuracy: {acc:.6f}")

    report = classification_report(y_test, predictions)
    print(f"{model_name} classification report:\n{report}")

    matrix = confusion_matrix(y_test, predictions)
    print(f"{model_name} confusion matrix:\n{matrix}")

# Take the best estimator found by the grid search and report its test-set metrics.
best_knn_model = grid_search_knn.best_estimator_
evaluate_model(
    best_knn_model,
    X_test_scaled,
    y_test,
    model_name="KNN (GridSearch)",
)

載入資料...
特徵標準化...
Executing traditional GridSearchCV (KNN) searching...

===== Traditional GridSearchCV (KNN) result =====
execution time: 4995.56 秒
best parameter: {'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
best cross-validation grade: 0.999799

KNN (GridSearch) testing accuracy: 0.998395
KNN (GridSearch) classification report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     49763
           1       1.00      1.00      1.00     25620

    accuracy                           1.00     75383
   macro avg       1.00      1.00      1.00     75383
weighted avg       1.00      1.00      1.00     75383

KNN (GridSearch) confusion matrix:
[[49644   119]
 [    2 25618]]
