In [4]:
# Install required packages
!pip install -q gdown imbalanced-learn transformers scikit-learn tensorflow

import os
import json
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, IsolationForest, VotingClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans, DBSCAN
from sklearn.feature_selection import mutual_info_classif
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import joblib

# Create models directory
os.makedirs("models", exist_ok=True)
blockchain_path = "models/blockchain.json"

if not os.path.exists(blockchain_path):
    with open(blockchain_path, "w") as f:
        json.dump([], f)

# Download and load NSL-KDD dataset
!wget -q https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt -O /content/KDDTrain+.txt

nsl = pd.read_csv("/content/KDDTrain+.txt", header=None)
nsl.columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land',
    'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised',
    'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
    'label', 'difficulty'
]

nsl = nsl.sample(n=50000, random_state=42)
nsl['label'] = nsl['label'].apply(lambda x: 0 if x == 'normal' else 1)
nsl = nsl.select_dtypes(include=[np.number])

# Download and load CICIDS2017 dataset
!gdown --folder https://drive.google.com/drive/folders/1BD8tEZ7K6ZBbxhexn40MFIkYSthriu0V?usp=drive_link -O /content/cicids2017 --quiet

cic_path = "/content/cicids2017/MachineLearningCSV /Wednesday-workingHours.pcap_ISCX.csv"
cic = pd.read_csv(cic_path).sample(n=50000, random_state=42)
cic.columns = cic.columns.str.strip()
cic = cic.replace([np.inf, -np.inf], np.nan).dropna()
cic['label'] = (cic['Label'] != 'BENIGN').astype(int)
cic = cic.select_dtypes(include=[np.number])

# Feature mapping between datasets
mapping = {
    'src_bytes': 'Flow Bytes/s', 'dst_bytes': 'Flow Packets/s',
    'wrong_fragment': 'Fwd Header Length', 'num_failed_logins': 'Fwd Packets/s',
    'hot': 'Flow IAT Mean', 'logged_in': 'Flow IAT Max', 'count': 'Fwd IAT Total',
    'srv_count': 'Subflow Fwd Bytes', 'same_srv_rate': 'Fwd IAT Std',
    'dst_host_srv_count': 'Flow IAT Std', 'dst_host_same_srv_rate': 'Idle Std',
    'dst_host_diff_srv_rate': 'Idle Max', 'dst_host_serror_rate': 'Fwd Header Length.1',
    'dst_host_srv_serror_rate': 'Flow Duration', 'srv_rerror_rate': 'Init_Win_bytes_forward',
    'srv_serror_rate': 'Bwd Packet Length Min', 'rerror_rate': 'Bwd Packet Length Max',
    'num_compromised': 'Bwd Packet Length Mean'
}

# Select and rename features for both datasets
nsl2 = nsl[list(mapping.keys()) + ['label']].copy()
cic2 = cic[[mapping[k] for k in mapping if mapping[k] in cic.columns] + ['label']].copy()
cic2.columns = list(mapping.keys())[:len(cic2.columns)-1] + ['label']

# Merge datasets
df = pd.concat([nsl2, cic2], ignore_index=True).dropna()

print("Class distribution before merge:")
print("NSL-KDD:", nsl['label'].value_counts(normalize=True))
print("CICIDS:", cic['label'].value_counts(normalize=True))
print("After merge:", df['label'].value_counts(normalize=True))

# Feature selection
X = df.drop("label", axis=1)
y = df["label"]

mi = mutual_info_classif(X, y)
mi_scores = pd.Series(mi, index=X.columns).sort_values(ascending=False)
print("Feature rankings (mutual information):")
print(mi_scores)

top_features = X.columns[mi > 0.01]
X = X[top_features]

# Train-test split and scaling
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Utility functions
def save_conf_matrix(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f"{title} Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.savefig(f"models/{title}_cm.png")
    plt.show()
    plt.close()

def save_class_report(y_true, y_pred, name):
    report = classification_report(y_true, y_pred, output_dict=True)
    with open(f"models/{name}_report.json", "w") as f:
        json.dump(report, f, indent=2)

def log_predictions(model_name, y_pred, y_true):
    with open(blockchain_path, "r+") as f:
        chain = json.load(f)
        for i, (pred, actual) in enumerate(zip(y_pred, y_true)):
            entry = {
                "model": model_name,
                "timestamp": str(datetime.utcnow()),
                "index": len(chain) + i,
                "prediction": int(pred),
                "actual": int(actual),
                "hash": hash(f"{model_name}{pred}{actual}{len(chain)+i}")
            }
            chain.append(entry)
        f.seek(0)
        json.dump(chain, f, indent=2)

# Supervised models
models = {
    "NaiveBayes": GaussianNB(),
    "SVM": SVC(probability=True, random_state=42),
    "RandomForest": RandomForestClassifier(n_estimators=100, random_state=42)
}

plt.figure(figsize=(10, 8))

for name, model in models.items():
    start = time.time()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    elapsed = time.time() - start

    acc = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {acc:.4f}, Training Time: {elapsed:.2f}s")

    cv = cross_val_score(model, X, y, cv=StratifiedKFold(5), scoring="accuracy")
    print(f"{name} 5-Fold CV Scores: {cv}")
    print(f"{name} Mean CV Score: {cv.mean():.4f} (+/- {cv.std()*2:.4f})")

    joblib.dump(model, f"models/{name}.joblib")
    save_conf_matrix(y_test, y_pred, name)
    save_class_report(y_test, y_pred, name)
    log_predictions(name, y_pred, y_test)

    fpr, tpr, _ = roc_curve(y_test, model.predict_proba(X_test)[:, 1])
    plt.plot(fpr, tpr, label=f"{name} (AUC={auc(fpr,tpr):.2f})")

# Unsupervised models
unsupervised = {
    "KMeans": KMeans(n_clusters=2, n_init=10, random_state=42),
    "DBSCAN": DBSCAN(eps=0.5, min_samples=5),
    "IsolationForest": IsolationForest(random_state=42, contamination=0.1)
}

for name, model in unsupervised.items():
    start = time.time()
    if name == "DBSCAN":
        pred = model.fit_predict(X_test)
        # DBSCAN returns -1 for outliers, 0 for inliers
        pred = np.where(pred == -1, 1, 0)
    else:
        pred = model.fit_predict(X_test)
        pred = np.where(pred == -1, 1, 0) if name == "IsolationForest" else pred

    elapsed = time.time() - start

    acc = accuracy_score(y_test, pred)
    print(f"{name} Accuracy: {acc:.4f}, Training Time: {elapsed:.2f}s")

    save_conf_matrix(y_test, pred, name)
    save_class_report(y_test, pred, name)
    log_predictions(name, pred, y_test)

# LSTM Model
X_lstm_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_lstm_test = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

lstm_model = Sequential([
    LSTM(64, input_shape=(1, X_train.shape[1])),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

start = time.time()
history = lstm_model.fit(X_lstm_train, y_train, epochs=10, batch_size=32, validation_split=0.2, verbose=0)
elapsed = time.time() - start

lstm_pred = (lstm_model.predict(X_lstm_test) > 0.5).astype(int).flatten()
print(f"LSTM Accuracy: {accuracy_score(y_test, lstm_pred):.4f}, Training Time: {elapsed:.2f}s")

lstm_model.save("models/LSTM_model.h5")
save_conf_matrix(y_test, lstm_pred, "LSTM")
save_class_report(y_test, lstm_pred, "LSTM")
log_predictions("LSTM", lstm_pred, y_test)

fpr, tpr, _ = roc_curve(y_test, lstm_model.predict(X_lstm_test).flatten())
plt.plot(fpr, tpr, label=f"LSTM (AUC={auc(fpr,tpr):.2f})")

# Ensemble model
ensemble = VotingClassifier(
    estimators=[
        ('nb', models["NaiveBayes"]),
        ('svm', models["SVM"]),
        ('rf', models["RandomForest"])
    ],
    voting='soft'
)

ensemble.fit(X_train, y_train)
y_pred_ens = ensemble.predict(X_test)
print(f"Ensemble Accuracy: {accuracy_score(y_test, y_pred_ens):.4f}")

joblib.dump(ensemble, "models/Ensemble.joblib")
save_conf_matrix(y_test, y_pred_ens, "Ensemble")
save_class_report(y_test, y_pred_ens, "Ensemble")
log_predictions("Ensemble", y_pred_ens, y_test)

fpr, tpr, _ = roc_curve(y_test, ensemble.predict_proba(X_test)[:, 1])
plt.plot(fpr, tpr, label=f"Ensemble (AUC={auc(fpr,tpr):.2f})")

# Final ROC curve plot
plt.plot([0, 1], [0, 1], 'k--', label='Random')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curves Comparison")
plt.legend()
plt.grid(True)
plt.savefig("models/roc_curves.png", dpi=300, bbox_inches='tight')
plt.show()
plt.close()


# Load and display blockchain entries
with open("models/blockchain.json", "r") as f:
    chain = json.load(f)

block_df = pd.DataFrame(chain)
print("Blockchain sample entries:")
print(block_df.tail(10))
print(f"\nTotal entries in blockchain: {len(block_df)}")

# Save feature importance
feature_importance = pd.DataFrame({
    'feature': top_features,
    'importance': mi[mi > 0.01]
}).sort_values('importance', ascending=False)

feature_importance.to_csv("models/feature_importance.csv", index=False)
print("\nFeature importance saved to models/feature_importance.csv")

Failed to retrieve file url:

	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses.
	Check FAQ in https://github.com/wkentaro/gdown?tab=readme-ov-file#faq.

You may still be able to access the file from the browser:

	https://drive.google.com/uc?id=1Ix0hhlX00pJ6UHjZ_PMlwCRDQdhO0ENq

but Gdown can't. Please check connections and permissions.
Class distribution before merge:
NSL-KDD: label
0    0.53208
1    0.46792
Name: proportion, dtype: float64
CICIDS: label
0    0.638023
1    0.361977
Name: proportion, dtype: float64
After merge: label
0    0.585005
1    0.414995
Name: proportion, dtype: float64
Feature rankings (mutual information):
src_bytes                   0.453837
dst_bytes                   0.407876
logged_in                   0.346348
dst_host_srv_serror_rate    0.335166
dst_host_srv_count          0.332055
same_srv_rate               0.318792
dst_host_serror_rate        0.301290
count       

  "timestamp": str(datetime.utcnow()),


SVM Accuracy: 0.7389, Training Time: 2479.68s
SVM 5-Fold CV Scores: [0.58519742 0.58504729 0.62596337 0.84375938 0.8441097 ]
SVM Mean CV Score: 0.6968 (+/- 0.2421)


  "timestamp": str(datetime.utcnow()),


RandomForest Accuracy: 0.9978, Training Time: 12.96s
RandomForest 5-Fold CV Scores: [0.99744783 0.99704749 0.99769793 0.99769793 0.99669703]
RandomForest Mean CV Score: 0.9973 (+/- 0.0008)


  "timestamp": str(datetime.utcnow()),


KMeans Accuracy: 0.6796, Training Time: 0.25s


  "timestamp": str(datetime.utcnow()),


DBSCAN Accuracy: 0.5707, Training Time: 11.25s


  "timestamp": str(datetime.utcnow()),


IsolationForest Accuracy: 0.5869, Training Time: 0.43s


  "timestamp": str(datetime.utcnow()),
  super().__init__(**kwargs)


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step




LSTM Accuracy: 0.8829, Training Time: 100.25s


  "timestamp": str(datetime.utcnow()),


[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step
Ensemble Accuracy: 0.9867


  "timestamp": str(datetime.utcnow()),


Blockchain sample entries:
           model                   timestamp   index  prediction  actual  \
159854  Ensemble  2025-08-27 18:32:11.974726  179827           1       1   
159855  Ensemble  2025-08-27 18:32:11.974733  179829           0       0   
159856  Ensemble  2025-08-27 18:32:11.974739  179831           1       1   
159857  Ensemble  2025-08-27 18:32:11.974746  179833           1       1   
159858  Ensemble  2025-08-27 18:32:11.974752  179835           1       1   
159859  Ensemble  2025-08-27 18:32:11.974758  179837           1       1   
159860  Ensemble  2025-08-27 18:32:11.974765  179839           0       0   
159861  Ensemble  2025-08-27 18:32:11.974771  179841           0       0   
159862  Ensemble  2025-08-27 18:32:11.974778  179843           1       1   
159863  Ensemble  2025-08-27 18:32:11.974786  179845           0       0   

                       hash  
159854  3617636844703803078  
159855 -3683492907553031627  
159856  -308788786366558306  
159857   -373464