<a href="https://colab.research.google.com/github/selam1630/cybersecurity-ml-models/blob/main/intrusion_detection_ml.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Network Intrusion Detection using Machine Learning

## 1. Import Libraries
## 2. Load Dataset
## 3. Data Cleaning & Preprocessing
## 4. Train-Test Split
## 5. Feature Scaling
## 6. Model Training
## 7. Model Evaluation
## 8. Export Models


In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

import joblib


In [None]:
# Source of the NSL-KDD training split (raw CSV hosted on GitHub).
url = "https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.csv"

# Column names are supplied by hand; because `names=` is passed to read_csv,
# pandas treats every row of the file (including the first) as data.
columns = [
    "duration","protocol_type","service","flag","src_bytes","dst_bytes",
    "land","wrong_fragment","urgent","hot","num_failed_logins","logged_in",
    "num_compromised","root_shell","su_attempted","num_root","num_file_creations",
    "num_shells","num_access_files","num_outbound_cmds","is_host_login",
    "is_guest_login","count","srv_count","serror_rate","srv_serror_rate",
    "rerror_rate","srv_rerror_rate","same_srv_rate","diff_srv_rate",
    "srv_diff_host_rate","dst_host_count","dst_host_srv_count",
    "dst_host_same_srv_rate","dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate","dst_host_srv_diff_host_rate",
    "dst_host_serror_rate","dst_host_srv_serror_rate",
    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label","difficulty"
]

df = pd.read_csv(url, names=columns)

# Only the last expression of a cell is rendered automatically, so a bare
# `df.head()` placed before `df.shape` would be evaluated and thrown away.
# `display` (provided by IPython/Colab) shows the preview explicitly.
display(df.head())
df.shape



(125973, 43)

In [None]:
# Convert multi-class labels into binary
# 0 = Normal traffic, 1 = Attack
#
# The conversion is guarded so the cell is safe to re-run: once the column
# holds integers, comparing against "normal" again would mark every row
# (including the 0s) as an attack.
if not pd.api.types.is_numeric_dtype(df["label"]):
    # Vectorised equivalent of: 0 if x == "normal" else 1
    df["label"] = (df["label"] != "normal").astype("int64")

df["label"].value_counts()


Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,67343
1,58630


In [None]:
# String-valued features that must be turned into integer codes.
categorical_cols = ["protocol_type", "service", "flag"]

# One fitted encoder per column, keyed by column name, so the same
# category -> integer mapping can be looked up later.
# NOTE: run this cell once on the raw frame; re-running it would refit the
# encoders on the already-encoded integer columns.
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder().fit(df[col])
    label_encoders[col] = le
    df[col] = le.transform(df[col])


In [None]:
# Remove the `difficulty` column so it is not used as a model feature.
# Non-in-place drop with errors="ignore" keeps the cell re-runnable: a second
# execution is a no-op instead of raising KeyError on the missing column.
df = df.drop(columns=["difficulty"], errors="ignore")


In [None]:
# Print column dtypes and non-null counts for the preprocessed frame.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125973 entries, 0 to 125972
Data columns (total 42 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   duration                     125973 non-null  int64  
 1   protocol_type                125973 non-null  int64  
 2   service                      125973 non-null  int64  
 3   flag                         125973 non-null  int64  
 4   src_bytes                    125973 non-null  int64  
 5   dst_bytes                    125973 non-null  int64  
 6   land                         125973 non-null  int64  
 7   wrong_fragment               125973 non-null  int64  
 8   urgent                       125973 non-null  int64  
 9   hot                          125973 non-null  int64  
 10  num_failed_logins            125973 non-null  int64  
 11  logged_in                    125973 non-null  int64  
 12  num_compromised              125973 non-null  int64  
 13 

In [None]:
# Target vector: the `label` column; feature matrix: every other column.
y = df["label"]
X = df.drop(columns=["label"])


In [None]:
# Hold out 20% of the rows for testing; stratify on y so both splits keep
# the same class proportions, and fix the seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)


In [None]:
# Report (rows, columns) of each feature split.
for caption, split_frame in (
    ("Training samples:", X_train),
    ("Testing samples:", X_test),
):
    print(caption, split_frame.shape)


Training samples: (100778, 41)
Testing samples: (25195, 41)


In [None]:
# Create the (still unfitted) standardiser; it is fitted on the training split below.
scaler = StandardScaler()


In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no test-set statistics influence the scaling.
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Sanity check: per-feature means of the scaled training data (first five columns).
np.mean(X_train_scaled, axis=0)[:5]


array([ 1.12104125e-17,  2.38591421e-16,  5.23152583e-17, -3.29966858e-17,
        5.46419477e-19])

In [None]:
# Logistic Regression model
# max_iter is raised to 1000 and the seed is fixed for reproducibility.
logistic_model = LogisticRegression(random_state=42, max_iter=1000)

# Train on the standardised training features.
logistic_model.fit(X_train_scaled, y_train)


In [None]:
# Decision Tree model
# Depth is capped at 10 levels and the seed is fixed for reproducibility.
decision_tree_model = DecisionTreeClassifier(random_state=42, max_depth=10)

# Trained on the same standardised features as the logistic model.
decision_tree_model.fit(X_train_scaled, y_train)


In [None]:
# Logistic Regression Evaluation
# Predict on the held-out split, then compute the metrics before printing them.
y_pred_lr = logistic_model.predict(X_test_scaled)

lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)

print("Logistic Regression Accuracy:", lr_accuracy)
print("\nClassification Report:\n", lr_report)


Logistic Regression Accuracy: 0.9519745981345505

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.96      0.96     13469
           1       0.96      0.94      0.95     11726

    accuracy                           0.95     25195
   macro avg       0.95      0.95      0.95     25195
weighted avg       0.95      0.95      0.95     25195



In [None]:
# Decision Tree Evaluation
# Predict on the held-out split, then compute the metrics before printing them.
y_pred_dt = decision_tree_model.predict(X_test_scaled)

dt_accuracy = accuracy_score(y_test, y_pred_dt)
dt_report = classification_report(y_test, y_pred_dt)

print("Decision Tree Accuracy:", dt_accuracy)
print("\nClassification Report:\n", dt_report)


Decision Tree Accuracy: 0.9975788846993451

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     13469
           1       1.00      1.00      1.00     11726

    accuracy                           1.00     25195
   macro avg       1.00      1.00      1.00     25195
weighted avg       1.00      1.00      1.00     25195



In [None]:
# Serialise the trained logistic-regression model to the working directory.
joblib.dump(logistic_model, "logistic_intrusion_model.pkl")


['logistic_intrusion_model.pkl']

In [None]:
# Serialise the trained decision-tree model to the working directory.
joblib.dump(decision_tree_model, "decision_tree_intrusion_model.pkl")


['decision_tree_intrusion_model.pkl']

In [None]:
# Persist the fitted categorical encoders as well: the exported models were
# trained on integer-encoded `protocol_type` / `service` / `flag` values, so
# the same mappings are needed to preprocess new data before prediction.
# The file is written next to scaler.pkl; add it to the download step if it
# is needed locally.
joblib.dump(label_encoders, "label_encoders.pkl")

# Persist the fitted scaler (kept last so the cell output is unchanged).
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']

In [None]:
# List the working directory to confirm the exported .pkl files were written.
!ls


decision_tree_intrusion_model.pkl  sample_data
logistic_intrusion_model.pkl	   scaler.pkl


In [None]:
# Colab-only helper: triggers a browser download for each exported artifact.
from google.colab import files

for artifact in (
    "logistic_intrusion_model.pkl",
    "decision_tree_intrusion_model.pkl",
    "scaler.pkl",
):
    files.download(artifact)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# NOTE(review): this repeats the import and the decision-tree download already
# performed in the previous download cell — likely a leftover retry; consider removing.
from google.colab import files

files.download("decision_tree_intrusion_model.pkl")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>