# 1. Libraries

In [251]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report

# 2. Load Test Data

In [252]:
train_df = pd.read_csv('KDDTrain+.txt', )

# 3. Data Pre-Processing

## 3.1. Adjust column

In [253]:
columns = (['duration'
,'protocol_type'
,'service'
,'flag'
,'src_bytes'
,'dst_bytes'
,'land'
,'wrong_fragment'
,'urgent'
,'hot'
,'num_failed_logins'
,'logged_in'
,'num_compromised'
,'root_shell'
,'su_attempted'
,'num_root'
,'num_file_creations'
,'num_shells'
,'num_access_files'
,'num_outbound_cmds'
,'is_host_login'
,'is_guest_login'
,'count','srv_count'
,'serror_rate'
,'srv_serror_rate'
,'rerror_rate'
,'srv_rerror_rate'
,'same_srv_rate'
,'diff_srv_rate'
,'srv_diff_host_rate'
,'dst_host_count'
,'dst_host_srv_count'
,'dst_host_same_srv_rate'
,'dst_host_diff_srv_rate'
,'dst_host_same_src_port_rate'
,'dst_host_srv_diff_host_rate'
,'dst_host_serror_rate'
,'dst_host_srv_serror_rate'
,'dst_host_rerror_rate'
,'dst_host_srv_rerror_rate'
,'attack'
,'level'])

train_df.columns = columns

In [254]:
train_df["attack"].value_counts()

attack
normal             67342
neptune            41214
satan               3633
ipsweep             3599
portsweep           2931
smurf               2646
nmap                1493
back                 956
teardrop             892
warezclient          890
pod                  201
guess_passwd          53
buffer_overflow       30
warezmaster           20
land                  18
imap                  11
rootkit               10
loadmodule             9
ftp_write              8
multihop               7
phf                    4
perl                   3
spy                    2
Name: count, dtype: int64

## 3.2. Classify Attack Column as Normal & attack

In [255]:
attack_n = []
for i in train_df.attack :
  if i == 'normal':
    attack_n.append("normal")
  else:
    attack_n.append("attack")
train_df['attack'] = attack_n 

## 3.3. Preprocess categorical features

In [256]:
protocol_type_le = preprocessing.LabelEncoder()
service_le = preprocessing.LabelEncoder()
flag_le = preprocessing.LabelEncoder()

## 3.4. Convert Categorical Data into Numerical Data

In [257]:
train_df['protocol_type'] = protocol_type_le.fit_transform(train_df['protocol_type'])
train_df['service'] = service_le.fit_transform(train_df['service'])
train_df['flag'] = flag_le.fit_transform(train_df['flag'])

## 3.5. Define features and labels

In [258]:
X_train = train_df.drop(columns=['attack'])
Y_train = train_df['attack']

### 3.5.1. Check how many data points are labeled as "attack" and "normal"

In [259]:
attack_counts = train_df['attack'].value_counts()
print("Attack counts:\n", attack_counts)

Attack counts:
 attack
normal    67342
attack    58630
Name: count, dtype: int64


# 4. Model Training

## 4.1. Standardizing the Features

In [260]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

## 4.2. Creating the Gradient Boosting Model

In [261]:
model = GradientBoostingClassifier(random_state=40)

## 4.3. Model Training

In [262]:
model.fit(X_train, Y_train)

## 4.4. Save the Model and Scaler

In [263]:
joblib.dump(model, 'gbm_model.pkl')
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']

## 4.5. Evaluating the Model

In [264]:
Y_pred = model.predict(X_train)

accuracy = accuracy_score(Y_train, Y_pred)
report = classification_report(Y_train, Y_pred)

print(f"Accuracy: {accuracy:.4f}")
print("Classification Report:")
print(report)

Accuracy: 0.9980
Classification Report:
              precision    recall  f1-score   support

      attack       1.00      1.00      1.00     58630
      normal       1.00      1.00      1.00     67342

    accuracy                           1.00    125972
   macro avg       1.00      1.00      1.00    125972
weighted avg       1.00      1.00      1.00    125972

