In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
dtypes = {
    'ip.src': 'object',
    'ip.dst': 'object',
    'ip.proto': 'int64',
    'frame.len': 'int64',
    'tcp.srcport': 'int64',
    'tcp.dstport': 'int64', 
    'tcp.seq': 'int64',
    'tcp.ack': 'int64', 
    'Bytes': 'int64',  
    'Label': 'object'
}

In [3]:
nRowsRead = None # specify 'None' if want to read whole file

dataset = pd.read_csv(
    '../data/DDoS-Dataset.csv',
    dtype=dtypes,
    engine='c',
    low_memory=True,
    delimiter=',', 
    nrows = nRowsRead
)

dataset.dataframeName = 'DDoS-Dataset.csv'

nRow, nCol = dataset.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 151200 rows and 23 columns


In [4]:
ddrop = ['tcp.flags.syn', 'tcp.flags.reset', 'tcp.flags.push', 'tcp.flags.ack', 'ip.flags.mf', 'ip.flags.df', 'ip.flags.rb', 'frame.time', 'Packets', 'Tx Packets', 'Tx Bytes', 'Rx Packets', 'Rx Bytes' ]
dataset.drop(ddrop, inplace=True, axis=1)

# rename Bytes with tcp.len
dataset.rename(columns={'Bytes': 'tcp.len'}, inplace=True)

dataset.head(5) 

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,ip.proto,frame.len,tcp.seq,tcp.ack,tcp.len,Label
0,192.168.1.1,192.168.23.2,2412,8000,6,54,1,1,432,DDoS-PSH-ACK
1,192.168.1.1,192.168.23.2,2413,8000,6,54,1,1,540,DDoS-PSH-ACK
2,192.168.1.1,192.168.23.2,2414,8000,6,54,1,1,648,DDoS-PSH-ACK
3,192.168.1.1,192.168.23.2,2415,8000,6,54,1,1,540,DDoS-PSH-ACK
4,192.168.1.1,192.168.23.2,2416,8000,6,54,1,1,324,DDoS-PSH-ACK


In [5]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 10 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   ip.src       151200 non-null  object
 1   ip.dst       151200 non-null  object
 2   tcp.srcport  151200 non-null  int64 
 3   tcp.dstport  151200 non-null  int64 
 4   ip.proto     151200 non-null  int64 
 5   frame.len    151200 non-null  int64 
 6   tcp.seq      151200 non-null  int64 
 7   tcp.ack      151200 non-null  int64 
 8   tcp.len      151200 non-null  int64 
 9   Label        151200 non-null  object
dtypes: int64(7), object(3)
memory usage: 11.5+ MB


In [6]:
def ip_to_int(ip):
    parts = ip.split('.')
    return int(''.join([part.zfill(3) for part in parts]))

# # Apply the custom function to the "ip_address" column
dataset['ip.src'] = dataset['ip.src'].apply(ip_to_int)
dataset['ip.dst'] = dataset['ip.dst'].apply(ip_to_int)

In [7]:
# replace 1 and 0 for ddos or not
def label_update(label):
    if label == 'DDoS-PSH-ACK' or label == 'DDoS-ACK':
        return 1
    elif label == 'Benign':
        return 0

dataset['Label'] = dataset['Label'].apply(label_update)

In [8]:
dataset.describe(include='all')

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,ip.proto,frame.len,tcp.seq,tcp.ack,tcp.len,Label
count,151200.0,151200.0,151200.0,151200.0,151200.0,151200.0,151200.0,151200.0,151200.0,151200.0
mean,192168000000.0,192168000000.0,27376.943247,8000.0,6.0,99.025126,1.0,1.0,799.422937,0.5
std,6020.817,0.0,19634.879546,0.0,0.0,71.226673,0.0,0.0,373.759057,0.500002
min,192168000000.0,192168000000.0,1302.0,8000.0,6.0,54.0,1.0,1.0,54.0,0.0
25%,192168000000.0,192168000000.0,8621.0,8000.0,6.0,54.0,1.0,1.0,432.0,0.0
50%,192168000000.0,192168000000.0,26263.0,8000.0,6.0,60.0,1.0,1.0,1101.5,0.5
75%,192168000000.0,192168000000.0,45566.0,8000.0,6.0,105.0,1.0,1.0,1146.0,1.0
max,192168000000.0,192168000000.0,60998.0,8000.0,6.0,223.0,1.0,1.0,1229.0,1.0


In [9]:
# search for missing value
#dataset.isna().sum()

X = dataset.drop('Label', axis=1).values
y = dataset['Label'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=32)

In [10]:
# readymade model
from sklearn.ensemble import GradientBoostingClassifier

# Create a Gradient Boosting classifier object
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.2, max_depth=4)

In [11]:
# Train the classifier on the training data
gb_classifier.fit(X_train, y_train)

In [12]:
from sklearn.metrics import accuracy_score

# Predict the labels of the test data
y_pred = gb_classifier.predict(X_test)

# Evaluate the accuracy of the classifier
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

Accuracy: 1.0


In [13]:
# save the model
import pickle

filename = './done/ddos_model.pkl'
pickle.dump(gb_classifier, open(filename, 'wb'))

In [14]:
import pandas as pd
import numpy as np
import pickle

loaded_model = pickle.load(open('./done/ddos_model.pkl', 'rb'))
# result = loaded_model.score(X_test, y_test)
# print(result)

In [15]:
# realtime attack prediction
saved_data = pd.read_csv('../data/prddata.csv')

saved_data.head(5)

Unnamed: 0,ip.src,ip.dst,ip.proto,frame.len,tcp.srcport,tcp.dstport,tcp.seq,tcp.ack,tcp.len
0,140.82.112.26,192.168.1.5,6,92,443,35398,1,1,26
1,192.168.1.5,140.82.112.26,6,96,35398,443,1,27,30
2,140.82.112.26,192.168.1.5,6,66,443,35398,27,31,0
3,192.168.59.110,23.57.12.251,6,66,40184,443,1,1,0
4,23.57.12.251,192.168.59.110,6,97,443,40184,1,2,31


In [16]:
def ip_to_int(ip):
    parts = ip.split('.')
    return int(''.join([part.zfill(3) for part in parts]))

# # Apply the custom function to the "ip_address" column
saved_data['ip.src'] = saved_data['ip.src'].apply(ip_to_int)
saved_data['ip.dst'] = saved_data['ip.dst'].apply(ip_to_int)

saved_data.head(5)

Unnamed: 0,ip.src,ip.dst,ip.proto,frame.len,tcp.srcport,tcp.dstport,tcp.seq,tcp.ack,tcp.len
0,140082112026,192168001005,6,92,443,35398,1,1,26
1,192168001005,140082112026,6,96,35398,443,1,27,30
2,140082112026,192168001005,6,66,443,35398,27,31,0
3,192168059110,23057012251,6,66,40184,443,1,1,0
4,23057012251,192168059110,6,97,443,40184,1,2,31


In [17]:
# pred = loaded_model.predict(saved_data)

#X = np.delete(X, (0), axis=0)

pred = loaded_model.predict(X)

print("\n\nPredicted values:\n")
d=0
n=0
for i in pred:
    if i==1:
        d+=1
    elif i==0:
        n+=1
        
print(f"Predicted DDoS attack: {d}")
print(f"Predicted noraml call: {n}")



Predicted values:

Predicted DDoS attack: 75600
Predicted noraml call: 75600
