In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('APA-DDoS-Dataset.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 23 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   ip.src           151200 non-null  object
 1   ip.dst           151200 non-null  object
 2   tcp.srcport      151200 non-null  int64 
 3   tcp.dstport      151200 non-null  int64 
 4   ip.proto         151200 non-null  int64 
 5   frame.len        151200 non-null  int64 
 6   tcp.flags.syn    151200 non-null  int64 
 7   tcp.flags.reset  151200 non-null  int64 
 8   tcp.flags.push   151200 non-null  int64 
 9   tcp.flags.ack    151200 non-null  int64 
 10  ip.flags.mf      151200 non-null  int64 
 11  ip.flags.df      151200 non-null  int64 
 12  ip.flags.rb      151200 non-null  int64 
 13  tcp.seq          151200 non-null  int64 
 14  tcp.ack          151200 non-null  int64 
 15  frame.time       151200 non-null  object
 16  Packets          151200 non-null  int64 
 17  Bytes     

In [3]:
mydf=data[['ip.src','ip.dst']]
mydf.to_csv('fuel_x_sample.csv',index=False)
mydf

Unnamed: 0,ip.src,ip.dst
0,192.168.1.1,192.168.23.2
1,192.168.1.1,192.168.23.2
2,192.168.1.1,192.168.23.2
3,192.168.1.1,192.168.23.2
4,192.168.1.1,192.168.23.2
...,...,...
151195,192.168.19.1,192.168.23.2
151196,192.168.19.1,192.168.23.2
151197,192.168.19.1,192.168.23.2
151198,192.168.19.1,192.168.23.2


In [2]:
# Step 2: Data Cleaning and Preprocessing
# Drop any rows with missing values
data = data.dropna()

# Convert non-numeric columns to numeric using LabelEncoder
label_encoder = LabelEncoder()
data['ip.src'] = label_encoder.fit_transform(data['ip.src'])
data['ip.dst'] = label_encoder.fit_transform(data['ip.dst'])
data['Label'] = label_encoder.fit_transform(data['Label'])

# Drop the 'time' column
data = data.drop('frame.time', axis=1)

# Drop any other columns that are not suitable for KNN classification (if needed)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151200 entries, 0 to 151199
Data columns (total 22 columns):
 #   Column           Non-Null Count   Dtype
---  ------           --------------   -----
 0   ip.src           151200 non-null  int32
 1   ip.dst           151200 non-null  int32
 2   tcp.srcport      151200 non-null  int64
 3   tcp.dstport      151200 non-null  int64
 4   ip.proto         151200 non-null  int64
 5   frame.len        151200 non-null  int64
 6   tcp.flags.syn    151200 non-null  int64
 7   tcp.flags.reset  151200 non-null  int64
 8   tcp.flags.push   151200 non-null  int64
 9   tcp.flags.ack    151200 non-null  int64
 10  ip.flags.mf      151200 non-null  int64
 11  ip.flags.df      151200 non-null  int64
 12  ip.flags.rb      151200 non-null  int64
 13  tcp.seq          151200 non-null  int64
 14  tcp.ack          151200 non-null  int64
 15  Packets          151200 non-null  int64
 16  Bytes            151200 non-null  int64
 17  Tx Packets       151200 non-n

In [7]:
data.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,ip.proto,frame.len,tcp.flags.syn,tcp.flags.reset,tcp.flags.push,tcp.flags.ack,...,ip.flags.rb,tcp.seq,tcp.ack,Packets,Bytes,Tx Packets,Tx Bytes,Rx Packets,Rx Bytes,Label
0,0,0,2412,8000,6,54,0,0,1,1,...,0,1,1,8,432,4,216,4,216,2
1,0,0,2413,8000,6,54,0,0,1,1,...,0,1,1,10,540,5,270,5,270,2
2,0,0,2414,8000,6,54,0,0,1,1,...,0,1,1,12,648,6,324,6,324,2
3,0,0,2415,8000,6,54,0,0,1,1,...,0,1,1,10,540,5,270,5,270,2
4,0,0,2416,8000,6,54,0,0,1,1,...,0,1,1,6,324,3,162,3,162,2


In [3]:
# Step 4: K-Nearest Neighbors (KNN) Classification
# Separate features and target variable
X = data.drop('Label', axis=1)
y = data['Label']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN classifier
k_value = 5  # You can set the desired value for k here
knn_classifier = KNeighborsClassifier(n_neighbors=k_value)
X_test.head()

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,ip.proto,frame.len,tcp.flags.syn,tcp.flags.reset,tcp.flags.push,tcp.flags.ack,...,ip.flags.df,ip.flags.rb,tcp.seq,tcp.ack,Packets,Bytes,Tx Packets,Tx Bytes,Rx Packets,Rx Bytes
90870,12,0,16271,8000,6,54,0,0,0,1,...,0,0,1,1,6,324,3,162,3,162
118547,8,0,52332,8000,6,66,0,0,0,1,...,1,0,1,1,10,1144,6,560,4,584
43005,8,0,41648,8000,6,222,0,0,1,1,...,1,0,1,1,10,1168,6,560,4,608
72756,7,0,49114,8000,6,222,0,0,1,1,...,1,0,1,1,10,1168,6,560,4,608
98003,4,0,3200,8000,6,54,0,0,0,1,...,0,0,1,1,12,648,6,324,6,324


In [4]:
# Train the classifier
knn_classifier.fit(X_train, y_train)

# import joblib
# filename = 'knn_classifier_model.pkl'
# joblib.dump(knn_classifier, filename)
# print(f"Model saved as {filename}")

In [5]:
X_test

Unnamed: 0,ip.src,ip.dst,tcp.srcport,tcp.dstport,ip.proto,frame.len,tcp.flags.syn,tcp.flags.reset,tcp.flags.push,tcp.flags.ack,...,ip.flags.df,ip.flags.rb,tcp.seq,tcp.ack,Packets,Bytes,Tx Packets,Tx Bytes,Rx Packets,Rx Bytes
90870,12,0,16271,8000,6,54,0,0,0,1,...,0,0,1,1,6,324,3,162,3,162
118547,8,0,52332,8000,6,66,0,0,0,1,...,1,0,1,1,10,1144,6,560,4,584
43005,8,0,41648,8000,6,222,0,0,1,1,...,1,0,1,1,10,1168,6,560,4,608
72756,7,0,49114,8000,6,222,0,0,1,1,...,1,0,1,1,10,1168,6,560,4,608
98003,4,0,3200,8000,6,54,0,0,0,1,...,0,0,1,1,12,648,6,324,6,324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85458,10,0,11282,8000,6,54,0,0,0,1,...,0,0,1,1,6,324,3,162,3,162
81702,10,0,4278,8000,6,54,0,0,0,1,...,0,0,1,1,6,324,3,162,3,162
13738,12,0,9677,8000,6,54,0,0,1,1,...,0,0,1,1,10,540,5,270,5,270
66967,1,0,56364,8000,6,222,0,0,1,1,...,1,0,1,1,10,1175,6,560,4,615


In [6]:
# Make predictions on the test set
y_pred = knn_classifier.predict(X_test)

# Evaluate the performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

AttributeError: 'Flags' object has no attribute 'c_contiguous'