## Importing All The Necessary Stuff

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Loading The Data to prepare a Model

In [17]:
# Load the data from CSV file
# df = pd.read_csv(r"D:\Stuff\CyberSec\archive\02-14-2018.csv")
df = pd.read_csv(r"D:\Stuff\CyberSec\MachineLearningCSV\MachineLearningCVE\Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv")

# Remove any rows with missing values
df = df.dropna()

# Drop columns where all values are 0
df = df.loc[:, (df != 0).any(axis=0)]

columns = list(df.columns)

### Display the Imported Model

In [34]:
# display the dataframe
df.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


### Printing all the Features available in our Model

In [40]:
# Print column names with their respective column numbers
for i, col_name in enumerate(df.columns):
    print(f"Feature {i+1}:\t\"{col_name}\"")

Feature 1:	" Destination Port"
Feature 2:	" Flow Duration"
Feature 3:	" Total Fwd Packets"
Feature 4:	" Total Backward Packets"
Feature 5:	"Total Length of Fwd Packets"
Feature 6:	" Total Length of Bwd Packets"
Feature 7:	" Fwd Packet Length Max"
Feature 8:	" Fwd Packet Length Min"
Feature 9:	" Fwd Packet Length Mean"
Feature 10:	" Fwd Packet Length Std"
Feature 11:	"Bwd Packet Length Max"
Feature 12:	" Bwd Packet Length Min"
Feature 13:	" Bwd Packet Length Mean"
Feature 14:	" Bwd Packet Length Std"
Feature 15:	"Flow Bytes/s"
Feature 16:	" Flow Packets/s"
Feature 17:	" Flow IAT Mean"
Feature 18:	" Flow IAT Std"
Feature 19:	" Flow IAT Max"
Feature 20:	" Flow IAT Min"
Feature 21:	"Fwd IAT Total"
Feature 22:	" Fwd IAT Mean"
Feature 23:	" Fwd IAT Std"
Feature 24:	" Fwd IAT Max"
Feature 25:	" Fwd IAT Min"
Feature 26:	"Bwd IAT Total"
Feature 27:	" Bwd IAT Mean"
Feature 28:	" Bwd IAT Std"
Feature 29:	" Bwd IAT Max"
Feature 30:	" Bwd IAT Min"
Feature 31:	"Fwd PSH Flags"
Feature 32:	" Fwd Heade

## Preparing the Classification model using Decision Tree Classifier

### Training and Obtaining accuracy for a range of features individually

In [27]:
# to display individual features with a given range
startC = 1
endC = 69
for start_col in range(startC-1, endC, 1):
    end_col = start_col + 1
    try:
        X = df.iloc[:, start_col:end_col].values
        y = df.iloc[:, -1].values

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Train the decision tree classifier
        clf = DecisionTreeClassifier()
        clf.fit(X_train, y_train)

        # Test the classifier
        accuracy = clf.score(X_test, y_test)
        print(f"Accuracy for column '{columns[end_col-1]}' : {accuracy}")
    except:
        print(f"'{columns[end_col-1]}' isn't a feature column")


Accuracy for column ' Destination Port' : 0.9603756450862699
Accuracy for column ' Flow Duration' : 0.7671487740592261
Accuracy for column ' Total Fwd Packets' : 0.9091674234202308
Accuracy for column ' Total Backward Packets' : 0.8554342288865756
Accuracy for column 'Total Length of Fwd Packets' : 0.98974506633591
Accuracy for column ' Total Length of Bwd Packets' : 0.9407738820350395
Accuracy for column ' Fwd Packet Length Max' : 0.8947042016434472
Accuracy for column ' Fwd Packet Length Min' : 0.7287204589248931
Accuracy for column ' Fwd Packet Length Mean' : 0.89908968083457
Accuracy for column ' Fwd Packet Length Std' : 0.79374958470841
Accuracy for column 'Bwd Packet Length Max' : 0.936255509535095
Accuracy for column ' Bwd Packet Length Min' : 0.8235841325389267
Accuracy for column ' Bwd Packet Length Mean' : 0.9407295842654322
Accuracy for column ' Bwd Packet Length Std' : 0.7938603291324282
'Flow Bytes/s' isn't a feature column
' Flow Packets/s' isn't a feature column
Accuracy

### Training and Obtaining accuracy for a group of features together

In [30]:
# Select the 6th column as the feature and the last column as the label
# example:
# from 4th col to 16th col
bCol = 1
eCol = 14
X = df.iloc[:, bCol-1:eCol].values
# X = df.iloc[:, 3:4].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the decision tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Test the classifier
accuracy = clf.score(X_test, y_test)

selected_cols = df.iloc[:,bCol-1:eCol]

print("Accuracy for the following features combined",selected_cols.columns.tolist(),"is: ",accuracy)


Accuracy for the following features combined [' Destination Port', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', 'Total Length of Fwd Packets', ' Total Length of Bwd Packets', ' Fwd Packet Length Max', ' Fwd Packet Length Min', ' Fwd Packet Length Mean', ' Fwd Packet Length Std', 'Bwd Packet Length Max', ' Bwd Packet Length Min', ' Bwd Packet Length Mean', ' Bwd Packet Length Std'] is:  0.999623468958338


### Obtaining group accuracy with Feature names as Input

In [36]:
# specify column names to select
selected_cols = [' SYN Flag Count',' RST Flag Count']

# select columns by name using loc
X = df.loc[:, selected_cols].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the decision tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Test the classifier
accuracy = clf.score(X_test, y_test)


print("Accuracy for the following features combined",selected_cols,"is: ",accuracy)


Accuracy for the following features combined [' SYN Flag Count', ' RST Flag Count'] is:  0.6034242175906444


### Obtaining individual accuracy with Feature names as Input

In [41]:
# Define the list of column names to iterate over
col_names = [' SYN Flag Count',' RST Flag Count']

for col_name in col_names:
    try:
        X = df[[col_name]].values
        y = df.iloc[:, -1].values

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Train the decision tree classifier
        clf = DecisionTreeClassifier()
        clf.fit(X_train, y_train)

        # Test the classifier
        accuracy = clf.score(X_test, y_test)
        print(f"Accuracy for column '{col_name}' : {accuracy}")
    except:
        print(f"'{col_name}' isn't a feature column")


Accuracy for column ' SYN Flag Count' : 0.6033134731666261
Accuracy for column ' RST Flag Count' : 0.5700458481915436


### Obtaining individual accuracy with Feature numbers as Input

In [45]:
cols = [42,43] # Example list of column indices to test
for start_col in cols:
    start_col-=1
    end_col = start_col + 1
    try:
        X = df.iloc[:, start_col:end_col].values
        y = df.iloc[:, -1].values

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Train the decision tree classifier
        clf = DecisionTreeClassifier()
        clf.fit(X_train, y_train)

        # Test the classifier
        accuracy = clf.score(X_test, y_test)
        print(f"Accuracy for column '{columns[end_col-1]}' : {accuracy}")
    except:
        print(f"'{columns[end_col-1]}' isn't a feature column")


Accuracy for column ' SYN Flag Count' : 0.6033134731666261
Accuracy for column ' RST Flag Count' : 0.5700458481915436


### Obtaining group accuracy with Feature numbers as Input

In [49]:
# specify column indexes to select
selected_cols_idx = [42, 43]

selected_cols_idx = [x - 1 for x in selected_cols_idx]

# select columns by index using iloc
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the decision tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Test the classifier
accuracy = clf.score(X_test, y_test)

# Get the names of the selected columns
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)


Accuracy for the following features combined [' SYN Flag Count', ' RST Flag Count'] is: 0.6034242175906444
