## Importing All The Necessary Stuff

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## Loading The Data to prepare a Model

In [32]:
# Read the flow records that every later cell works on.
# An alternative input file is kept here for reference:
#   df = pd.read_csv(r"D:\Stuff\CyberSec\archive\03-02-2018.csv")
csv_path = r"D:\Stuff\CyberSec\MachineLearningCSV\MachineLearningCVE\Wednesday-workingHours.pcap_ISCX.csv"
df = pd.read_csv(csv_path)

# Optional clean-up steps, currently disabled:
#   df = df.dropna()                        # drop rows with missing values
#   df = df.loc[:, (df != 0).any(axis=0)]   # drop columns whose values are all 0

# Snapshot of the column labels; later cells index into this list to print
# feature names. Labels are taken verbatim from the CSV header, so any
# leading whitespace in a header field is part of the label.
columns = df.columns.tolist()

### Display the Imported Data

In [33]:
# Preview the first five rows of the loaded dataframe
df.head(n=5)

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,80,38308,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
1,389,479,11,5,172,326,79,0,15.636364,31.449238,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
2,88,1095,10,6,3150,3150,1575,0,315.0,632.561635,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
3,389,15206,17,12,3452,6660,1313,0,203.058823,425.778474,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN
4,88,1092,9,6,3150,3152,1575,0,350.0,694.509719,...,32,0.0,0.0,0,0,0.0,0.0,0,0,BENIGN


### Printing all the Features available in our Dataset

In [4]:
# List every column label next to its 1-based feature number.
# The label is wrapped in quotes so leading/trailing whitespace is visible.
for feature_no, label in enumerate(df.columns, start=1):
    print(f"Feature {feature_no}:\t\"{label}\"")

Feature 1:	" Destination Port"
Feature 2:	" Flow Duration"
Feature 3:	" Total Fwd Packets"
Feature 4:	" Total Backward Packets"
Feature 5:	"Total Length of Fwd Packets"
Feature 6:	" Total Length of Bwd Packets"
Feature 7:	" Fwd Packet Length Max"
Feature 8:	" Fwd Packet Length Min"
Feature 9:	" Fwd Packet Length Mean"
Feature 10:	" Fwd Packet Length Std"
Feature 11:	"Bwd Packet Length Max"
Feature 12:	" Bwd Packet Length Min"
Feature 13:	" Bwd Packet Length Mean"
Feature 14:	" Bwd Packet Length Std"
Feature 15:	"Flow Bytes/s"
Feature 16:	" Flow Packets/s"
Feature 17:	" Flow IAT Mean"
Feature 18:	" Flow IAT Std"
Feature 19:	" Flow IAT Max"
Feature 20:	" Flow IAT Min"
Feature 21:	"Fwd IAT Total"
Feature 22:	" Fwd IAT Mean"
Feature 23:	" Fwd IAT Std"
Feature 24:	" Fwd IAT Max"
Feature 25:	" Fwd IAT Min"
Feature 26:	"Bwd IAT Total"
Feature 27:	" Bwd IAT Mean"
Feature 28:	" Bwd IAT Std"
Feature 29:	" Bwd IAT Max"
Feature 30:	" Bwd IAT Min"
Feature 31:	"Fwd PSH Flags"
Feature 32:	" Bwd PSH F

## Preparing the Classification model using Decision Tree Classifier

### Training and Obtaining accuracy for a **`range`** of features **`individually`**

In [5]:
# Score each feature in a range on its own: one decision tree is fitted per
# column and its hold-out accuracy is printed.
# startC and endC are 1-based, inclusive column numbers.
startC = 1
endC = 10

# The label is always the last column, so extract it once rather than on
# every pass through the loop.
y = df.iloc[:, -1].values
n_columns = df.shape[1]

for start_col in range(startC - 1, endC):
    end_col = start_col + 1

    # Guard the index explicitly: an out-of-range slice would otherwise yield
    # an empty feature matrix and a confusing error further down.
    if not 0 <= start_col < n_columns:
        print(f"Feature number {end_col} is out of range (valid: 1-{n_columns})")
        continue

    # Read the label from the dataframe itself so it cannot drift out of sync
    # with a previously captured list of column names.
    col_label = df.columns[start_col]

    try:
        X = df.iloc[:, start_col:end_col].values

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Train the decision tree classifier; the seed keeps the fitted tree
        # reproducible between runs.
        clf = DecisionTreeClassifier(random_state=42)
        clf.fit(X_train, y_train)

        # Test the classifier
        accuracy = clf.score(X_test, y_test)
        print(f"Accuracy for column '{col_label}' : {accuracy}")
    except (ValueError, TypeError) as exc:
        # Only data problems (e.g. text, NaN or infinite values) are treated as
        # "not a feature"; anything else, including a keyboard interrupt,
        # propagates instead of being silently swallowed.
        print(f"'{col_label}' isn't a feature column ({exc})")


Accuracy for column 'Dst Port' : 0.9956549587961989
Accuracy for column 'Protocol' : 0.7262337240205732
'Timestamp' isn't a feature column
Accuracy for column 'Flow Duration' : 0.9491596718991989
Accuracy for column 'Tot Fwd Pkts' : 0.7796026844611001
Accuracy for column 'Tot Bwd Pkts' : 0.8112177587426627
Accuracy for column 'TotLen Fwd Pkts' : 0.861603984627445
Accuracy for column 'TotLen Bwd Pkts' : 0.8618429857937707
Accuracy for column 'Fwd Pkt Len Max' : 0.8614319037876905
Accuracy for column 'Fwd Pkt Len Min' : 0.7262337240205732


### Training and Obtaining accuracy for a **`group`** of features **`together`**

In [6]:
# Evaluate a contiguous block of features together.
# bCol and eCol are 1-based, inclusive column numbers; for example
# bCol = 4, eCol = 10 selects the 4th through the 10th column.
bCol = 4
eCol = 10

# Slice once and reuse the result for both the feature matrix and the
# feature names printed at the end.
selected_cols = df.iloc[:, bCol-1:eCol]
X = selected_cols.values
# The label is the last column of the dataframe.
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the decision tree classifier. With several features, ties between
# equally good splits are broken randomly, so the seed is fixed to make the
# reported accuracy reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Test the classifier
accuracy = clf.score(X_test, y_test)

print("Accuracy for the following features combined",selected_cols.columns.tolist(),"is: ",accuracy)


Accuracy for the following features combined ['Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max', 'Fwd Pkt Len Min'] is:  0.9985468729087398


### Obtaining **`group`** accuracy with **`Feature names`** as Input

In [7]:
# Evaluate a set of features, chosen by name, together.
# NOTE(review): these names use the abbreviated style seen in some of the
# saved outputs ('Dst Port', 'Tot Fwd Pkts'), not the style of the feature
# listing (' Destination Port') — confirm they exist in the loaded CSV.
selected_cols = ['Fwd Pkts/s','Bwd Pkts/s']

# Resolve each requested name to a real column label. Labels in the CSV header
# may carry leading/trailing whitespace, so fall back to a whitespace-
# insensitive match when the exact name is not present.
labels_by_stripped_name = {str(label).strip(): label for label in df.columns}
resolved_cols = []
missing_cols = []
for name in selected_cols:
    if name in df.columns:
        resolved_cols.append(name)
    elif name.strip() in labels_by_stripped_name:
        resolved_cols.append(labels_by_stripped_name[name.strip()])
    else:
        missing_cols.append(name)

# Fail fast with an actionable message instead of an opaque lookup error.
if missing_cols:
    raise KeyError(
        f"Columns not found in the dataframe: {missing_cols}. "
        "Check the spelling against df.columns."
    )

# select columns by name using loc
X = df.loc[:, resolved_cols].values
# The label is the last column of the dataframe.
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the decision tree classifier; the seed keeps tie-breaking between
# equally good splits reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Test the classifier
accuracy = clf.score(X_test, y_test)


print("Accuracy for the following features combined",resolved_cols,"is: ",accuracy)


Accuracy for the following features combined ['Fwd Pkts/s', 'Bwd Pkts/s'] is:  0.994632033804325


### Obtaining **`individual`** accuracy with **`Feature names`** as Input

In [8]:
# Score each named feature on its own: one decision tree is fitted per column
# and its hold-out accuracy is printed.
# NOTE(review): these names use the abbreviated style seen in some of the
# saved outputs ('Dst Port', 'Tot Fwd Pkts'), not the style of the feature
# listing (' Destination Port') — confirm they exist in the loaded CSV.
col_names = ['Fwd Pkts/s','Bwd Pkts/s']

# The label is always the last column, so extract it once.
y = df.iloc[:, -1].values

# Labels in the CSV header may carry leading/trailing whitespace; this lookup
# lets a name match even when only the surrounding whitespace differs.
labels_by_stripped_name = {str(label).strip(): label for label in df.columns}

for col_name in col_names:
    if col_name in df.columns:
        col_label = col_name
    else:
        col_label = labels_by_stripped_name.get(col_name.strip())

    # Report unknown names explicitly rather than letting the lookup error be
    # mistaken for a data problem.
    if col_label is None:
        print(f"'{col_name}' isn't a feature column (no such column in the dataframe)")
        continue

    try:
        X = df[[col_label]].values

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Train the decision tree classifier; the seed keeps the fitted tree
        # reproducible between runs.
        clf = DecisionTreeClassifier(random_state=42)
        clf.fit(X_train, y_train)

        # Test the classifier
        accuracy = clf.score(X_test, y_test)
        print(f"Accuracy for column '{col_label}' : {accuracy}")
    except (ValueError, TypeError) as exc:
        # Only data problems (e.g. text, NaN or infinite values) are treated as
        # "not a feature"; anything else, including a keyboard interrupt,
        # propagates instead of being silently swallowed.
        print(f"'{col_label}' isn't a feature column ({exc})")


Accuracy for column 'Fwd Pkts/s' : 0.9521997667348616
Accuracy for column 'Bwd Pkts/s' : 0.848635781342613


### Obtaining **`individual`** accuracy with **`Feature numbers`** as Input

In [34]:
# Score each listed feature on its own: one decision tree is fitted per column
# and its hold-out accuracy is printed.
cols = [1,2,3,4,9,13,17,22,27,31,32,33,34,35,36,41,67,68,71,75] # 1-based feature numbers to test

# The label is always the last column, so extract it once.
y = df.iloc[:, -1].values
n_columns = df.shape[1]

for col_number in cols:
    # Convert the 1-based feature number into a 0-based, half-open slice.
    start_col = col_number - 1
    end_col = start_col + 1

    # Guard the index explicitly: an out-of-range slice would otherwise yield
    # an empty feature matrix, and looking up its name would raise as well.
    if not 0 <= start_col < n_columns:
        print(f"Feature number {col_number} is out of range (valid: 1-{n_columns})")
        continue

    # Read the label from the dataframe itself so it cannot drift out of sync
    # with a previously captured list of column names.
    col_label = df.columns[start_col]

    try:
        X = df.iloc[:, start_col:end_col].values

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42)

        # Train the decision tree classifier; the seed keeps the fitted tree
        # reproducible between runs.
        clf = DecisionTreeClassifier(random_state=42)
        clf.fit(X_train, y_train)

        # Test the classifier
        accuracy = clf.score(X_test, y_test)
        print(f"Accuracy for column '{col_label}' : {accuracy}")
    except (ValueError, TypeError) as exc:
        # Only data problems (e.g. text, NaN or infinite values) are treated as
        # "not a feature"; anything else, including a keyboard interrupt,
        # propagates instead of being silently swallowed.
        print(f"'{col_label}' isn't a feature column ({exc})")


Accuracy for column ' Destination Port' : 0.8998202698118246
Accuracy for column ' Flow Duration' : 0.8749972932200576
Accuracy for column ' Total Fwd Packets' : 0.7744133505604839
Accuracy for column ' Total Backward Packets' : 0.8173609256465595
Accuracy for column ' Fwd Packet Length Mean' : 0.8329447600349356
Accuracy for column ' Bwd Packet Length Mean' : 0.8739290174027905
Accuracy for column ' Flow IAT Mean' : 0.8701250893237381
Accuracy for column ' Fwd IAT Mean' : 0.8706520091525253
Accuracy for column ' Bwd IAT Mean' : 0.7988321146808526
Accuracy for column 'Fwd PSH Flags' : 0.6364181000570228
Accuracy for column ' Bwd PSH Flags' : 0.6364181000570228
Accuracy for column ' Fwd URG Flags' : 0.6364181000570228
Accuracy for column ' Bwd URG Flags' : 0.6364181000570228
Accuracy for column ' Fwd Header Length' : 0.8167618250193084
Accuracy for column ' Bwd Header Length' : 0.8509610873315481
Accuracy for column ' Packet Length Mean' : 0.9005926043553893
Accuracy for column 'Init_Wi

### Obtaining **`group`** accuracy with **`Feature numbers`** as Input

In [35]:
# Evaluate a set of features, chosen by 1-based feature number, together.
selected_feature_numbers = [1,2,3,4,9,13,17,22,27,31,32,33,34,35,36,41,67,68,71,75]

# Validate before shifting to 0-based positions: a number of 0 or below would
# otherwise wrap around and silently select columns counted from the end.
n_columns = df.shape[1]
invalid_numbers = [n for n in selected_feature_numbers if not 1 <= n <= n_columns]
if invalid_numbers:
    raise IndexError(
        f"Feature numbers out of range (valid: 1-{n_columns}): {invalid_numbers}"
    )

# 0-based positions for iloc
selected_cols_idx = [n - 1 for n in selected_feature_numbers]

# select columns by index using iloc
X = df.iloc[:, selected_cols_idx].values
# The label is the last column of the dataframe.
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the decision tree classifier. With several features, ties between
# equally good splits are broken randomly, so the seed is fixed to make the
# reported accuracy reproducible.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

# Test the classifier
accuracy = clf.score(X_test, y_test)

# Get the names of the selected columns
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)


Accuracy for the following features combined [' Destination Port', ' Flow Duration', ' Total Fwd Packets', ' Total Backward Packets', ' Fwd Packet Length Mean', ' Bwd Packet Length Mean', ' Flow IAT Mean', ' Fwd IAT Mean', ' Bwd IAT Mean', 'Fwd PSH Flags', ' Bwd PSH Flags', ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length', ' Bwd Header Length', ' Packet Length Mean', 'Init_Win_bytes_forward', ' Init_Win_bytes_backward', 'Active Mean', 'Idle Mean'] is: 0.9995091705704449
