## Importing All The Necessary Stuff

In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

## Loading The Data to prepare a Model

In [2]:
# Load the data from CSV file
# df = pd.read_csv(r"D:\Stuff\CyberSec\archive\03-02-2018.csv")
df = pd.read_csv(r"..\Datasets\IDS2018\02-14-2018.csv")

# Remove any rows with missing values
# df = df.dropna()

# Drop columns where all values are 0
# df = df.loc[:, (df != 0).any(axis=0)]

columns = list(df.columns)

# to select first n rows only
# df = df.iloc[:500000,:]

### Display the Imported Model

In [3]:
# display the dataframe
df.head()
# df

Unnamed: 0,Dst Port,Protocol,Timestamp,Flow Duration,Tot Fwd Pkts,Tot Bwd Pkts,TotLen Fwd Pkts,TotLen Bwd Pkts,Fwd Pkt Len Max,Fwd Pkt Len Min,...,Fwd Seg Size Min,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,0,0,14/02/2018 08:31:01,112641719,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320859.5,139.300036,56320958,56320761,Benign
1,0,0,14/02/2018 08:33:50,112641466,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56320733.0,114.551299,56320814,56320652,Benign
2,0,0,14/02/2018 08:36:39,112638623,3,0,0,0,0,0,...,0,0.0,0.0,0,0,56319311.5,301.934596,56319525,56319098,Benign
3,22,6,14/02/2018 08:40:13,6453966,15,10,1239,2273,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign
4,22,6,14/02/2018 08:40:23,8804066,14,11,1143,2209,744,0,...,32,0.0,0.0,0,0,0.0,0.0,0,0,Benign


### Printing all the Features available in our Model

In [4]:
# Print column names with their respective column numbers
for i, col_name in enumerate(df.columns):
    print(f"Feature {i+1}:\t\"{col_name}\"")

Feature 1:	"Dst Port"
Feature 2:	"Protocol"
Feature 3:	"Timestamp"
Feature 4:	"Flow Duration"
Feature 5:	"Tot Fwd Pkts"
Feature 6:	"Tot Bwd Pkts"
Feature 7:	"TotLen Fwd Pkts"
Feature 8:	"TotLen Bwd Pkts"
Feature 9:	"Fwd Pkt Len Max"
Feature 10:	"Fwd Pkt Len Min"
Feature 11:	"Fwd Pkt Len Mean"
Feature 12:	"Fwd Pkt Len Std"
Feature 13:	"Bwd Pkt Len Max"
Feature 14:	"Bwd Pkt Len Min"
Feature 15:	"Bwd Pkt Len Mean"
Feature 16:	"Bwd Pkt Len Std"
Feature 17:	"Flow Byts/s"
Feature 18:	"Flow Pkts/s"
Feature 19:	"Flow IAT Mean"
Feature 20:	"Flow IAT Std"
Feature 21:	"Flow IAT Max"
Feature 22:	"Flow IAT Min"
Feature 23:	"Fwd IAT Tot"
Feature 24:	"Fwd IAT Mean"
Feature 25:	"Fwd IAT Std"
Feature 26:	"Fwd IAT Max"
Feature 27:	"Fwd IAT Min"
Feature 28:	"Bwd IAT Tot"
Feature 29:	"Bwd IAT Mean"
Feature 30:	"Bwd IAT Std"
Feature 31:	"Bwd IAT Max"
Feature 32:	"Bwd IAT Min"
Feature 33:	"Fwd PSH Flags"
Feature 34:	"Bwd PSH Flags"
Feature 35:	"Fwd URG Flags"
Feature 36:	"Bwd URG Flags"
Feature 37:	"Fwd Hea

## Decision Tree

In [5]:
# specify column indexes to select
selected_cols_idx = [1,2,4,5,6,11,15,19,29,33,34,35,40,46,48,58,59,62,66]

selected_cols_idx = [x - 1 for x in selected_cols_idx]

# select columns by index using iloc
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the decision tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Test the classifier
accuracy = clf.score(X_test, y_test)

# Get the names of the selected columns
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)


Accuracy for the following features combined ['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'Flow IAT Mean', 'Bwd IAT Mean', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd Pkts/s', 'FIN Flag Cnt', 'RST Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Bwd Pkts/b Avg', 'Subflow Bwd Pkts'] is: 0.9999666213670935


## KNN

In [9]:
# specify column indexes to select
selected_cols_idx = [1,2,4,5,6,11,15,19,29,33,34,35,40,46,48,58,59,62,66]
selected_cols_idx = [x - 1 for x in selected_cols_idx]

# select columns by index using iloc
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the KNN classifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Test the classifier
accuracy = knn.score(X_test, y_test)

# Get the names of the selected columns
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)


Accuracy for the following features combined ['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'Flow IAT Mean', 'Bwd IAT Mean', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd Pkts/s', 'FIN Flag Cnt', 'RST Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Bwd Pkts/b Avg', 'Subflow Bwd Pkts'] is: 0.9997091290561


## Weighted KNN 


In [10]:
# specify column indexes to select
selected_cols_idx = [1,2,4,5,6,11,15,19,29,33,34,35,40,46,48,58,59,62,66]
selected_cols_idx = [x - 1 for x in selected_cols_idx]

# select columns by index using iloc
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the weighted KNN classifier
knn = KNeighborsClassifier(weights='distance')
knn.fit(X_train, y_train)

# Test the classifier
accuracy = knn.score(X_test, y_test)

# Get the names of the selected columns
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)


Accuracy for the following features combined ['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'Flow IAT Mean', 'Bwd IAT Mean', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd Pkts/s', 'FIN Flag Cnt', 'RST Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Bwd Pkts/b Avg', 'Subflow Bwd Pkts'] is: 0.9998235700832082


## Random Forest

In [6]:
# specify column indexes to select
selected_cols_idx = [1,2,4,5,6,11,15,19,29,33,34,35,40,46,48,58,59,62,66]
selected_cols_idx = [x - 1 for x in selected_cols_idx]

# select columns by index using iloc
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the Random Forest classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Test the classifier
accuracy = rf.score(X_test, y_test)

# Get the names of the selected columns
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)


Accuracy for the following features combined ['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'Flow IAT Mean', 'Bwd IAT Mean', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd Pkts/s', 'FIN Flag Cnt', 'RST Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Bwd Pkts/b Avg', 'Subflow Bwd Pkts'] is: 0.999971389743223


## Gaussian Naive Bayes

In [7]:
# Specify column indexes to select
selected_cols_idx = [1,2,4,5,6,11,15,19,29,33,34,35,40,46,48,58,59,62,66]
selected_cols_idx = [x - 1 for x in selected_cols_idx]

# Select columns by index using iloc
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# Train the Gaussian Naive Bayes classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# Test the classifier
accuracy = gnb.score(X_test, y_test)

# Get the names of the selected columns
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)


Accuracy for the following features combined ['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'Flow IAT Mean', 'Bwd IAT Mean', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd Pkts/s', 'FIN Flag Cnt', 'RST Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Bwd Pkts/b Avg', 'Subflow Bwd Pkts'] is: 0.4937796533390554


## MLP 


In [8]:
# specify column indexes to select
selected_cols_idx = [1, 2, 4, 5, 6, 11, 15, 19, 29, 33, 34, 35, 40, 46, 48, 58, 59, 62, 66]
selected_cols_idx = [x - 1 for x in selected_cols_idx]

# select columns by index using iloc
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# instantiate the MLP classifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=300, alpha=0.0001,
                    solver='adam', random_state=42, tol=0.0001)

# train the MLP classifier
mlp.fit(X_train, y_train)

# test the MLP classifier
accuracy = mlp.score(X_test, y_test)

# get the names of the selected columns
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)


Accuracy for the following features combined ['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'Flow IAT Mean', 'Bwd IAT Mean', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd Pkts/s', 'FIN Flag Cnt', 'RST Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Bwd Pkts/b Avg', 'Subflow Bwd Pkts'] is: 0.76881958848914


## QDA 

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# specify column indexes to select
selected_cols_idx = [1, 2, 4, 5, 6, 11, 15, 19, 29, 33, 34, 35, 40, 46, 48, 58, 59, 62, 66]
selected_cols_idx = [x - 1 for x in selected_cols_idx]

# select columns by index using iloc
X = df.iloc[:, selected_cols_idx].values
y = df.iloc[:, -1].values

# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# scale the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# instantiate the QDA classifier
qda = QuadraticDiscriminantAnalysis(reg_param=0.1)

# train the QDA classifier
qda.fit(X_train, y_train)

# test the QDA classifier
accuracy = qda.score(X_test, y_test)

# get the names of the selected columns
selected_cols = list(df.columns[selected_cols_idx])

print("Accuracy for the following features combined", selected_cols, "is:", accuracy)


Accuracy for the following features combined ['Dst Port', 'Protocol', 'Flow Duration', 'Tot Fwd Pkts', 'Tot Bwd Pkts', 'Fwd Pkt Len Mean', 'Bwd Pkt Len Mean', 'Flow IAT Mean', 'Bwd IAT Mean', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd Pkts/s', 'FIN Flag Cnt', 'RST Flag Cnt', 'Fwd Byts/b Avg', 'Fwd Pkts/b Avg', 'Bwd Pkts/b Avg', 'Subflow Bwd Pkts'] is: 0.7561166344801278


## GAN MODEL
(HAVE TO IMPORT DATASET OF KAGGLE MICROSOFT MALWARE CLASSIFICATION CHALLENGE)

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Reshape, Flatten, Dropout, LeakyReLU
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

# Load data
data = pd.read_csv('malware_data.csv')

# Remove missing values
data = data.dropna()

# Convert categorical columns to numeric
for col in data.select_dtypes('object').columns:
    data[col] = pd.Categorical(data[col]).codes

# Split data into X and y
X = data.drop('HasDetections', axis=1).values.astype(np.float32)
y = data['HasDetections'].values.astype(np.float32)

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the generator network
def make_generator():
    model = Sequential([
        Dense(256, input_shape=(100,), activation='relu'),
        Dense(512, activation='relu'),
        Dense(X.shape[1], activation='tanh')
    ])
    return model

# Define the discriminator network
def make_discriminator():
    model = Sequential([
        Dense(512, input_shape=(X.shape[1],), activation='relu'),
        Dropout(0.3),
        Dense(256, activation='relu'),
        Dropout(0.3),
        Dense(1, activation='sigmoid')
    ])
    return model

# Compile the discriminator
discriminator = make_discriminator()
discriminator.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0002, beta_1=0.5))

# Define the GAN
generator = make_generator()
discriminator.trainable = False
gan_input = Input(shape=(100,))
fake_data = generator(gan_input)
gan_output = discriminator(fake_data)
gan = Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0002, beta_1=0.5))

# Train the GAN
epochs = 10000
batch_size = 32
for epoch in range(epochs):
    # Generate noise
    noise = np.random.normal(0, 1, size=[batch_size, 100])
    
    # Generate fake data
    fake_data = generator.predict(noise)
    
    # Combine real and fake data
    X_combined = np.concatenate([X_train, fake_data])
    y_combined = np.concatenate([y_train, np.zeros(batch_size)])
    
    # Train discriminator
    discriminator_loss = discriminator.train_on_batch(X_combined, y_combined)
    
    # Generate new noise
    noise = np.random.normal(0, 1, size=[batch_size, 100])
    
    # Set labels to 'real' (i.e. 1)
    y_mislabeled = np.ones(batch_size)
    
    # Train generator
    generator_loss = gan.train_on_batch(noise, y_mislabeled)
    
    # Print loss every 100 epochs
    if epoch % 100 == 0:
        print(f"Epoch: {epoch}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}")
    
# Generate fake data and test the classifier
noise = np.random.normal(0, 1, size=[X_test.shape[0], 100])
fake_data = generator.predict(noise)
accuracy = discriminator.evaluate(X_test,
