In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
df = pd.read_csv("/content/drive/MyDrive/sjsu nsf reu/merged_files.csv")
df.shape

(19976700, 15)

# Feature Engineering

In [None]:
df = df.drop(columns=['Sport', 'Dir', 'Dport', 'sTos', 'dTos'])
df.shape

(19976700, 10)

In [None]:
#the rows without Proto as UDP, TCP, HTTP, or ICMP will be filtered out,
#and, of the remaining rows, only those with connection state CON, URP, or FSPA FPSA

df = df[(df.Proto == 'udp') | (df.Proto == 'tcp') | (df.Proto == 'http') | (df.Proto == 'icmp')]
df = df[(df.State == 'CON') | (df.State == 'URP') | (df.State == 'FSPA FPSA')]


In [None]:
df.shape

(14415958, 10)

In [None]:
df['Label'] = df['Label'].astype(str)
df = df[~df['Label'].str.contains('To')]

In [None]:
#Background
df.loc[df['Label'].str.contains("Background"), 'Label'] = "Background"

#Normal
df.loc[df['Label'].str.contains("Normal"), 'Label'] = "Normal"

#Botnet
df.loc[df['Label'].str.contains("Botnet"), 'Label'] = "Botnet"

In [None]:
df['Label'].value_counts()

Background    5923970
Normal         227716
Botnet         143960
Name: Label, dtype: int64

In [None]:
df.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,DstAddr,State,TotPkts,TotBytes,SrcBytes,Label
101,2011/08/10 09:46:53.047277,3550.182373,udp,212.50.71.179,147.32.84.229,CON,12,875,413,Background
102,2011/08/10 09:46:53.048843,0.000883,udp,84.13.246.132,147.32.84.229,CON,2,135,75,Background
105,2011/08/10 09:46:53.053937,3427.768066,udp,74.89.223.204,147.32.84.229,CON,42,2856,1596,Background
107,2011/08/10 09:46:53.058746,3589.631348,udp,182.239.167.121,147.32.84.229,CON,12,1494,1122,Background
109,2011/08/10 09:46:53.062095,3118.470947,udp,24.117.206.20,147.32.84.229,CON,13,4328,840,Background


In [None]:
df.shape

(6295646, 10)

In [None]:
selected_df = df

In [None]:
df.StartTime.tail(5)

19976671    2011/08/19 11:45:43.212731
19976674    2011/08/19 11:45:43.320646
19976675    2011/08/19 11:45:43.338609
19976676    2011/08/19 11:45:43.390920
19976680    2011/08/19 11:45:43.420099
Name: StartTime, dtype: object

In [None]:
df.StartTime.head(5)

101    2011/08/10 09:46:53.047277
102    2011/08/10 09:46:53.048843
105    2011/08/10 09:46:53.053937
107    2011/08/10 09:46:53.058746
109    2011/08/10 09:46:53.062095
Name: StartTime, dtype: object

In [None]:
df.State.value_counts()

CON    6173266
URP     122380
Name: State, dtype: int64

#Train test splitting

In [None]:
from sklearn.model_selection import train_test_split
import random

In [None]:
X = selected_df.loc[:, ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'DstAddr', 'State', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']]
y = selected_df.loc[:, ['Label']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 420)

In [None]:
#9
background_count = 0
normal_count = 0
botnet_count = 0

for x in X_train['Label']:
    if 'Background' in str(x): #Background
        background_count += 1
    if 'Normal' in str(x): #Normal
        normal_count += 1
    if 'Botnet' in str(x): #Botnet
        botnet_count += 1

print(str(float(background_count/11532766)))
print(str(float(normal_count/11532766)))
print(str(float(botnet_count/11532766)))

botnet_ratio_b = (float(botnet_count/background_count))
botnet_ratio_n = (float(botnet_count/normal_count))


0.4109347228583325
0.015800285898456622
0.009978612242717835


In [None]:
filtered_df_1 = X_train.copy()

# Create a boolean mask to identify rows with "background" in the 'Label' column
mask_b = filtered_df_1['Label'].astype(str).str.contains('Background')

# Get the indices of rows with "background"
indices_background = filtered_df_1.index[mask_b]

# Calculate the number of rows to drop (background_count - botnet_count)
num_rows_to_drop = background_count - botnet_count

# Randomly select 'num_rows_to_drop' indices from the background indices
indices_to_drop_b = random.sample(indices_background.tolist(), num_rows_to_drop)
filtered_y = y_train.copy()

# Drop the specified rows from the DataFrame
filtered_df_1 = filtered_df_1.drop(indices_to_drop_b)
filtered_y = filtered_y.drop(indices_to_drop_b)

# Reset the index of the filtered DataFrame
filtered_df_1 = filtered_df_1.reset_index(drop=True)
filtered_y = filtered_y.reset_index(drop=True)

In [None]:
#15
background_count = 0
normal_count = 0
botnet_count = 0

for x in filtered_df_1['Label']:
    if 'Background' in str(x): #Background
        background_count += 1
    if 'Normal' in str(x): #Normal
        normal_count += 1
    if 'Botnet' in str(x): #Botnet
        botnet_count += 1

print(str(float(background_count/413824)))
print(str(float(normal_count/413824)))
print(str(float(botnet_count/413824)))

botnet_ratio_b = (float(botnet_count/background_count))
botnet_ratio_n = (float(botnet_count/normal_count))

0.27809165248994744
0.4403345383544695
0.27809165248994744


In [None]:
filtered_df = filtered_df_1.copy()

# Create a boolean mask to identify rows with "normal" in the 'Label' column
mask_bot = filtered_df['Label'].astype(str).str.contains('Normal')

# Get the indices of rows with "background"
indices_background = filtered_df.index[mask_bot]

# Calculate the number of rows to drop (background_count - botnet_count)
num_rows_to_drop = normal_count - botnet_count

# Randomly select 'num_rows_to_drop' indices from the background indices
indices_to_drop_bot = random.sample(indices_background.tolist(), num_rows_to_drop)
filtered_y_2 = filtered_y.copy()

# Drop the specified rows from the DataFrame
filtered_df = filtered_df.drop(indices_to_drop_bot)
filtered_y_2 = filtered_y_2.drop(indices_to_drop_bot)

# Reset the index of the filtered DataFrame
filtered_df = filtered_df.reset_index(drop=True)
filtered_y_2 = filtered_y_2.reset_index(drop=True)

In [None]:
#19
background_count = 0
normal_count = 0
botnet_count = 0

for x in filtered_df['Label']:
    if 'Background' in str(x):
        background_count += 1
    if 'Normal' in str(x):
        normal_count += 1
    if 'Botnet' in str(x):
        botnet_count += 1

print(str(float(background_count/345804)))
print(str(float(normal_count/345804)))
print(str(float(botnet_count/345804)))

0.33279256457415185
0.33279256457415185
0.33279256457415185


In [None]:
X_train = filtered_df

In [None]:
y_train = filtered_y_2

In [None]:
X_train = X_train.drop(columns=['Label'])
X_test = X_test.drop(columns=['Label'])

#changing everything by a small percentage

In [None]:
import random

In [None]:
X_test.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,DstAddr,State,TotPkts,TotBytes,SrcBytes
11449165,2011/08/15 11:10:30.828135,0.000285,udp,109.68.234.4,147.32.85.84,CON,2,128,60
3037783,2011/08/11 10:08:53.811103,0.001018,udp,121.217.246.103,147.32.84.229,CON,2,138,78
5750436,2011/08/13 08:29:17.315981,0.459957,udp,109.242.98.209,147.32.85.56,CON,10,640,300
1305298,2011/08/10 12:34:46.612630,362.831207,udp,95.134.119.238,147.32.84.229,CON,14,931,511
12576162,2011/08/16 14:01:06.762460,0.000863,udp,46.33.250.185,147.32.84.229,CON,2,245,183


In [None]:
def modify_dur_by_random_percentage(row):
    # Generate a random percentage between -10% and +10%
    random_percentage = random.uniform(-0.5, 0.5)
    # Calculate the new 'dur' value after applying the random percentage
    new_dur = row['Dur'] * (1 + random_percentage)
    # Round the new_dur to an integer value (if needed)
    #new_dur = round(new_dur)
    return new_dur

# Apply the modification to each row of the DataFrame
X_test['Dur'] = X_test.apply(modify_dur_by_random_percentage, axis=1)

In [None]:

def modify_srcbytes_by_random_percentage(row):
    # Generate a random percentage between -10% and +10%
    random_percentage = random.uniform(-0.5, 0.5)
    # Calculate the new 'dur' value after applying the random percentage
    new_srcbytes = row['SrcBytes'] * (1 + random_percentage)
    # Round the new_dur to an integer value (if needed)
    new_srcbytes = round(new_srcbytes)
    return new_srcbytes

# Apply the modification to each row of the DataFrame
X_test['SrcBytes'] = X_test.apply(modify_srcbytes_by_random_percentage, axis=1)


In [None]:

def modify_totpkts_by_random_percentage(row):
    # Generate a random percentage between -10% and +10%
    random_percentage = random.uniform(-0.5, 0.5)
    # Calculate the new 'dur' value after applying the random percentage
    new_totpkts = row['TotPkts'] * (1 + random_percentage)
    # Round the new_dur to an integer value (if needed)
    new_totpkts = round(new_totpkts)
    return new_totpkts

# Apply the modification to each row of the DataFrame
X_test['TotPkts'] = X_test.apply(modify_totpkts_by_random_percentage, axis=1)


In [None]:

def modify_st_by_random_percentage(row):
    time = row['StartTime']
    random_num_1 = random.randint(-5, 5)
    day = int(time[8:10]) + random_num_1
    time = time[:8] + str(day).zfill(2) + time[10:]

    random_num = random.randint(-12, 12)
    hour = int(time[11:13]) + random_num
    time = time[:11] + str(hour).zfill(2) + time[13:]

    return time

# Apply the modification to each row of the DataFrame
X_test['StartTime'] = X_test.apply(modify_st_by_random_percentage, axis=1)


In [None]:

def modify_srcaddr_by_random_percentage(row):
    separate_numbers = row['SrcAddr'].split(".")
    random_num = random.uniform(-0.5, 0.5)
    for i in range(len(separate_numbers)):
        separate_numbers[i] = str(round(float(separate_numbers[i]) * (1 + random_num)))
    combined = ".".join(separate_numbers)
    return combined

# Apply the modification to each row of the DataFrame
X_test['SrcAddr'] = X_test.apply(modify_srcaddr_by_random_percentage, axis=1)


In [None]:

def modify_state_by_random_percentage(row):
    possible_states = ['CON', 'URP']
    state = row['State']
    random_num = random.randint(0, 1)
    if random_num == 0:
        state = 'CON'
    else:
        state = 'URP'
    return state

# Apply the modification to each row of the DataFrame
X_test['State'] = X_test.apply(modify_state_by_random_percentage, axis=1)


In [None]:
X_test.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,DstAddr,State,TotPkts,TotBytes,SrcBytes
11449165,2011/08/12 07:10:30.828135,0.000331,udp,140.87.300.5,147.32.85.84,URP,1,128,56
3037783,2011/08/13 11:08:53.811103,0.001241,udp,144.259.294.123,147.32.84.229,URP,2,138,71
5750436,2011/08/09 09:29:17.315981,0.250663,udp,84.185.75.160,147.32.85.56,CON,10,640,196
1305298,2011/08/11 21:34:46.612630,394.96376,udp,79.112.99.199,147.32.84.229,URP,13,931,268
12576162,2011/08/16 08:01:06.762460,0.001182,udp,60.43.324.239,147.32.84.229,CON,3,245,132


In [None]:
X_test.SrcAddr.head(5)

11449165       140.87.300.5
3037783     144.259.294.123
5750436       84.185.75.160
1305298       79.112.99.199
12576162      60.43.324.239
Name: SrcAddr, dtype: object

#Pre-processing all four dataframes

In [None]:
#X_train
change_items = ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'DstAddr', 'State', 'TotPkts', 'TotBytes', 'SrcBytes']

for item in change_items:
    value_counts = X_train[item].value_counts()
    rank_mapping = {value: rank for rank, value in enumerate(value_counts.index, 1)}
    X_train[item] = X_train[item].map(rank_mapping)

In [None]:
#X_test
change_items = ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'DstAddr', 'State', 'TotPkts', 'TotBytes', 'SrcBytes']

for item in change_items:
    value_counts = X_test[item].value_counts()
    rank_mapping = {value: rank for rank, value in enumerate(value_counts.index, 1)}
    X_test[item] = X_test[item].map(rank_mapping)

In [None]:
#y_train
change_items = ['Label']

for item in change_items:
    value_counts = y_train[item].value_counts()
    rank_mapping = {value: rank for rank, value in enumerate(value_counts.index, 1)}
    y_train[item] = y_train[item].map(rank_mapping)

In [None]:
#y_test
change_items = ['Label']

for item in change_items:
    value_counts = y_test[item].value_counts()
    rank_mapping = {value: rank for rank, value in enumerate(value_counts.index, 1)}
    y_test[item] = y_test[item].map(rank_mapping)

# Random Forest


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [None]:
clf = RandomForestClassifier(max_depth = 30, min_samples_leaf = 1, n_estimators =200)

In [None]:
train_score = clf.fit(X_train, y_train).score(X_train, y_train)
train_score

1.0

In [None]:
pred_test_y = clf.predict(X_test)
pred_train_y = clf.predict(X_train)

In [None]:
confusion_matrix(y_test, pred_test_y)

array([[239674,   6988, 938094],
       [     0,      0,  45495],
       [     0,      1,  28878]])

In [None]:
print("Random Forest Classifier")

print(f"Test Accuracy is {accuracy_score(y_test, pred_test_y)}")
#print(f"precision for botnet is {oo / (oa + oo + on)}")
#print(f"recall for botnet is {oo / (ao + oo + no)}")
print(f"Test Precision is {precision_score(y_test, pred_test_y, pos_label='positive', average='micro')}")
print(f"Test Recall is {recall_score(y_test, pred_test_y, pos_label='positive', average='micro')}")
print()
print(f"Train Accuracy is {train_score}")
print(f"F1 score is {f1_score(y_test, pred_test_y, average='macro')}")

Random Forest Classifier
Test Accuracy is 0.21328377530517104
Test Precision is 0.21328377530517104
Test Recall is 0.21328377530517104

Train Accuracy is 1.0
F1 score is 0.13066066758328562


#Naive Bayes

In [None]:
from sklearn.datasets import load_iris
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [None]:
gnb = GaussianNB()

In [None]:
y_pred = gnb.fit(X_train, y_train).predict(X_test)

In [None]:
confusion_matrix(y_test, y_pred)

array([[      8,    2571, 1182177],
       [      0,       0,   45495],
       [      0,       0,   28879]])

In [None]:
y_train_pred = gnb.predict(X_train)
#train_score = accuracy_score(y_train, y_pred)

In [None]:
aa, ao, an, oa, oo, on, na, no, nn = confusion_matrix(y_test, y_pred).ravel()
print(f"Test Accuracy is {accuracy_score(y_test, y_pred)}")
#print(f"precision for botnet is {oo / (oa + oo + on)}")
#print(f"recall for botnet is {oo / (ao + oo + no)}")
print(f"Test Precision is {precision_score(y_test, y_pred, pos_label='positive', average='micro')}")
print(f"Test Recall is {recall_score(y_test, y_pred, pos_label='positive', average='micro')}")
print()
#print(f"Train Accuracy is {train_score}")
print(f"F1 score is {f1_score(y_test, y_pred, average='macro')}")


Test Accuracy is 0.022942031402635153
Test Precision is 0.022942031402635153
Test Recall is 0.022942031402635153

F1 score is 0.014982109611601596


# KNN
n_neighbors, weights, algorithm, leaf_size, p

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [None]:
neigh = KNeighborsClassifier(leaf_size = 30, n_neighbors = 20, weights = 'distance')#ADD IN THE PARAMETERS

In [None]:
train_score = neigh.fit(X_train, y_train).score(X_train, y_train)

In [None]:
pred_test_y = neigh.predict(X_test)
pred_train_y = neigh.predict(X_train)
print(pred_test_y)

[3 3 3 ... 2 3 3]


In [None]:
confusion_matrix(y_test, pred_test_y)

array([[   2542,  104488, 1077726],
       [     77,   44158,    1260],
       [      7,   26809,    2063]])

In [None]:
print("K Neighbors Classifier")
print(f"Test Accuracy is {accuracy_score(y_test, pred_test_y)}")
#print(f"precision for botnet is {oo / (oa + oo + on)}")
#print(f"recall for botnet is {oo / (ao + oo + no)}")
print(f"Test Precision is {precision_score(y_test, pred_test_y, pos_label='positive', average='micro')}")
print(f"Test Recall is {recall_score(y_test, pred_test_y, pos_label='positive', average='micro')}")
print()
print(f"Train Accuracy is {train_score}")
print(f"F1 score is {f1_score(y_test, pred_test_y, average='macro')}")

K Neighbors Classifier
Test Accuracy is 0.03872753409099934
Test Precision is 0.03872753409099934
Test Recall is 0.03872753409099934

Train Accuracy is 1.0
F1 score is 0.13590312952395273
