In [None]:
import pandas as pd
import numpy as np
import math

In [None]:
import sklearn.preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn import svm
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE

In [None]:
# Load the merged flow records from the mounted Google Drive and show (rows, columns).
raw_csv_path = '/content/drive/MyDrive/sjsu nsf reu/merged_files.csv'
df = pd.read_csv(raw_csv_path)
df.shape

(19976700, 15)

## Feature Selection

In [None]:
# Feature selection (following Fabio's paper): drop the columns that are not used as
# model inputs and keep the result in `selected_df`. Make every later change to the
# dataset on `selected_df`, not on `df`.
unused_columns = ['Sport', 'Dir', 'Dport', 'sTos', 'dTos']
selected_df = df.drop(columns=unused_columns)
selected_df.shape

(19976700, 10)

In [None]:
# Keep only flows whose protocol is udp, tcp, http or icmp and, of those, only the
# flows whose connection state is CON, URP or 'FSPA FPSA'.
# NOTE(review): 'FSPA FPSA' is matched as a single literal string (with a space) —
# confirm that this exact spelling occurs in the State column.
kept_protocols = ['udp', 'tcp', 'http', 'icmp']
kept_states = ['CON', 'URP', 'FSPA FPSA']

selected_df = selected_df[selected_df['Proto'].isin(kept_protocols)]
selected_df = selected_df[selected_df['State'].isin(kept_states)]


In [None]:
selected_df.shape

(14415958, 10)

In [None]:
# Collapse the ~1400 raw labels into three groups: "Background", "Normal" or "Botnet".
#
# The masks are built from the labels of `selected_df` itself (not from the unfiltered
# `df`), so they always line up with the rows being updated and no longer depend on
# index alignment with a much larger frame. The raw labels are snapshotted first so
# that every group is matched against the original text; as before, a label matching
# more than one group ends up with the last matching group.
raw_labels = selected_df['Label'].copy()

for group_name in ("Background", "Normal", "Botnet"):
    # na=False: a missing label matches no group instead of producing a NaN mask,
    # which .loc would reject. regex=False: plain substring search.
    in_group = raw_labels.str.contains(group_name, regex=False, na=False)
    selected_df.loc[in_group, 'Label'] = group_name

# Preprocessing
### Changing the values to numerical values based on their frequency. The more frequent a value, the lower its number (the most frequent value becomes 1)

In [None]:
# NOTE(review): this rebinds `selected_df` to the original, unfiltered `df`, so the column
# drops, row filters and label grouping done above are discarded, and the encoding loop
# below then overwrites columns of `df` itself (same object, not a copy).
# The name of the CSV in the save cell ("..._all15") suggests this was a deliberate
# all-columns experiment — confirm, and skip this cell when the filtered frame is wanted.
selected_df = df

In [None]:
# Frequency-rank encoding: within each listed column the most frequent value becomes 1,
# the second most frequent 2, and so on.
change_items = ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'DstAddr', 'State', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']

for column_name in change_items:
    frequencies = selected_df[column_name].value_counts()
    # value_counts() lists values from most to least frequent, so position == rank.
    codes = dict(zip(frequencies.index, range(1, len(frequencies) + 1)))
    selected_df[column_name] = selected_df[column_name].map(codes)

In [None]:
selected_df.head(5)

Unnamed: 0,StartTime,Dur,Proto,SrcAddr,DstAddr,State,TotPkts,TotBytes,SrcBytes,Label
101,1,599848,1,172772,2,1,6,1339,996,1
102,3603991,692,1,83179,2,1,1,4,5,1
105,9610636,1849061,1,134000,2,1,30,2899,1434,1
107,9610637,1849060,1,274531,2,1,6,1056,636,1
109,9610638,1849059,1,1008149,2,1,22,4821,521,1


In [None]:
selected_df.shape

(19976700, 15)

In [None]:
# Use this cell to save the dataset produced by feature selection and pre-processing to a
# new CSV file, so later runs can load it instead of recomputing it.
# The to_csv call is currently commented out, so the copy below only works if
# final_merged_files_all15.csv already exists in the working directory.
#selected_df.to_csv('final_merged_files_all15.csv', index=False)
!cp final_merged_files_all15.csv "drive/My Drive/sjsu nsf reu"

# Train test Splitting

In [None]:
# Load the dataset that the train/test split and the model are run on.
# NOTE(review): this reads edited_merged_files.csv, whereas the save cell above copies
# final_merged_files_all15.csv — confirm which file is the intended model input.
import pandas as pd
selected_df = pd.read_csv('/content/drive/MyDrive/sjsu nsf reu/edited_merged_files.csv')
selected_df.shape

(14415958, 10)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 'Label' is deliberately kept inside X: the class-balancing cells read X_train['Label'],
# and the column is dropped from X_train / X_test before the model is fitted.
feature_columns = ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'DstAddr', 'State', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']
X = selected_df.loc[:, feature_columns]
y = selected_df['Label']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=420)

In [None]:
X_train.shape

(11532766, 10)

In [None]:
y_train.shape

(11532766,)

In [None]:
#9
# Class distribution of the training split.
# Label codes: 1 = Background, 2 = Normal, 3 = Botnet.
#
# Counts are taken by exact label value. The previous per-row test ('1' in str(x)) was a
# substring match, so it would also have counted any other code containing that digit
# (e.g. 10 or 13). The total comes from the frame itself instead of a hard-coded number.
label_counts = X_train['Label'].value_counts()
total_rows = len(X_train)

background_count = int(label_counts.get(1, 0))
normal_count = int(label_counts.get(2, 0))
botnet_count = int(label_counts.get(3, 0))

# Fraction of the training rows in each class.
print(str(float(background_count/total_rows)))
print(str(float(normal_count/total_rows)))
print(str(float(botnet_count/total_rows)))

# Botnet rows per Background row / per Normal row.
botnet_ratio_b = (float(botnet_count/background_count))
botnet_ratio_n = (float(botnet_count/normal_count))


0.9741123681864351
0.015892804900402905
0.009994826913162029


In [None]:
# Absolute class counts, one per line: Background, Normal, Botnet.
print(background_count, normal_count, botnet_count, sep='\n')

11234210
183288
115268


In [None]:
#14
# Undersample Background (label code 1) in the training split down to the Botnet count.
# Relies on background_count / botnet_count from the counting cell above.
filtered_df_1 = X_train.copy()
filtered_y = y_train.copy()

# Exact match on the label code; a substring test on the stringified code would also
# select any other code that merely contains the digit 1.
mask_b = filtered_df_1['Label'] == 1

# Number of surplus Background rows. Clamped at 0 because a negative slice bound would
# select almost every Background row for removal instead of none.
n_drop_b = max(background_count - botnet_count, 0)

# The first n_drop_b Background rows (in the current row order) are removed.
indices_to_drop_b = filtered_df_1.index[mask_b].tolist()[:n_drop_b]

# X_train and y_train come from the same split and share index labels, so the same
# labels are dropped from both to keep features and targets aligned.
filtered_df_1 = filtered_df_1.drop(indices_to_drop_b)
filtered_y = filtered_y.drop(indices_to_drop_b)

# Reset both indexes so they stay aligned and contiguous.
filtered_df_1 = filtered_df_1.reset_index(drop=True)
filtered_y = filtered_y.reset_index(drop=True)

In [None]:
filtered_y.shape

(413824,)

In [None]:
filtered_df_1.shape

(413824, 10)

In [None]:
#filtered_y.shape

In [None]:
#15
# Class distribution after undersampling Background.
# Label codes: 1 = Background, 2 = Normal, 3 = Botnet.
#
# Counts are taken by exact label value (the previous substring test on str(x) would
# also match other codes containing the same digit), and the total is the actual row
# count of filtered_df_1 rather than a hard-coded number.
label_counts = filtered_df_1['Label'].value_counts()
total_rows = len(filtered_df_1)

background_count = int(label_counts.get(1, 0))
normal_count = int(label_counts.get(2, 0))
botnet_count = int(label_counts.get(3, 0))

# Fraction of the remaining rows in each class.
print(str(float(background_count/total_rows)))
print(str(float(normal_count/total_rows)))
print(str(float(botnet_count/total_rows)))

# Botnet rows per Background row / per Normal row.
botnet_ratio_b = (float(botnet_count/background_count))
botnet_ratio_n = (float(botnet_count/normal_count))

0.2785435354160223
0.44291292916795544
0.2785435354160223


In [None]:
#16
background_count

115268

In [None]:
#17
# Undersample Normal (label code 2) down to the Botnet count, starting from the frame
# that already had Background undersampled.
# Relies on normal_count / botnet_count from the counting cell above.
filtered_df = filtered_df_1.copy()
filtered_y_2 = filtered_y.copy()

# Exact match on the label code; a substring test on the stringified code would also
# select any other code that merely contains the digit 2.
mask_n = filtered_df['Label'] == 2

# Number of surplus Normal rows. Clamped at 0 because a negative slice bound would
# select almost every Normal row for removal instead of none.
n_drop_n = max(normal_count - botnet_count, 0)

# The first n_drop_n Normal rows (in the current row order) are removed.
indices_to_drop_n = filtered_df.index[mask_n].tolist()[:n_drop_n]

# filtered_df_1 and filtered_y were reset to the same 0..n-1 index, so the same labels
# are dropped from both to keep features and targets aligned.
filtered_df = filtered_df.drop(indices_to_drop_n)
filtered_y_2 = filtered_y_2.drop(indices_to_drop_n)

# Reset both indexes so they stay aligned and contiguous.
filtered_df = filtered_df.reset_index(drop=True)
filtered_y_2 = filtered_y_2.reset_index(drop=True)

In [None]:
#18
filtered_df.shape

(345804, 10)

In [None]:
filtered_y_2.shape

(345804,)

In [None]:
#19
# Class distribution after undersampling both Background and Normal; each class is
# expected to make up one third of the rows.
# Label codes: 1 = Background, 2 = Normal, 3 = Botnet.
#
# Counts are taken by exact label value (the previous substring test on str(x) would
# also match other codes containing the same digit), and the total is the actual row
# count of filtered_df rather than a hard-coded number.
label_counts = filtered_df['Label'].value_counts()
total_rows = len(filtered_df)

background_count = int(label_counts.get(1, 0))
normal_count = int(label_counts.get(2, 0))
botnet_count = int(label_counts.get(3, 0))

print(str(float(background_count/total_rows)))
print(str(float(normal_count/total_rows)))
print(str(float(botnet_count/total_rows)))

0.3333333333333333
0.3333333333333333
0.3333333333333333


In [None]:
# From here on X_train is the class-balanced frame (same object as filtered_df, not a copy).
# The original, unbalanced training split is no longer reachable under this name, so
# re-running the balancing cells above requires re-running the split first.
X_train = filtered_df

In [None]:
#rows_to_drop = y_train.sample(n=11186962).index
#y_train = y_train.drop(rows_to_drop)

In [None]:
# Targets matching the balanced X_train: filtered_y_2 had the same rows dropped and the
# same index reset as filtered_df.
y_train = filtered_y_2

In [None]:
X_train.shape

(345804, 10)

In [None]:
y_train.shape

(345804,)

In [None]:
# Remove the target column from the feature frames before fitting.
# errors='ignore' makes the cell safe to re-run: without it a second execution raises
# KeyError because 'Label' has already been dropped.
X_train = X_train.drop(columns=['Label'], errors='ignore')
X_test = X_test.drop(columns=['Label'], errors='ignore')

In [None]:
X_train.shape

(345804, 9)

In [None]:
X_test.shape

(2883192, 9)

In [None]:
y_train.value_counts()

3    115268
2    115268
1    115268
Name: Label, dtype: int64

In [None]:
y_test.value_counts()

1    2808812
2      45688
3      28692
Name: Label, dtype: int64

# Filtering

In [None]:
#9
# NOTE(review): the cells of this "Filtering" section look like an earlier version of the
# balancing steps in the "Train test Splitting" section. When the notebook is run top
# to bottom, 'Label' has already been dropped from X_train at this point, so
# X_train['Label'] raises KeyError. The hard-coded total (14415958) is the row count of
# selected_df, not of X_train — confirm which frame this cell is meant to count over.
background_count = 0
normal_count = 0
botnet_count = 0

# Substring test on the stringified label; comments give the class for each code.
for x in X_train['Label']:
    if '1' in str(x): #Background
        background_count += 1
    if '2' in str(x): #Normal
        normal_count += 1
    if '3' in str(x): #Botnet
        botnet_count += 1

# Class fractions relative to the hard-coded total.
print(str(float(background_count/14415958)))
print(str(float(normal_count/14415958)))
print(str(float(botnet_count/14415958)))

# Botnet rows per Background row / per Normal row.
botnet_ratio_b = (float(botnet_count/background_count))
botnet_ratio_n = (float(botnet_count/normal_count))


In [None]:
#10
background_count

In [None]:
#11
normal_count

In [None]:
#12
botnet_count

Goal: undersample the Background and Normal classes so that every class count equals the Botnet count.

In [None]:
#14
# Undersample Background in selected_df down to the Botnet count, matching on the label text.
# NOTE(review): this uses the .str accessor, which needs text labels. The Label column of
# the selected_df loaded for the split is int64 (see the y_train.value_counts() output),
# so this cell presumably expects the raw, un-encoded frame — confirm before running.
filtered_df_1 = selected_df.copy()

# Create a boolean mask to identify rows with "background" in the 'Label' column
mask_b = filtered_df_1['Label'].str.contains('Background')

# Identify the indices of the rows to drop
# (the first background_count - botnet_count Background rows, in current row order)
indices_to_drop_b = filtered_df_1.index[mask_b].tolist()[:(background_count-botnet_count)]

# Drop the specified rows from the DataFrame
filtered_df_1 = filtered_df_1.drop(indices_to_drop_b)

# Reset the index of the filtered DataFrame
filtered_df_1 = filtered_df_1.reset_index(drop=True)

In [None]:
#15
# Class distribution of filtered_df_1 (text labels), after Background was undersampled.
# A label belongs to a class when its text contains the class name.
#
# The fractions are now relative to the actual row count of filtered_df_1; the previous
# hard-coded 14415958 was the size of the frame before rows were dropped. The per-row
# Python loop is replaced by an equivalent vectorised substring count.
label_text = filtered_df_1['Label'].astype(str)
total_rows = len(filtered_df_1)

background_count = int(label_text.str.contains('Background', regex=False).sum())
normal_count = int(label_text.str.contains('Normal', regex=False).sum())
botnet_count = int(label_text.str.contains('Botnet', regex=False).sum())

print(str(float(background_count/total_rows)))
print(str(float(normal_count/total_rows)))
print(str(float(botnet_count/total_rows)))

# Botnet rows per Background row / per Normal row.
botnet_ratio_b = (float(botnet_count/background_count))
botnet_ratio_n = (float(botnet_count/normal_count))


In [None]:
#16
background_count

In [None]:
#17
# Undersample Normal down to the Botnet count, starting from the frame that already had
# Background undersampled. Uses normal_count / botnet_count from the counting cell above.
# NOTE(review): like the Background step, this needs text labels for the .str accessor.
filtered_df = filtered_df_1.copy()

# Create a boolean mask to identify rows with "Normal" in the 'Label' column
mask_n = filtered_df['Label'].str.contains('Normal')

# Identify the indices of the rows to drop
# (the first normal_count - botnet_count Normal rows, in current row order)
indices_to_drop_n = filtered_df.index[mask_n].tolist()[:(normal_count-botnet_count)]

# Drop the specified rows from the DataFrame
filtered_df = filtered_df.drop(indices_to_drop_n)

# Reset the index of the filtered DataFrame
filtered_df = filtered_df.reset_index(drop=True)

In [None]:
#18
filtered_df.shape

In [None]:
#19
# Class distribution of filtered_df (text labels) after both undersampling steps.
# A label belongs to a class when its text contains the class name.
#
# The fractions are now relative to the actual row count of filtered_df instead of the
# hard-coded 10412239, which goes stale whenever the number of dropped rows changes.
# The per-row Python loop is replaced by an equivalent vectorised substring count.
label_text = filtered_df['Label'].astype(str)
total_rows = len(filtered_df)

background_count = int(label_text.str.contains('Background', regex=False).sum())
normal_count = int(label_text.str.contains('Normal', regex=False).sum())
botnet_count = int(label_text.str.contains('Botnet', regex=False).sum())

print(str(float(background_count/total_rows)))
print(str(float(normal_count/total_rows)))
print(str(float(botnet_count/total_rows)))

In [None]:
#20
background_count

In [None]:
#21
botnet_count

In [None]:
#22
normal_count

In [None]:
# Collapse each raw label to its group name. Groups are applied in this order and the
# mask is recomputed on the current labels each time, exactly as in three separate
# assignments.
for group_name in ("Background", "Normal", "Botnet"):
    in_group = filtered_df['Label'].str.contains(group_name)
    filtered_df.loc[in_group, 'Label'] = group_name

In [None]:
filtered_df['Label'].unique()

In [None]:
# Frequency-rank encoding of filtered_df: in each listed column the most frequent value
# is replaced by 1, the next most frequent by 2, and so on.
change_items = ['StartTime', 'Dur', 'Proto', 'SrcAddr', 'DstAddr', 'State', 'TotPkts', 'TotBytes', 'SrcBytes', 'Label']

for column_name in change_items:
    # Index of value_counts() is ordered from most to least frequent value.
    ordered_values = filtered_df[column_name].value_counts().index
    rank_of = {value: position for position, value in enumerate(ordered_values, start=1)}
    filtered_df[column_name] = filtered_df[column_name].map(rank_of)

In [None]:
filtered_df.head(5)

In [None]:
filtered_df.isnull().sum()

In [None]:
y_test.value_counts()

# Random Forest Modeling

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

In [None]:
# Random forest of depth-1 trees (decision stumps).
# random_state is fixed (same seed as the train/test split) so that the fitted model and
# the reported scores are reproducible; without it every run trains a different forest.
clf = RandomForestClassifier(max_depth = 1, random_state = 420)

In [None]:
# Fit on the balanced training set, then record the mean accuracy on that same set.
fitted_clf = clf.fit(X_train, y_train)
train_score = fitted_clf.score(X_train, y_train)
train_score

0.6961429017593781

In [None]:
# Predicted label codes for the balanced training set and for the untouched test set.
pred_train_y = clf.predict(X_train)
pred_test_y = clf.predict(X_test)

# Quick look at the test predictions.
print(pred_test_y)

[1 3 1 ... 1 1 1]


In [None]:
# Confusion matrix on the test set: rows are true labels, columns are predicted labels,
# both in sorted label order (codes 1 = Background, 2 = Normal, 3 = Botnet).
confusion_matrix(y_test, pred_test_y)

array([[1748895,  490837,  569080],
       [   1792,   38286,    5610],
       [   1222,    9731,   17739]])

In [None]:
from sklearn import metrics
import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# scikit-learn warnings about ignored or invalid metric arguments — consider narrowing
# the filter to a specific category or message.
warnings.filterwarnings('ignore')

In [None]:
# Test-set metrics for the three-class problem
# (label codes: 1 = Background, 2 = Normal, 3 = Botnet).
#
# labels=[1, 2, 3] pins the confusion matrix to 3x3, so the nine-way unpacking cannot
# fail when a class is missing. Rows are true labels, columns are predicted labels:
#   aa ao an   <- true Background
#   oa oo on   <- true Normal
#   na no nn   <- true Botnet
aa, ao, an, oa, oo, on, na, no, nn = confusion_matrix(y_test, pred_test_y, labels=[1, 2, 3]).ravel()
print(f"Test Accuracy is {accuracy_score(y_test, pred_test_y)}")
# Per-class Botnet metrics use the third row/column of the matrix:
#print(f"precision for botnet is {nn / (an + on + nn)}")
#print(f"recall for botnet is {nn / (na + no + nn)}")
# Macro averaging (unweighted mean over the three classes), consistent with the F1 score
# below. Micro-averaged precision/recall are identical to accuracy in a single-label
# multiclass problem, and pos_label only applies to average='binary', so it is not passed.
print(f"Test Precision is {precision_score(y_test, pred_test_y, average='macro')}")
print(f"Test Recall is {recall_score(y_test, pred_test_y, average='macro')}")
print()
print(f"Train Accuracy is {train_score}")
print(f"F1 score is {f1_score(y_test, pred_test_y, average='macro')}")


Test Accuracy is 0.6260145005951737
Test Precision is 0.6260145005951737
Test Recall is 0.6260145005951737

Train Accuracy is 0.6961429017593781
F1 score is 0.31835070549478567


TODO: try different combinations and numbers of features, track the accuracy of each run, and plot the results in a spreadsheet or with a Python graph.