# Creating

In [8]:
import pandas as pd
import numpy as np

In [301]:
input_dir = "./UNSW/Raw"
output_file = "./UNSW/Raw/UNSW-NB15_preprocessed.csv"

In [302]:
# Reading of all 4 csv files of UNSW
dfs = []
for i in range(1, 5):
    path = input_dir + f"/UNSW-NB15_{i}.csv"  # There are 4 input csv files
    dfs.append(pd.read_csv(path, header=None, low_memory=False))
all_data = pd.concat(dfs).reset_index(drop=True)

# Adding Column names to the CSV file
df_col = pd.read_csv(input_dir + "/NUSW-NB15_features.csv", encoding="ISO-8859-1")
df_col["Name"] = df_col["Name"].apply(lambda x: x.strip().replace(" ", "").lower())
all_data.columns = df_col["Name"]
print(all_data.shape)

(2540047, 49)


In [303]:
all_data["attack_cat"] = all_data.attack_cat.fillna(value="normal").apply(lambda x: x.strip().lower())

In [304]:
all_data["attack_cat"] = all_data["attack_cat"].replace("backdoors", "backdoor", regex=True).apply(lambda x: x.strip().lower())

In [305]:
all_data["service"] = all_data["service"].apply(lambda x: "None" if x == "-" else x)

In [306]:
all_data["ct_ftp_cmd"] = all_data["ct_ftp_cmd"].apply(lambda x: 0 if x == " " else x).astype(int)

In [307]:
all_data.rename(columns={'srcip': 'source_ip', 'dstip': 'destination_ip', 'sport': 'source_port', 'dsport': 'destination_port', 'proto': 'protocol', 'attack_cat':'attack_label', 'label':'binary_label'}, inplace=True)

In [308]:
all_data.drop_duplicates(inplace=True)

In [309]:
all_data

Name,source_ip,source_port,destination_ip,destination_port,protocol,state,dur,sbytes,dbytes,sttl,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_label,binary_label
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,0,3,7,1,3,1,1,1,normal,0
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,0,2,4,2,3,1,1,2,normal,0
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,0,12,8,1,2,2,1,1,normal,0
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,0,6,9,1,1,1,1,1,normal,0
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,0,7,9,1,1,1,1,1,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540039,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,31,...,0,1,1,4,2,2,2,2,normal,0
2540041,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,31,...,0,2,1,4,2,2,2,2,normal,0
2540042,59.166.0.5,33094,149.171.126.7,43433,tcp,FIN,0.087306,320,1828,31,...,0,1,2,3,3,1,1,3,normal,0
2540045,59.166.0.9,35433,149.171.126.0,80,tcp,CON,2.200934,3498,166054,31,...,0,1,1,2,4,2,2,2,normal,0


In [310]:
start_value = 1
end_value = start_value + len(all_data)
all_data['flow_id'] = range(start_value, end_value)

In [311]:
all_data

Name,source_ip,source_port,destination_ip,destination_port,protocol,state,dur,sbytes,dbytes,sttl,...,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_label,binary_label,flow_id
0,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,31,...,3,7,1,3,1,1,1,normal,0,1
1,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,31,...,2,4,2,3,1,1,2,normal,0,2
2,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,31,...,12,8,1,2,2,1,1,normal,0,3
3,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,31,...,6,9,1,1,1,1,1,normal,0,4
4,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,31,...,7,9,1,1,1,1,1,normal,0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2540039,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,31,...,1,1,4,2,2,2,2,normal,0,2059411
2540041,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,31,...,2,1,4,2,2,2,2,normal,0,2059412
2540042,59.166.0.5,33094,149.171.126.7,43433,tcp,FIN,0.087306,320,1828,31,...,1,2,3,3,1,1,3,normal,0,2059413
2540045,59.166.0.9,35433,149.171.126.0,80,tcp,CON,2.200934,3498,166054,31,...,1,1,2,4,2,2,2,normal,0,2059414


In [312]:
flow_id = all_data.pop('flow_id')
all_data.insert(0, 'flow_id', flow_id)

In [313]:
all_data.reset_index(drop=True, inplace=True)

In [314]:
all_data.rename_axis(None, inplace=True)

In [315]:
all_data

Name,flow_id,source_ip,source_port,destination_ip,destination_port,protocol,state,dur,sbytes,dbytes,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_label,binary_label
0,1,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,...,0,3,7,1,3,1,1,1,normal,0
1,2,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,...,0,2,4,2,3,1,1,2,normal,0
2,3,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,...,0,12,8,1,2,2,1,1,normal,0
3,4,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,...,0,6,9,1,1,1,1,1,normal,0
4,5,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,...,0,7,9,1,1,1,1,1,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059410,2059411,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,...,0,1,1,4,2,2,2,2,normal,0
2059411,2059412,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,...,0,2,1,4,2,2,2,2,normal,0
2059412,2059413,59.166.0.5,33094,149.171.126.7,43433,tcp,FIN,0.087306,320,1828,...,0,1,2,3,3,1,1,3,normal,0
2059413,2059414,59.166.0.9,35433,149.171.126.0,80,tcp,CON,2.200934,3498,166054,...,0,1,1,2,4,2,2,2,normal,0


In [316]:
all_data.to_csv('./UNSW/Export/UNSW_Flow.csv', index=False)

# Processing

In [3]:
df = pd.read_csv('./UNSW/output11.csv')

  df = pd.read_csv('./UNSW/output11.csv')


In [4]:
df.rename(columns={'packet': 'packet_hex', 'payload': 'payload_hex', 'srcip': 'source_ip', 'dstip': 'destination_ip', 'sport': 'source_port', 'dsport': 'destination_port', 'protocol_m': 'protocol'}, inplace=True)

In [5]:
df

Unnamed: 0,stime,source_ip,source_port,destination_ip,destination_port,protocol,sttl,total_len,first_layer,packet_hex,...,DNS labels,DNS inception,DNS keytag,DNS signature,DNS nextname,TFTP Ack block,TFTP_Options oname,TFTP_Options value,TFTP Data block,LDAP present
0,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,29.0,180.0,cooked linux,000400010006005056a5776300000800450000b4256940...,...,,,,,,,,,,
1,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,30.0,52.0,cooked linux,000000010006001b17059e1c0000080045000034256840...,...,,,,,,,,,,
2,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,29.0,52.0,cooked linux,000400010006005056a577630000080045000034256840...,...,,,,,,,,,,
3,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,30.0,73.0,cooked linux,000000010006001b17059e1c0000080045000049256b40...,...,,,,,,,,,,
4,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,29.0,73.0,cooked linux,000400010006005056a577630000080045000049256b40...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,1.424230e+09,59.166.0.2,32339.0,149.171.126.5,6881.0,tcp,32.0,52.0,cooked linux,000000010006021ac5000000000008004500003428f240...,...,,,,,,,,,,
9999996,1.424230e+09,59.166.0.2,32339.0,149.171.126.5,6881.0,tcp,32.0,52.0,cooked linux,000000010006021ac5000000000008004500003428fe40...,...,,,,,,,,,,
9999997,1.424230e+09,59.166.0.2,32339.0,149.171.126.5,6881.0,tcp,31.0,52.0,cooked linux,000400010006005056a524c2000008004500003428fe40...,...,,,,,,,,,,
9999998,1.424230e+09,59.166.0.3,43805.0,149.171.126.9,143.0,tcp,32.0,52.0,cooked linux,000000010006021ac50000000000080045000034865d40...,...,,,,,,,,,,


In [9]:
flow1 = pd.read_csv('./UNSW/Export/UNSW_Flow.csv')

In [4]:
flow1

Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol,state,dur,sbytes,dbytes,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_label,binary_label
0,1,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,...,0,3,7,1,3,1,1,1,normal,0
1,2,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,...,0,2,4,2,3,1,1,2,normal,0
2,3,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,...,0,12,8,1,2,2,1,1,normal,0
3,4,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,...,0,6,9,1,1,1,1,1,normal,0
4,5,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,...,0,7,9,1,1,1,1,1,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059410,2059411,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,...,0,1,1,4,2,2,2,2,normal,0
2059411,2059412,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,...,0,2,1,4,2,2,2,2,normal,0
2059412,2059413,59.166.0.5,33094,149.171.126.7,43433,tcp,FIN,0.087306,320,1828,...,0,1,2,3,3,1,1,3,normal,0
2059413,2059414,59.166.0.9,35433,149.171.126.0,80,tcp,CON,2.200934,3498,166054,...,0,1,1,2,4,2,2,2,normal,0


In [8]:
flow1 = flow1[['flow_id', 'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol']]

In [9]:
flow1

Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol
0,1,59.166.0.0,1390,149.171.126.6,53,udp
1,2,59.166.0.0,33661,149.171.126.9,1024,udp
2,3,59.166.0.6,1464,149.171.126.7,53,udp
3,4,59.166.0.5,3593,149.171.126.5,53,udp
4,5,59.166.0.3,49664,149.171.126.0,53,udp
...,...,...,...,...,...,...
2059410,2059411,59.166.0.1,38606,149.171.126.9,80,tcp
2059411,2059412,59.166.0.1,38606,149.171.126.9,80,tcp
2059412,2059413,59.166.0.5,33094,149.171.126.7,43433,tcp
2059413,2059414,59.166.0.9,35433,149.171.126.0,80,tcp


In [10]:
flow2 = flow1.copy()

In [11]:
flow2.rename(columns={'source_ip': 'destination_ip', 'destination_ip': 'source_ip', 'source_port': 'destination_port', 'destination_port': 'source_port'}, inplace=True)

In [12]:
flow2

Unnamed: 0,flow_id,destination_ip,destination_port,source_ip,source_port,protocol
0,1,59.166.0.0,1390,149.171.126.6,53,udp
1,2,59.166.0.0,33661,149.171.126.9,1024,udp
2,3,59.166.0.6,1464,149.171.126.7,53,udp
3,4,59.166.0.5,3593,149.171.126.5,53,udp
4,5,59.166.0.3,49664,149.171.126.0,53,udp
...,...,...,...,...,...,...
2059410,2059411,59.166.0.1,38606,149.171.126.9,80,tcp
2059411,2059412,59.166.0.1,38606,149.171.126.9,80,tcp
2059412,2059413,59.166.0.5,33094,149.171.126.7,43433,tcp
2059413,2059414,59.166.0.9,35433,149.171.126.0,80,tcp


In [13]:
flow = pd.concat([flow1, flow2])

In [14]:
flow

Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol
0,1,59.166.0.0,1390,149.171.126.6,53,udp
1,2,59.166.0.0,33661,149.171.126.9,1024,udp
2,3,59.166.0.6,1464,149.171.126.7,53,udp
3,4,59.166.0.5,3593,149.171.126.5,53,udp
4,5,59.166.0.3,49664,149.171.126.0,53,udp
...,...,...,...,...,...,...
2059410,2059411,149.171.126.9,80,59.166.0.1,38606,tcp
2059411,2059412,149.171.126.9,80,59.166.0.1,38606,tcp
2059412,2059413,149.171.126.7,43433,59.166.0.5,33094,tcp
2059413,2059414,149.171.126.0,80,59.166.0.9,35433,tcp


In [16]:
flow.drop_duplicates(subset=flow.columns.difference(['flow_id']), inplace=True)

In [17]:
flow

Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol
0,1,59.166.0.0,1390,149.171.126.6,53,udp
1,2,59.166.0.0,33661,149.171.126.9,1024,udp
2,3,59.166.0.6,1464,149.171.126.7,53,udp
3,4,59.166.0.5,3593,149.171.126.5,53,udp
4,5,59.166.0.3,49664,149.171.126.0,53,udp
...,...,...,...,...,...,...
2059406,2059407,149.171.126.4,21,59.166.0.7,20848,tcp
2059407,2059408,149.171.126.2,80,59.166.0.4,59563,tcp
2059408,2059409,149.171.126.0,80,59.166.0.9,35433,tcp
2059410,2059411,149.171.126.9,80,59.166.0.1,38606,tcp


In [18]:
df

Unnamed: 0,stime,source_ip,source_port,destination_ip,destination_port,protocol,sttl,total_len,first_layer,packet_hex,...,DNS labels,DNS inception,DNS keytag,DNS signature,DNS nextname,TFTP Ack block,TFTP_Options oname,TFTP_Options value,TFTP Data block,LDAP present
0,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,29.0,180.0,cooked linux,000400010006005056a5776300000800450000b4256940...,...,,,,,,,,,,
1,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,30.0,52.0,cooked linux,000000010006001b17059e1c0000080045000034256840...,...,,,,,,,,,,
2,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,29.0,52.0,cooked linux,000400010006005056a577630000080045000034256840...,...,,,,,,,,,,
3,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,30.0,73.0,cooked linux,000000010006001b17059e1c0000080045000049256b40...,...,,,,,,,,,,
4,1.424225e+09,149.171.126.1,143.0,59.166.0.9,35632.0,tcp,29.0,73.0,cooked linux,000400010006005056a577630000080045000049256b40...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,1.424230e+09,59.166.0.2,32339.0,149.171.126.5,6881.0,tcp,32.0,52.0,cooked linux,000000010006021ac5000000000008004500003428f240...,...,,,,,,,,,,
9999996,1.424230e+09,59.166.0.2,32339.0,149.171.126.5,6881.0,tcp,32.0,52.0,cooked linux,000000010006021ac5000000000008004500003428fe40...,...,,,,,,,,,,
9999997,1.424230e+09,59.166.0.2,32339.0,149.171.126.5,6881.0,tcp,31.0,52.0,cooked linux,000400010006005056a524c2000008004500003428fe40...,...,,,,,,,,,,
9999998,1.424230e+09,59.166.0.3,43805.0,149.171.126.9,143.0,tcp,32.0,52.0,cooked linux,000000010006021ac50000000000080045000034865d40...,...,,,,,,,,,,


In [19]:
df['source_port'] = df['source_port'].astype(int)
df['destination_port'] = df['destination_port'].astype(int)

In [20]:
columns_to_match = ['source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol']

In [21]:
merged_df = df.merge(flow, on=columns_to_match, how='left')

In [22]:
merged_df

Unnamed: 0,stime,source_ip,source_port,destination_ip,destination_port,protocol,sttl,total_len,first_layer,packet_hex,...,DNS inception,DNS keytag,DNS signature,DNS nextname,TFTP Ack block,TFTP_Options oname,TFTP_Options value,TFTP Data block,LDAP present,flow_id
0,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,29.0,180.0,cooked linux,000400010006005056a5776300000800450000b4256940...,...,,,,,,,,,,1127213
1,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,30.0,52.0,cooked linux,000000010006001b17059e1c0000080045000034256840...,...,,,,,,,,,,1127213
2,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,29.0,52.0,cooked linux,000400010006005056a577630000080045000034256840...,...,,,,,,,,,,1127213
3,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,30.0,73.0,cooked linux,000000010006001b17059e1c0000080045000049256b40...,...,,,,,,,,,,1127213
4,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,29.0,73.0,cooked linux,000400010006005056a577630000080045000049256b40...,...,,,,,,,,,,1127213
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,1.424230e+09,59.166.0.2,32339,149.171.126.5,6881,tcp,32.0,52.0,cooked linux,000000010006021ac5000000000008004500003428f240...,...,,,,,,,,,,1241519
9999996,1.424230e+09,59.166.0.2,32339,149.171.126.5,6881,tcp,32.0,52.0,cooked linux,000000010006021ac5000000000008004500003428fe40...,...,,,,,,,,,,1241519
9999997,1.424230e+09,59.166.0.2,32339,149.171.126.5,6881,tcp,31.0,52.0,cooked linux,000400010006005056a524c2000008004500003428fe40...,...,,,,,,,,,,1241519
9999998,1.424230e+09,59.166.0.3,43805,149.171.126.9,143,tcp,32.0,52.0,cooked linux,000000010006021ac50000000000080045000034865d40...,...,,,,,,,,,,1241468


In [23]:
flow_id = merged_df.pop('flow_id')
merged_df.insert(0, 'flow_id', flow_id)

  merged_df.insert(0, 'flow_id', flow_id)


In [24]:
merged_df

Unnamed: 0,flow_id,stime,source_ip,source_port,destination_ip,destination_port,protocol,sttl,total_len,first_layer,...,DNS labels,DNS inception,DNS keytag,DNS signature,DNS nextname,TFTP Ack block,TFTP_Options oname,TFTP_Options value,TFTP Data block,LDAP present
0,1127213,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,29.0,180.0,cooked linux,...,,,,,,,,,,
1,1127213,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,30.0,52.0,cooked linux,...,,,,,,,,,,
2,1127213,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,29.0,52.0,cooked linux,...,,,,,,,,,,
3,1127213,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,30.0,73.0,cooked linux,...,,,,,,,,,,
4,1127213,1.424225e+09,149.171.126.1,143,59.166.0.9,35632,tcp,29.0,73.0,cooked linux,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,1241519,1.424230e+09,59.166.0.2,32339,149.171.126.5,6881,tcp,32.0,52.0,cooked linux,...,,,,,,,,,,
9999996,1241519,1.424230e+09,59.166.0.2,32339,149.171.126.5,6881,tcp,32.0,52.0,cooked linux,...,,,,,,,,,,
9999997,1241519,1.424230e+09,59.166.0.2,32339,149.171.126.5,6881,tcp,31.0,52.0,cooked linux,...,,,,,,,,,,
9999998,1241468,1.424230e+09,59.166.0.3,43805,149.171.126.9,143,tcp,32.0,52.0,cooked linux,...,,,,,,,,,,


# Testing

In [25]:
merged_df.flow_id.isna().sum()

0

In [271]:
merged_df.iloc[13]

stime                                1421927414
source_ip                           10.40.182.3
source_port                                   0
destination_ip                      10.40.182.1
destination_port                              0
                                       ...     
HSRP MD5 Authentication sourceip            NaN
HSRP MD5 Authentication keyid               NaN
SCTPChunkInit addr                          NaN
flow_id                                148629.0
protocol_y                                  arp
Name: 13, Length: 389, dtype: object

In [254]:
flow[flow.protocol=='others']

Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol


In [272]:
flow[(flow['source_ip']=='10.40.182.3') & (flow['destination_ip']=='10.40.182.1') & (flow['source_port']==0)]

Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol
1966462,521,10.40.182.3,0,10.40.182.1,0,others


In [212]:
flow[(flow['source_ip']=='59.166.0.3') & (flow['destination_ip']=='149.171.126.4') & (flow['source_port']==6103) & (flow['destination_port']=='52633')]

Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol
111536,115902,59.166.0.3,6103,149.171.126.4,52633,tcp


In [175]:
merged_df.flow_id.value_counts(dropna=False)

NaN         9935521
664410.0       1904
791440.0       1312
806229.0       1308
752940.0       1308
             ...   
346583.0          4
723549.0          4
821934.0          2
189265.0          2
149675.0          2
Name: flow_id, Length: 847, dtype: int64

In [15]:
from pycaret.classification import *
s = setup(flow1, target = 'attack_label', normalize = True, fix_imbalance=True, use_gpu=True, session_id = 123)

[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recomp

Unnamed: 0,Description,Value
0,Session id,123
1,Target,attack_label
2,Target type,Multiclass
3,Target mapping,"analysis: 0, backdoor: 1, dos: 2, exploits: 3, fuzzers: 4, generic: 5, normal: 6, reconnaissance: 7, shellcode: 8, worms: 9"
4,Original data shape,"(2059415, 50)"
5,Transformed data shape,"(14336225, 10)"
6,Transformed train set shape,"(13718400, 10)"
7,Transformed test set shape,"(617825, 10)"
8,Numeric features,44
9,Categorical features,5


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


In [16]:
rf = create_model('rf')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9813,0.9985,0.9813,0.984,0.9824,0.8056,0.806
1,0.9817,0.9986,0.9817,0.9844,0.9828,0.8095,0.8098
2,0.982,0.9984,0.982,0.9845,0.9831,0.8138,0.8141
3,0.982,0.9983,0.982,0.9846,0.9831,0.8131,0.8134
4,0.9815,0.9985,0.9815,0.9841,0.9827,0.8082,0.8085
5,0.9816,0.9985,0.9816,0.984,0.9827,0.8084,0.8086
6,0.9814,0.9985,0.9814,0.984,0.9825,0.8074,0.8077
7,0.9809,0.9985,0.9809,0.9837,0.9822,0.8027,0.8031
8,0.9805,0.9987,0.9805,0.9838,0.9819,0.8006,0.8013
9,0.9815,0.9986,0.9815,0.984,0.9825,0.8074,0.8076


In [17]:
evaluate_model(rf)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [18]:
rf.feature_importances_

array([0.23158914, 0.1070963 , 0.17587187, 0.05399162, 0.04281908,
       0.03992106, 0.09035387, 0.13516337, 0.12319368])

In [3]:
flow1

Unnamed: 0,flow_id,source_ip,source_port,destination_ip,destination_port,protocol,state,dur,sbytes,dbytes,...,ct_ftp_cmd,ct_srv_src,ct_srv_dst,ct_dst_ltm,ct_src_ltm,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,attack_label,binary_label
0,1,59.166.0.0,1390,149.171.126.6,53,udp,CON,0.001055,132,164,...,0,3,7,1,3,1,1,1,normal,0
1,2,59.166.0.0,33661,149.171.126.9,1024,udp,CON,0.036133,528,304,...,0,2,4,2,3,1,1,2,normal,0
2,3,59.166.0.6,1464,149.171.126.7,53,udp,CON,0.001119,146,178,...,0,12,8,1,2,2,1,1,normal,0
3,4,59.166.0.5,3593,149.171.126.5,53,udp,CON,0.001209,132,164,...,0,6,9,1,1,1,1,1,normal,0
4,5,59.166.0.3,49664,149.171.126.0,53,udp,CON,0.001169,146,178,...,0,7,9,1,1,1,1,1,normal,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059410,2059411,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,...,0,1,1,4,2,2,2,2,normal,0
2059411,2059412,59.166.0.1,38606,149.171.126.9,80,tcp,CON,0.564998,14106,772406,...,0,2,1,4,2,2,2,2,normal,0
2059412,2059413,59.166.0.5,33094,149.171.126.7,43433,tcp,FIN,0.087306,320,1828,...,0,1,2,3,3,1,1,3,normal,0
2059413,2059414,59.166.0.9,35433,149.171.126.0,80,tcp,CON,2.200934,3498,166054,...,0,1,1,2,4,2,2,2,normal,0


In [10]:
flow1 = flow1[['source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol', 'dur', 'sbytes', 'dbytes', 'sload', 'dload', 'spkts', 'dpkts', 'attack_label']]

In [11]:
flow1

Unnamed: 0,source_ip,source_port,destination_ip,destination_port,protocol,dur,sbytes,dbytes,sload,dload,spkts,dpkts,attack_label
0,59.166.0.0,1390,149.171.126.6,53,udp,0.001055,132,164,500473.937500,6.218009e+05,2,2,normal
1,59.166.0.0,33661,149.171.126.9,1024,udp,0.036133,528,304,87676.085940,5.048017e+04,4,4,normal
2,59.166.0.6,1464,149.171.126.7,53,udp,0.001119,146,178,521894.531300,6.362824e+05,2,2,normal
3,59.166.0.5,3593,149.171.126.5,53,udp,0.001209,132,164,436724.562500,5.425972e+05,2,2,normal
4,59.166.0.3,49664,149.171.126.0,53,udp,0.001169,146,178,499572.250000,6.090676e+05,2,2,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2059410,59.166.0.1,38606,149.171.126.9,80,tcp,0.564998,14106,772406,198981.250000,1.091598e+07,262,526,normal
2059411,59.166.0.1,38606,149.171.126.9,80,tcp,0.564998,14106,772406,198981.250000,1.091598e+07,262,526,normal
2059412,59.166.0.5,33094,149.171.126.7,43433,tcp,0.087306,320,1828,24465.671880,1.466108e+05,6,8,normal
2059413,59.166.0.9,35433,149.171.126.0,80,tcp,2.200934,3498,166054,12496.513670,5.983751e+05,58,116,normal


In [12]:
flow1.to_csv('./unsw_flow1.csv', index=False)