In [1]:
import pandas as pd
from datetime import datetime
import pytz
from scapy.all import Ether, CookedLinux, Raw
import re
import logging
import pyarrow
import random
import os

In [2]:
# Timezone used by write_log to stamp each log line (US Eastern).
edt = pytz.timezone('US/Eastern')
def write_log(message):
  """Append one timestamped line to LOG_UNSW.txt in the working directory.

  The timestamp is the current time in the module-level ``edt`` timezone,
  formatted as ``YYYY-MM-DD HH:MM:SS`` and followed by `` : `` and the message.

  Args:
    message: Text to record; it is interpolated into the log line.
  """
  current_time = datetime.now(edt).strftime('%Y-%m-%d %H:%M:%S')
  # The context manager closes the file even if the write raises.
  with open("LOG_UNSW.txt", "a") as log_file:
    log_file.write(f'{current_time} : {message}\n')

# Create CSV Files

In [2]:
# Load the whole of UNSW.csv into memory at once (the preview below ends at
# row index 179,759,344, so this is a very large read).
main_df = pd.read_csv('UNSW.csv')

In [3]:
# Preview the raw table: one row per captured packet, with the packet and
# payload columns holding long hex-looking strings.
main_df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,payload,t_delta,stime_flow,attack_cat,label,ltime_max
0,1421927377,10.40.182.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a524c20000080045c00040ef1500...,0201002cc0a8f1f30000000089d8000000000000000000...,0.0,1.421927e+09,normal,0.0,1.421927e+09
1,1421927377,10.40.85.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a577630000080045c00040ef1400...,0201002cc0a8f1f300000000ead8000000000000000000...,0.0,1.421927e+09,normal,0.0,1.421927e+09
2,1421927387,10.40.85.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a577630000080045c00040ef1600...,0201002cc0a8f1f300000000ead8000000000000000000...,6.0,1.421927e+09,normal,0.0,1.421927e+09
3,1421927387,10.40.182.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a524c20000080045c00040ef1700...,0201002cc0a8f1f30000000089d8000000000000000000...,0.0,1.421927e+09,normal,0.0,1.421927e+09
4,1421927397,10.40.182.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a524c20000080045c00040ef1900...,0201002cc0a8f1f30000000089d8000000000000000000...,0.0,1.421927e+09,normal,0.0,1.421927e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179759341,1424262069,149.171.126.9,80,59.166.0.1,38606,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dc6d5e40...,f8d7033e6b99c39e74a4671e7fe282b3d233e5647a2124...,0.0,1.424262e+09,normal,0.0,1.424262e+09
179759342,1424262069,149.171.126.9,80,59.166.0.1,38606,tcp,29,1500,cooked linux,000400010006005056a5776300000800450005dc6d5f40...,6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...,0.0,1.424262e+09,normal,0.0,1.424262e+09
179759343,1424262069,149.171.126.9,80,59.166.0.1,38606,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dc6d5f40...,6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...,0.0,1.424262e+09,normal,0.0,1.424262e+09
179759344,1424262069,149.171.126.9,80,59.166.0.1,38606,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dc6d5840...,ddb752b3de517a85324680f2342a51a774932569685b10...,0.0,1.424262e+09,normal,0.0,1.424262e+09


In [None]:
# Decode every packet in UNSW.csv with scapy, one chunk at a time, and write the
# original columns plus one column per decoded header field to
# ./UNSW/output<chunk>.csv.  A chunk whose output file already exists is skipped,
# so an interrupted run can be resumed by re-running the cell.
chunk = 1
chunksize = 10_000_000
log_records = 100000  # write a progress line to the log every this many rows
# Matches the "  name = value" lines printed by scapy's show(dump=True).
field_pattern = re.compile(r"\s+([a-z_]+)\s+=\s+(.+)")

# to_csv does not create missing directories.
os.makedirs('./UNSW', exist_ok=True)

for main_df in pd.read_csv('UNSW.csv', chunksize=chunksize):

    write_log(f'<<<<<<<<----- Started Processing Chunk {chunk} ----->>>>>>>>')

    if os.path.isfile(f'./UNSW/output{chunk}.csv'):
        write_log(f'------------ Skipping DataFrame {chunk} as CSV File already exists ------------')
        chunk += 1
        continue

    packet_info = []
    packet_details = []

    # Pass 1: hex string -> scapy packet -> textual dump of all its layers.
    # Columns are read by name rather than by position, and iterated directly
    # instead of through per-cell iloc lookups.
    count = 0
    for packet_type, packet_hex in zip(main_df['first_layer'], main_df['packet']):
        if packet_type == 'cooked linux':
            details = CookedLinux(bytes.fromhex(packet_hex)).show(dump=True)
        elif packet_type == 'Ethernet':
            details = Ether(bytes.fromhex(packet_hex)).show(dump=True)
        else:
            # An unknown first layer used to fall through and re-dump the packet
            # left over from the previous row (or raise NameError on the first
            # row).  Record an empty dump instead: the row keeps its position
            # and simply gets no decoded fields.
            print('Error -> First Layer is not valid')
            details = ""
        packet_info.append(details)
        count += 1
        if count % log_records == 0:
            write_log(f'Packets Parsed: {count}')

    write_log(f'------------ All Packets Parsed Successfully for Chunk {chunk} ------------')

    # Pass 2: turn each dump into {layer name: {field name: value}}.
    count = 0
    for dump in packet_info:
        fields_values = {}
        current_layer = ""
        for line in dump.split("\n"):
            if line.startswith("###[") and "]" in line:
                # Layer header line, e.g. "###[ IP ]###".
                current_layer = line.split("]")[0].split("[")[1].strip()
                fields_values[current_layer] = {}
            elif current_layer != "":
                for field_name, field_value in field_pattern.findall(line):
                    fields_values[current_layer][field_name] = field_value
        packet_details.append(fields_values)
        count += 1
        if count % log_records == 0:
            write_log(f'Packet Fields Parsed: {count}')

    write_log(f'------------ All Packet Fields Parsed Successfully for Chunk {chunk} ------------')

    # Pass 3: flatten to one dict per packet keyed by "<layer> <field>".
    count = 0
    df_list = []
    for fields_values in packet_details:
        row = {}
        for layer, fields in fields_values.items():
            for field, value in fields.items():
                row[f"{layer} {field}"] = value
        df_list.append(row)
        count += 1
        if count % log_records == 0:
            write_log(f'Packets appended to DataFrame: {count}')

    df = pd.DataFrame(df_list)

    write_log(f'------------ All Packets appended to the DataFrame {chunk} ------------')

    # pd.concat(axis=1) aligns on the index.  Chunks after the first keep their
    # position in the full file as index (10_000_000, ...), while df is numbered
    # from 0, so without the reset the two halves never line up and the output
    # has twice the rows, each half-empty.
    # NOTE: files written from here on are row-aligned; any later step that
    # un-doubles previously written misaligned files must not be applied to them.
    df = pd.concat([main_df.reset_index(drop=True), df], axis=1)
    df.to_csv(f'./UNSW/output{chunk}.csv', index=False)

    write_log(f'------------ DataFrame {chunk} saved to CSV File ------------')

    chunk += 1



# Process CSV Files

In [None]:
# Repair and de-duplicate the per-chunk output files 13..18 in place.
#
# A misaligned file has twice the expected rows: the first half carries only the
# decoded packet columns and the second half only the 16 columns that came from
# UNSW.csv.  Realigning glues the two halves back together side by side.
n_source_columns = 16  # stime .. ltime_max, the columns read from UNSW.csv

for i in range(13,19):
    if not os.path.isfile(f'./UNSW/output{i}.csv'):
        continue
    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>')
    df = pd.read_csv(f'./UNSW/output{i}.csv')
    if df.shape[0]>10_000_000 or i==18:
        rows = df.shape[0]//2
        # The file is overwritten below, so re-running this cell would feed an
        # already repaired file back in.  Only realign when the data really has
        # the half-empty layout; otherwise the split would pair unrelated rows.
        looks_misaligned = (
            df.shape[0] % 2 == 0
            and bool(df.iloc[:rows, :n_source_columns].isna().all().all())
            and bool(df.iloc[rows:, n_source_columns:].isna().all().all())
        )
        if looks_misaligned:
            write_log(f'------------ Misaligned CSV File {i} Processing ------------')
            df1 = df.iloc[rows:,:n_source_columns]
            df2 = df.iloc[:rows,n_source_columns:]
            df = pd.concat([df1.reset_index(drop=True), df2.reset_index(drop=True)], axis=1)
        else:
            write_log(f'------------ CSV File {i} is not misaligned. Realignment skipped ------------')
    df = df.drop_duplicates()
    write_log(f'------------ CSV File {i} Processed. Final shape is {df.shape} ------------')
    df.to_csv(f'./UNSW/output{i}.csv', index=False)
    write_log(f'------------ CSV File {i} Overwritten and Saved ------------')

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


# Sampling Packets

In [2]:
# Reload the full UNSW.csv as `df` for the sampling steps in this section.
df = pd.read_csv('UNSW.csv')

In [3]:
# Preview the full table before sampling.
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,payload,t_delta,stime_flow,attack_cat,label,ltime_max
0,1421927377,10.40.182.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a524c20000080045c00040ef1500...,0201002cc0a8f1f30000000089d8000000000000000000...,0.0,1.421927e+09,normal,0.0,1.421927e+09
1,1421927377,10.40.85.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a577630000080045c00040ef1400...,0201002cc0a8f1f300000000ead8000000000000000000...,0.0,1.421927e+09,normal,0.0,1.421927e+09
2,1421927387,10.40.85.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a577630000080045c00040ef1600...,0201002cc0a8f1f300000000ead8000000000000000000...,6.0,1.421927e+09,normal,0.0,1.421927e+09
3,1421927387,10.40.182.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a524c20000080045c00040ef1700...,0201002cc0a8f1f30000000089d8000000000000000000...,0.0,1.421927e+09,normal,0.0,1.421927e+09
4,1421927397,10.40.182.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a524c20000080045c00040ef1900...,0201002cc0a8f1f30000000089d8000000000000000000...,0.0,1.421927e+09,normal,0.0,1.421927e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
179759341,1424262069,149.171.126.9,80,59.166.0.1,38606,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dc6d5e40...,f8d7033e6b99c39e74a4671e7fe282b3d233e5647a2124...,0.0,1.424262e+09,normal,0.0,1.424262e+09
179759342,1424262069,149.171.126.9,80,59.166.0.1,38606,tcp,29,1500,cooked linux,000400010006005056a5776300000800450005dc6d5f40...,6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...,0.0,1.424262e+09,normal,0.0,1.424262e+09
179759343,1424262069,149.171.126.9,80,59.166.0.1,38606,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dc6d5f40...,6af7cb077a1edf0aed333e2a960084ddb45dc2872efa98...,0.0,1.424262e+09,normal,0.0,1.424262e+09
179759344,1424262069,149.171.126.9,80,59.166.0.1,38606,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dc6d5840...,ddb752b3de517a85324680f2342a51a774932569685b10...,0.0,1.424262e+09,normal,0.0,1.424262e+09


In [4]:
# Packets per attack category; the output below shows 'normal' dominating by
# roughly two orders of magnitude, which is why the data is resampled next.
df['attack_cat'].value_counts()

normal            175822483
exploits            2317277
dos                  656620
fuzzers              421364
generic              333069
reconnaissance       150365
backdoor              16370
analysis              14183
shellcode             13983
worms                 13632
Name: attack_cat, dtype: int64

In [4]:
# Distinct attack categories, in order of first appearance in the file.
labels = df['attack_cat'].unique().tolist()

In [5]:
# Show the category list that drives the per-label sampling below.
labels

['normal',
 'reconnaissance',
 'exploits',
 'dos',
 'generic',
 'shellcode',
 'fuzzers',
 'worms',
 'backdoor',
 'analysis']

In [6]:
# Draw a fixed-size sample per attack category from rows that have a payload:
# 5400 'normal' packets and 600 of every other category.  A category with fewer
# eligible rows than its quota is sampled with replacement so it still reaches
# the quota.  The draw is unseeded, so each run selects different rows.
samples_per_label = {label: 5400 if label == 'normal' else 600 for label in labels}

sampled_parts = []
for label, n_samples in samples_per_label.items():
    candidates = df[(df['attack_cat'] == label) & (df['payload'].notnull())]
    # replace=True only when the category is too small for its quota.
    sampled_parts.append(
        candidates.sample(n_samples, replace=len(candidates) < n_samples)
    )

# Concatenate once at the end: DataFrame.append no longer exists in pandas 2.x
# and growing a frame inside the loop copies it on every iteration.
if sampled_parts:
    final_df = pd.concat(sampled_parts, ignore_index=True)
else:
    final_df = pd.DataFrame(columns=df.columns)

In [7]:
# Preview the sampled rows (renumbered from 0 by ignore_index=True above).
final_df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,payload,t_delta,stime_flow,attack_cat,label,ltime_max
0,1424229994,149.171.126.3,6881,59.166.0.8,37207,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dcd67040...,a9e203e374664b68d3d6ee23d2800acfb211df830ca3ae...,0.0,1.424230e+09,normal,0.0,1.424230e+09
1,1421928284,149.171.126.8,6881,59.166.0.6,56689,tcp,29,120,cooked linux,000400010006005056a57763000008004500007834f240...,13426974546f7272656e742070726f746f636f6c000000...,0.0,1.421928e+09,normal,0.0,1.421928e+09
2,1424239875,59.166.0.8,64300,149.171.126.9,16137,tcp,32,90,cooked linux,000000010006021ac5000000000008004500005ad68a40...,e32100000047cf2a8bebb3412359f087ea49aefa1623d5...,0.0,1.424240e+09,normal,0.0,1.424240e+09
3,1421944419,149.171.126.0,53,59.166.0.5,13425,udp,29,89,cooked linux,000400010006005056a577630000080045000059439040...,91d4818000010001000000000f7365727665722d393561...,0.0,1.421944e+09,normal,0.0,1.421944e+09
4,1421933389,149.171.126.6,22,59.166.0.2,53586,tcp,29,116,cooked linux,000400010006005056a577630000080045000074ab7b40...,c3431e5e00280a1a99940ecfd83defc7f1b300182e684c...,0.0,1.421933e+09,normal,0.0,1.421933e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10795,1424239582,149.171.126.11,80,175.45.176.3,31557,tcp,253,425,cooked linux,000000010006001b17059e1c00000800450001a990e700...,485454502f312e3120323030204f4b0d0a446174653a20...,0.0,1.424240e+09,analysis,1.0,1.424240e+09
10796,1424249492,175.45.176.1,0,149.171.126.15,0,ospf,254,108,cooked linux,000400010006005056a524c2000008004500006cc27200...,020100586ec85ed86ec85ed0bd0700005c7c002116fb15...,0.0,1.424249e+09,analysis,1.0,1.424250e+09
10797,1424249610,175.45.176.1,0,149.171.126.15,0,ospf,255,108,cooked linux,000000010006021ac5000000000008004500006cc28d00...,020100586ec85ed86ec85ed011eb0002a9a1ff615726a0...,0.0,1.424250e+09,analysis,1.0,1.424250e+09
10798,1424252908,175.45.176.2,0,149.171.126.16,0,ospf,255,64,cooked linux,000000010006021ac50000000000080045000040e97e00...,0201002c6ec85ed86ec85ed00a59000127000000000000...,0.0,1.424253e+09,analysis,1.0,1.424253e+09


In [8]:
# Check the per-category quotas: 5400 'normal' and 600 for every other label.
final_df['attack_cat'].value_counts()

normal            5400
reconnaissance     600
exploits           600
dos                600
generic            600
shellcode          600
fuzzers            600
worms              600
backdoor           600
analysis           600
Name: attack_cat, dtype: int64

In [9]:
# Decode each sampled packet with scapy and build df1, a frame with one row per
# packet and one column per decoded header field, named "<layer> <field>".
# Matches the "  name = value" lines printed by scapy's show(dump=True).
field_pattern = re.compile(r"\s+([a-z_]+)\s+=\s+(.+)")

packet_info = []
packet_details = []

# Step 1: hex string -> scapy packet -> textual dump of all its layers.
for packet_type, packet_hex in zip(final_df['first_layer'], final_df['packet']):
    if packet_type == 'cooked linux':
        details = CookedLinux(bytes.fromhex(packet_hex)).show(dump=True)
    elif packet_type == 'Ethernet':
        details = Ether(bytes.fromhex(packet_hex)).show(dump=True)
    else:
        # An unknown first layer used to fall through and re-dump the packet left
        # over from the previous row (or raise NameError on the first row).
        # Record an empty dump so the row keeps its position with no fields.
        print('Error -> First Layer is not valid')
        details = ""
    packet_info.append(details)

# Step 2: turn each dump into {layer name: {field name: value}}.
for dump in packet_info:
    fields_values = {}
    current_layer = ""
    for line in dump.split("\n"):
        if line.startswith("###[") and "]" in line:
            # Layer header line, e.g. "###[ IP ]###".
            current_layer = line.split("]")[0].split("[")[1].strip()
            fields_values[current_layer] = {}
        elif current_layer != "":
            for field_name, field_value in field_pattern.findall(line):
                fields_values[current_layer][field_name] = field_value
    packet_details.append(fields_values)

# Step 3: flatten to one dict per packet and build the frame in a single call
# rather than concatenating one single-row DataFrame per packet.
df_list = []
for fields_values in packet_details:
    row = {}
    for layer, fields in fields_values.items():
        for field, value in fields.items():
            row[f"{layer} {field}"] = value
    df_list.append(row)
df1 = pd.DataFrame(df_list)

In [10]:
# Preview the decoded header fields; columns for layers a packet does not have
# stay empty for that row.
df1

Unnamed: 0,cooked linux pkttype,cooked linux lladdrlen,cooked linux src,cooked linux proto,IP version,IP ihl,IP tos,IP len,IP id,IP flags,...,SCTPChunkData type,SCTPChunkData reserved,SCTPChunkData unordered,SCTPChunkData beginning,SCTPChunkData ending,SCTPChunkData len,SCTPChunkData tsn,SCTPChunkData stream_id,SCTPChunkData proto_id,SCTPChunkData data
0,unicast,6,'\x00\x1b\x17\x05\\x9e\x1c',IPv4,4,5,0x0,1500,54896,DF,...,,,,,,,,,,
1,sent-by-us,6,'\x00PV\\xa5wc',IPv4,4,5,0x0,120,13554,DF,...,,,,,,,,,,
2,unicast,6,'\x02\x1a\\xc5',IPv4,4,5,0x0,90,54922,DF,...,,,,,,,,,,
3,sent-by-us,6,'\x00PV\\xa5wc',IPv4,4,5,0x0,89,17296,DF,...,,,,,,,,,,
4,sent-by-us,6,'\x00PV\\xa5wc',IPv4,4,5,0x0,116,43899,DF,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10795,unicast,6,'\x00\x1b\x17\x05\\x9e\x1c',IPv4,4,5,0x0,425,37095,,...,,,,,,,,,,
10796,sent-by-us,6,'\x00PV\\xa5$\\xc2',IPv4,4,5,0x0,108,49778,,...,,,,,,,,,,
10797,unicast,6,'\x02\x1a\\xc5',IPv4,4,5,0x0,108,49805,,...,,,,,,,,,,
10798,unicast,6,'\x02\x1a\\xc5',IPv4,4,5,0x0,64,59774,,...,,,,,,,,,,


In [11]:
# Put the decoded fields next to the original columns.  concat(axis=1) aligns on
# the index; both frames were built above with a fresh 0..n-1 index.
df2 = pd.concat([final_df, df1], axis=1)

In [12]:
# Preview the combined frame (original columns followed by decoded fields).
df2

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,SCTPChunkData type,SCTPChunkData reserved,SCTPChunkData unordered,SCTPChunkData beginning,SCTPChunkData ending,SCTPChunkData len,SCTPChunkData tsn,SCTPChunkData stream_id,SCTPChunkData proto_id,SCTPChunkData data
0,1424229994,149.171.126.3,6881,59.166.0.8,37207,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dcd67040...,...,,,,,,,,,,
1,1421928284,149.171.126.8,6881,59.166.0.6,56689,tcp,29,120,cooked linux,000400010006005056a57763000008004500007834f240...,...,,,,,,,,,,
2,1424239875,59.166.0.8,64300,149.171.126.9,16137,tcp,32,90,cooked linux,000000010006021ac5000000000008004500005ad68a40...,...,,,,,,,,,,
3,1421944419,149.171.126.0,53,59.166.0.5,13425,udp,29,89,cooked linux,000400010006005056a577630000080045000059439040...,...,,,,,,,,,,
4,1421933389,149.171.126.6,22,59.166.0.2,53586,tcp,29,116,cooked linux,000400010006005056a577630000080045000074ab7b40...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10795,1424239582,149.171.126.11,80,175.45.176.3,31557,tcp,253,425,cooked linux,000000010006001b17059e1c00000800450001a990e700...,...,,,,,,,,,,
10796,1424249492,175.45.176.1,0,149.171.126.15,0,ospf,254,108,cooked linux,000400010006005056a524c2000008004500006cc27200...,...,,,,,,,,,,
10797,1424249610,175.45.176.1,0,149.171.126.15,0,ospf,255,108,cooked linux,000000010006021ac5000000000008004500006cc28d00...,...,,,,,,,,,,
10798,1424252908,175.45.176.2,0,149.171.126.16,0,ospf,255,64,cooked linux,000000010006021ac50000000000080045000040e97e00...,...,,,,,,,,,,


In [13]:
# Spot-check one packet: show only the fields that are populated for the row at
# position 7325.
df2.iloc[7325].dropna()

stime                                                            1424227938
srcip                                                          175.45.176.2
sport                                                                 58075
dstip                                                        149.171.126.17
dsport                                                                 7002
protocol_m                                                              tcp
sttl                                                                    255
total_len                                                                40
first_layer                                                    cooked linux
packet                    000000010006021ac50000000000080045000028087b00...
payload                                                        3d30303d3030
t_delta                                                                 0.0
stime_flow                                                     1424227932.0
attack_cat  

In [14]:
# Save the sampled and decoded packets, without the index column.
df2.to_csv('UNSW_1000_ANM.csv', index=False)

# Sample Records

In [3]:
# Reduce every decoded chunk file to TCP/UDP packets that carry a payload, drop
# the columns that end up completely empty, and save the result under
# ./UNSW/UNSW-1/ with the same file name.
output_dir = './UNSW/UNSW-1'
# to_csv does not create missing directories.
os.makedirs(output_dir, exist_ok=True)

for i in range(0,20):
    if not os.path.isfile(f'./UNSW/output{i}.csv'):
        continue
    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>')
    df = pd.read_csv(f'./UNSW/output{i}.csv')
    df = df[df['protocol_m'].isin(['tcp', 'udp'])]
    df = df.dropna(subset=['payload'])
    # Fields of protocols that were just filtered out are now all-NaN columns.
    df = df.dropna(axis=1, how='all')
    write_log(f'------------ CSV File {i} Processed. Final shape is {df.shape} ------------')
    df.to_csv(f'{output_dir}/output{i}.csv', index=False)
    write_log(f'------------ CSV File {i} Saved ------------')

KeyboardInterrupt: 

In [46]:
# Load one decoded chunk to try the filtering and sampling steps interactively.
# NOTE: this rebinds `df`, which earlier cells used for the full UNSW.csv.
df = pd.read_csv('UNSW/output1.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [47]:
# Preview the decoded chunk: original columns followed by per-layer fields.
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,HSRP reserved,HSRP auth,HSRP MD5 Authentication type,HSRP MD5 Authentication len,HSRP MD5 Authentication algo,HSRP MD5 Authentication padding,HSRP MD5 Authentication flags,HSRP MD5 Authentication sourceip,HSRP MD5 Authentication keyid,SCTPChunkInit addr
0,1421927377,10.40.182.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a524c20000080045c00040ef1500...,...,,,,,,,,,,
1,1421927377,10.40.85.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a577630000080045c00040ef1400...,...,,,,,,,,,,
2,1421927387,10.40.85.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a577630000080045c00040ef1600...,...,,,,,,,,,,
3,1421927387,10.40.182.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a524c20000080045c00040ef1700...,...,,,,,,,,,,
4,1421927397,10.40.182.1,0,224.0.0.5,0,ospf,1,64,cooked linux,000400010006005056a524c20000080045c00040ef1900...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,1421932056,59.166.0.3,6103,149.171.126.4,52633,tcp,31,52,cooked linux,000400010006005056a524c2000008004500003408ca40...,...,,,,,,,,,,
9999996,1421932056,59.166.0.3,6103,149.171.126.4,52633,tcp,32,52,cooked linux,000000010006021ac5000000000008004500003408ca40...,...,,,,,,,,,,
9999997,1421932056,59.166.0.3,6103,149.171.126.4,52633,tcp,31,52,cooked linux,000400010006005056a524c2000008004500003408cb40...,...,,,,,,,,,,
9999998,1421932056,59.166.0.8,4793,149.171.126.9,22,tcp,32,668,cooked linux,000000010006021ac5000000000008004500029cc1f040...,...,,,,,,,,,,


In [None]:
# Packets per protocol in this chunk; the output below shows tcp and udp making
# up almost all rows.
df['protocol_m'].value_counts()

tcp            9829540
udp             164413
ospf              2970
arp               1525
others             990
sctp                64
pim                 42
any                 30
icmp                20
sep                 18
ipv6                16
sun-nd              14
swipe               14
mobile              14
encap               10
crtp                10
etherip             10
gmtp                10
pnni                10
snp                 10
iplt                10
fire                10
crudp               10
sccopmce            10
pipe                10
micp                10
sps                 10
fc                  10
ib                  10
aes-sp3-d           10
rvd                 10
ipip                10
ax.25               10
larp                10
dgp                 10
vmtp                10
secure-vmtp         10
gre                 10
rsvp                10
rdp                 10
hmp                 10
emcon               10
nvp                 10
pup        

In [48]:
# Keep only TCP and UDP packets.  The original row labels are kept, so the index
# is no longer contiguous after this step.
df = df[(df['protocol_m'] == 'tcp') | (df['protocol_m'] == 'udp')]

In [49]:
# Preview the TCP/UDP-only rows.
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,HSRP reserved,HSRP auth,HSRP MD5 Authentication type,HSRP MD5 Authentication len,HSRP MD5 Authentication algo,HSRP MD5 Authentication padding,HSRP MD5 Authentication flags,HSRP MD5 Authentication sourceip,HSRP MD5 Authentication keyid,SCTPChunkInit addr
8,1421927414,149.171.126.0,53,59.166.0.3,49664,udp,30,89,cooked linux,000000010006001b17059e1c0000080045000059f17a40...,...,,,,,,,,,,
9,1421927414,149.171.126.5,53,59.166.0.5,3593,udp,29,82,cooked linux,000400010006005056a577630000080045000052456e40...,...,,,,,,,,,,
10,1421927414,149.171.126.5,53,59.166.0.5,3593,udp,30,82,cooked linux,000000010006001b17059e1c0000080045000052456e40...,...,,,,,,,,,,
11,1421927414,149.171.126.16,80,175.45.176.0,13284,tcp,252,48,cooked linux,000400010006005056a577630000080045000030d80700...,...,,,,,,,,,,
12,1421927414,149.171.126.16,80,175.45.176.0,13284,tcp,253,48,cooked linux,000000010006001b17059e1c0000080045000030d80700...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999995,1421932056,59.166.0.3,6103,149.171.126.4,52633,tcp,31,52,cooked linux,000400010006005056a524c2000008004500003408ca40...,...,,,,,,,,,,
9999996,1421932056,59.166.0.3,6103,149.171.126.4,52633,tcp,32,52,cooked linux,000000010006021ac5000000000008004500003408ca40...,...,,,,,,,,,,
9999997,1421932056,59.166.0.3,6103,149.171.126.4,52633,tcp,31,52,cooked linux,000400010006005056a524c2000008004500003408cb40...,...,,,,,,,,,,
9999998,1421932056,59.166.0.8,4793,149.171.126.9,22,tcp,32,668,cooked linux,000000010006021ac5000000000008004500029cc1f040...,...,,,,,,,,,,


In [50]:
# Drop packets whose payload is missing.
df = df.dropna(subset=['payload'])

In [51]:
# Preview the rows that still have a payload.
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,HSRP reserved,HSRP auth,HSRP MD5 Authentication type,HSRP MD5 Authentication len,HSRP MD5 Authentication algo,HSRP MD5 Authentication padding,HSRP MD5 Authentication flags,HSRP MD5 Authentication sourceip,HSRP MD5 Authentication keyid,SCTPChunkInit addr
8,1421927414,149.171.126.0,53,59.166.0.3,49664,udp,30,89,cooked linux,000000010006001b17059e1c0000080045000059f17a40...,...,,,,,,,,,,
9,1421927414,149.171.126.5,53,59.166.0.5,3593,udp,29,82,cooked linux,000400010006005056a577630000080045000052456e40...,...,,,,,,,,,,
10,1421927414,149.171.126.5,53,59.166.0.5,3593,udp,30,82,cooked linux,000000010006001b17059e1c0000080045000052456e40...,...,,,,,,,,,,
15,1421927414,149.171.126.0,53,59.166.0.3,49664,udp,29,89,cooked linux,000400010006005056a577630000080045000059f17a40...,...,,,,,,,,,,
24,1421927414,149.171.126.4,53,59.166.0.6,2142,udp,29,82,cooked linux,000400010006005056a577630000080045000052f17c40...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999974,1421932056,59.166.0.8,51456,149.171.126.3,6881,tcp,32,69,cooked linux,000000010006021ac50000000000080045000045c6a740...,...,,,,,,,,,,
9999986,1421932056,59.166.0.8,4793,149.171.126.9,22,tcp,31,668,cooked linux,000400010006005056a524c2000008004500029cc1f040...,...,,,,,,,,,,
9999987,1421932056,59.166.0.8,4793,149.171.126.9,22,tcp,31,80,cooked linux,000400010006005056a524c20000080045000050c1f140...,...,,,,,,,,,,
9999988,1421932056,59.166.0.8,4793,149.171.126.9,22,tcp,32,80,cooked linux,000000010006021ac50000000000080045000050c1f140...,...,,,,,,,,,,


In [58]:
# Keep only the columns with at most 1000 missing values, which removes the
# fields of rarely seen layers.
# NOTE(review): the frames displayed further down still list the sparse
# HSRP/SCTP columns, so this filter appears not to have been part of the final
# run -- confirm whether it is meant to stay.
df = df.loc[:, df.isna().sum() <= 1000]

In [59]:
# Preview the frame after dropping sparse columns.
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,IP ihl,IP tos,IP len,IP id,IP frag,IP ttl,IP proto,IP chksum,IP src,IP dst
8,1421927414,149.171.126.0,53,59.166.0.3,49664,udp,30,89,cooked linux,000000010006001b17059e1c0000080045000059f17a40...,...,5.0,0x0,89.0,61818.0,0.0,30.0,udp,0x1bc5,149.171.126.0,59.166.0.3
9,1421927414,149.171.126.5,53,59.166.0.5,3593,udp,29,82,cooked linux,000400010006005056a577630000080045000052456e40...,...,5.0,0x0,82.0,17774.0,0.0,29.0,udp,0xc8d1,149.171.126.5,59.166.0.5
10,1421927414,149.171.126.5,53,59.166.0.5,3593,udp,30,82,cooked linux,000000010006001b17059e1c0000080045000052456e40...,...,5.0,0x0,82.0,17774.0,0.0,30.0,udp,0xc7d1,149.171.126.5,59.166.0.5
15,1421927414,149.171.126.0,53,59.166.0.3,49664,udp,29,89,cooked linux,000400010006005056a577630000080045000059f17a40...,...,5.0,0x0,89.0,61818.0,0.0,29.0,udp,0x1cc5,149.171.126.0,59.166.0.3
24,1421927414,149.171.126.4,53,59.166.0.6,2142,udp,29,82,cooked linux,000400010006005056a577630000080045000052f17c40...,...,5.0,0x0,82.0,61820.0,0.0,29.0,udp,0x1cc3,149.171.126.4,59.166.0.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999974,1421932056,59.166.0.8,51456,149.171.126.3,6881,tcp,32,69,cooked linux,000000010006021ac50000000000080045000045c6a740...,...,5.0,0x0,69.0,50855.0,0.0,32.0,tcp,0x44af,59.166.0.8,149.171.126.3
9999986,1421932056,59.166.0.8,4793,149.171.126.9,22,tcp,31,668,cooked linux,000400010006005056a524c2000008004500029cc1f040...,...,5.0,0x0,668.0,49648.0,0.0,31.0,tcp,0x4809,59.166.0.8,149.171.126.9
9999987,1421932056,59.166.0.8,4793,149.171.126.9,22,tcp,31,80,cooked linux,000400010006005056a524c20000080045000050c1f140...,...,5.0,0x0,80.0,49649.0,0.0,31.0,tcp,0x4a54,59.166.0.8,149.171.126.9
9999988,1421932056,59.166.0.8,4793,149.171.126.9,22,tcp,32,80,cooked linux,000000010006021ac50000000000080045000050c1f140...,...,5.0,0x0,80.0,49649.0,0.0,32.0,tcp,0x4954,59.166.0.8,149.171.126.9


In [60]:
# List the remaining columns: the 16 original ones plus the 'cooked linux' and
# 'IP' fields.
df.columns

Index(['stime', 'srcip', 'sport', 'dstip', 'dsport', 'protocol_m', 'sttl',
       'total_len', 'first_layer', 'packet', 'payload', 't_delta',
       'stime_flow', 'attack_cat', 'label', 'ltime_max',
       'cooked linux pkttype', 'cooked linux lladdrlen', 'cooked linux src',
       'cooked linux proto', 'IP version', 'IP ihl', 'IP tos', 'IP len',
       'IP id', 'IP frag', 'IP ttl', 'IP proto', 'IP chksum', 'IP src',
       'IP dst'],
      dtype='object')

In [None]:
# Length of each payload string in characters.  `df` is a filtered slice of a
# larger frame, so assigning a column into it in place triggers pandas'
# SettingWithCopyWarning; assign() returns a new frame with the column added.
df = df.assign(payload_len=df['payload'].apply(len))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['payload_len'] = df.payload.apply(len)


In [None]:
# Keep packets whose payload string is longer than 100 characters.
# NOTE(review): payload looks like hex text (two characters per byte), in which
# case this threshold corresponds to 50 bytes -- confirm that is intended.
df = df[df['payload_len'] > 100]

In [None]:
# Preview the rows that passed the payload-length filter.
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,HSRP auth,HSRP MD5 Authentication type,HSRP MD5 Authentication len,HSRP MD5 Authentication algo,HSRP MD5 Authentication padding,HSRP MD5 Authentication flags,HSRP MD5 Authentication sourceip,HSRP MD5 Authentication keyid,SCTPChunkInit addr,payload_len
8,1421927414,149.171.126.0,53,59.166.0.3,49664,udp,30,89,cooked linux,000000010006001b17059e1c0000080045000059f17a40...,...,,,,,,,,,,122
9,1421927414,149.171.126.5,53,59.166.0.5,3593,udp,29,82,cooked linux,000400010006005056a577630000080045000052456e40...,...,,,,,,,,,,108
10,1421927414,149.171.126.5,53,59.166.0.5,3593,udp,30,82,cooked linux,000000010006001b17059e1c0000080045000052456e40...,...,,,,,,,,,,108
15,1421927414,149.171.126.0,53,59.166.0.3,49664,udp,29,89,cooked linux,000400010006005056a577630000080045000059f17a40...,...,,,,,,,,,,122
24,1421927414,149.171.126.4,53,59.166.0.6,2142,udp,29,82,cooked linux,000400010006005056a577630000080045000052f17c40...,...,,,,,,,,,,108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9999895,1421932056,59.166.0.8,4006,149.171.126.3,80,tcp,32,285,cooked linux,000000010006021ac5000000000008004500011dc74a40...,...,,,,,,,,,,466
9999922,1421932056,175.45.176.1,37328,149.171.126.15,80,tcp,254,293,cooked linux,000400010006005056a524c20000080045000125761700...,...,,,,,,,,,,506
9999923,1421932056,175.45.176.1,37328,149.171.126.15,80,tcp,255,293,cooked linux,000000010006021ac50000000000080045000125761700...,...,,,,,,,,,,506
9999986,1421932056,59.166.0.8,4793,149.171.126.9,22,tcp,31,668,cooked linux,000400010006005056a524c2000008004500029cc1f040...,...,,,,,,,,,,1232


In [21]:
# Packets per attack category after filtering.
df['attack_cat'].value_counts()

normal            4137696
exploits           141748
dos                 48767
fuzzers             20695
generic             18036
reconnaissance       2264
worms                1248
shellcode             246
backdoor              124
Name: attack_cat, dtype: int64

In [None]:
# Map each attack category present in the filtered frame to its row count.
count_dict = df['attack_cat'].value_counts().to_dict()

In [None]:
# Target size of the balanced sample, split evenly across the categories that
# are present (integer division, so any remainder is not allocated here).
total_sample_size = 5000
sample_size_per_category = total_sample_size // len(count_dict)

In [36]:
# Build a roughly balanced sample of total_sample_size rows: take up to
# sample_size_per_category rows from every category, then fill whatever is still
# missing (small categories, rounding) with additional 'normal' rows.
# The loop iterates count_dict, the mapping the per-category quota was computed
# from; the previous name `proportions` is not defined anywhere in the notebook.
category_samples = []
for category in count_dict:
    category_rows = df[df['attack_cat'] == category]
    num_samples = min(len(category_rows), sample_size_per_category)
    category_samples.append(category_rows.sample(n=num_samples))

# Concatenate once instead of growing the frame inside the loop.
sample_df = pd.concat(category_samples) if category_samples else pd.DataFrame()

if len(sample_df) < total_sample_size:
    additional_samples = total_sample_size - len(sample_df)
    normal_rows = df[df['attack_cat'] == 'normal']
    # Exclude rows already drawn; this relies on sample_df still carrying the
    # original row labels, so it has to happen before the index is reset.
    normal_rows = normal_rows.loc[~normal_rows.index.isin(sample_df.index)]
    normal_samples = normal_rows.sample(n=min(len(normal_rows), additional_samples))
    sample_df = pd.concat([sample_df, normal_samples])

sample_df = sample_df.reset_index(drop=True)

In [37]:
# Preview the balanced sample (renumbered from 0).
sample_df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,HSRP auth,HSRP MD5 Authentication type,HSRP MD5 Authentication len,HSRP MD5 Authentication algo,HSRP MD5 Authentication padding,HSRP MD5 Authentication flags,HSRP MD5 Authentication sourceip,HSRP MD5 Authentication keyid,SCTPChunkInit addr,payload_len
0,1421927938,59.166.0.1,4785,149.171.126.3,29746,tcp,32,1500,cooked linux,000000010006021ac500000000000800450005dccb4e40...,...,,,,,,,,,,2896
1,1421930041,149.171.126.1,13417,59.166.0.7,34687,tcp,30,1352,cooked linux,000000010006001b17059e1c0000080045000548ab6f40...,...,,,,,,,,,,2600
2,1421929999,149.171.126.4,80,59.166.0.0,8502,tcp,29,1500,cooked linux,000400010006005056a5776300000800450005dccf5a40...,...,,,,,,,,,,2896
3,1421930859,149.171.126.1,14589,59.166.0.3,63488,tcp,30,1352,cooked linux,000000010006001b17059e1c0000080045000548bc8f40...,...,,,,,,,,,,2600
4,1421927594,149.171.126.1,80,59.166.0.2,61722,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dc2f0a40...,...,,,,,,,,,,2896
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1421927479,149.171.126.1,6881,59.166.0.2,17586,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dcc88140...,...,,,,,,,,,,2896
4996,1421931865,149.171.126.5,6881,59.166.0.6,43379,tcp,30,1500,cooked linux,000000010006001b17059e1c00000800450005dc014d40...,...,,,,,,,,,,2896
4997,1421930827,149.171.126.4,143,59.166.0.3,62912,tcp,30,204,cooked linux,000000010006001b17059e1c00000800450000cc0fb240...,...,,,,,,,,,,304
4998,1421932036,149.171.126.6,53,59.166.0.0,63717,udp,29,89,cooked linux,000400010006005056a577630000080045000059f4e740...,...,,,,,,,,,,122


In [38]:
# Check the sample composition: categories are capped at the per-category quota,
# smaller ones contribute all their rows, and 'normal' absorbs the top-up.
sample_df['attack_cat'].value_counts()

normal            1300
exploits           555
dos                555
fuzzers            555
generic            555
reconnaissance     555
worms              555
shellcode          246
backdoor           124
Name: attack_cat, dtype: int64

# Concatenate Files

In [4]:
# Read every part file that exists in the hard-coded index range and stack them
# into one frame. The frames are collected in a list and concatenated once:
# growing `df` with pd.concat inside the loop re-copies all previously loaded
# rows on every iteration.
frames = []
for i in range(13,20):
    csv_path = f'./UNSW/UNSW-1/output{i}.csv'
    if not os.path.isfile(csv_path):
        # Missing part files are skipped silently.
        continue
    write_log(f'<<<<<<<<----- Started Reading CSV File {i} ----->>>>>>>>')
    frames.append(pd.read_csv(csv_path))
    write_log(f'------------ CSV File {i} added to DataFrame ------------')

# pd.concat raises on an empty list; keep the empty frame the loop used to
# produce when no part file was found.
df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
del frames

  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')
  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')
  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')
  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')
  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')
  temp_df = pd.read_csv(f'./UNSW/UNSW-1/output{i}.csv')


In [21]:
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,DCE/RPC v5 ptype,DCE/RPC v5 pfc_flags,DCE/RPC v5 endian,DCE/RPC v5 encoding,DCE/RPC v5 float,DCE/RPC v5 frag_len,DCE/RPC v5 auth_len,DCE/RPC v5 call_id,DCE/RPC v5 - Bind reserved,DCE/RPC v5 - Bind if_uuid
0,1.421933e+09,149.171.126.2,143.0,59.166.0.6,65090.0,tcp,30.0,73.0,cooked linux,000000010006001b17059e1c0000080045000049a5ae40...,...,,,,,,,,,,
1,1.421928e+09,149.171.126.6,80.0,59.166.0.7,47518.0,tcp,30.0,1500.0,cooked linux,000000010006001b17059e1c00000800450005dc590f40...,...,,,,,,,,,,
2,1.421940e+09,149.171.126.4,5190.0,59.166.0.0,40335.0,tcp,29.0,76.0,cooked linux,000400010006005056a57763000008004500004ca61f40...,...,,,,,,,,,,
3,1.421933e+09,149.171.126.0,6881.0,59.166.0.4,55114.0,tcp,29.0,1500.0,cooked linux,000400010006005056a5776300000800450005dcb9a540...,...,,,,,,,,,,
4,1.421943e+09,149.171.126.8,6881.0,59.166.0.1,34569.0,tcp,29.0,521.0,cooked linux,000400010006005056a577630000080045000209344240...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287764,1.424262e+09,175.45.176.2,20503.0,149.171.126.10,33532.0,tcp,254.0,313.0,cooked linux,000400010006005056a524c20000080045000139a19e00...,...,,,,,,,,,,
287765,1.424262e+09,149.171.126.12,1014.0,175.45.176.0,60251.0,tcp,253.0,148.0,cooked linux,000000010006001b17059e1c0000080045000094043500...,...,,,,,,,,,,
287766,1.424262e+09,149.171.126.12,1014.0,175.45.176.0,60251.0,tcp,252.0,148.0,cooked linux,000400010006005056a577630000080045000094043500...,...,,,,,,,,,,
287767,1.424262e+09,175.45.176.0,60251.0,149.171.126.12,1014.0,tcp,255.0,54.0,cooked linux,000000010006021ac5000000000008004500003603ad00...,...,,,,,,,,,,


In [5]:
df['attack_cat'].value_counts() # Part 1

normal            32636134
exploits            287057
dos                  88175
fuzzers              47422
generic              39075
reconnaissance        8264
worms                 1733
shellcode              918
analysis               483
backdoor               449
Name: attack_cat, dtype: int64

In [8]:
df['attack_cat'].value_counts() # Part 2

normal            32208840
exploits            552233
dos                 164759
generic              75602
fuzzers              61236
reconnaissance       25306
worms                 3026
shellcode             1785
analysis              1100
backdoor               890
Name: attack_cat, dtype: int64

In [6]:
df['attack_cat'].value_counts() # Part 3

normal            31653081
exploits           1104825
dos                 317321
generic             162265
fuzzers             144444
reconnaissance       32193
worms                 6619
shellcode             3549
analysis              2288
backdoor              2028
Name: attack_cat, dtype: int64

In [7]:
# Persist the concatenated frame (row index omitted). The target file number is
# hard-coded; the path needs no f-string because it has no placeholders.
df.to_csv('./UNSW/UNSW-2/output3.csv', index=False)

In [8]:
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,Link Local Multicast Node Resolution - Query an,Link Local Multicast Node Resolution - Query ns,Link Local Multicast Node Resolution - Query rrname,Link Local Multicast Node Resolution - Query type,Link Local Multicast Node Resolution - Query rclass,Link Local Multicast Node Resolution - Query ttl,Link Local Multicast Node Resolution - Query rdlen,Link Local Multicast Node Resolution - Query rdata,Link Local Multicast Node Resolution - Query load,Link Local Multicast Node Resolution - Query ar
0,1.421927e+09,149.171.126.0,53.0,59.166.0.3,49664.0,udp,30.0,89.0,cooked linux,000000010006001b17059e1c0000080045000059f17a40...,...,,,,,,,,,,
1,1.421927e+09,149.171.126.5,53.0,59.166.0.5,3593.0,udp,29.0,82.0,cooked linux,000400010006005056a577630000080045000052456e40...,...,,,,,,,,,,
2,1.421927e+09,149.171.126.5,53.0,59.166.0.5,3593.0,udp,30.0,82.0,cooked linux,000000010006001b17059e1c0000080045000052456e40...,...,,,,,,,,,,
3,1.421927e+09,149.171.126.0,53.0,59.166.0.3,49664.0,udp,29.0,89.0,cooked linux,000400010006005056a577630000080045000059f17a40...,...,,,,,,,,,,
4,1.421927e+09,149.171.126.4,53.0,59.166.0.6,2142.0,udp,29.0,82.0,cooked linux,000400010006005056a577630000080045000052f17c40...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49465261,1.421972e+09,149.171.126.1,21.0,59.166.0.5,46392.0,tcp,29.0,58.0,cooked linux,000400010006005056a57763000008004500003a76d440...,...,,,,,,,,,,
49465262,1.421972e+09,149.171.126.1,21.0,59.166.0.5,46392.0,tcp,30.0,58.0,cooked linux,000000010006001b17059e1c000008004500003a76d440...,...,,,,,,,,,,
49465263,1.421972e+09,149.171.126.0,22.0,59.166.0.5,47614.0,tcp,29.0,100.0,cooked linux,000400010006005056a577630000080045000064f7e140...,...,,,,,,,,,,
49465264,1.421972e+09,149.171.126.8,22.0,59.166.0.7,59131.0,tcp,30.0,100.0,cooked linux,000000010006001b17059e1c0000080045000064e83a40...,...,,,,,,,,,,


# Sampling DataFrame

In [2]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns (and the warning
# this cell emitted); it matches the read_csv calls in the CSV-to-Parquet cell.
df = pd.read_csv('./UNSW/UNSW-2/output1.csv', low_memory=False)

  df = pd.read_csv('./UNSW/UNSW-2/output1.csv')


In [3]:
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,802.15.4 Beacon src_panid,802.15.4 Beacon src_addr,L2TP hdr,L2TP offset,L2TP len,L2TP ns,L2TP nr,MGCP verb,MGCP endpoint,MGCP version
0,1.421927e+09,149.171.126.0,53.0,59.166.0.3,49664.0,udp,30.0,89.0,cooked linux,000000010006001b17059e1c0000080045000059f17a40...,...,,,,,,,,,,
1,1.421927e+09,149.171.126.5,53.0,59.166.0.5,3593.0,udp,29.0,82.0,cooked linux,000400010006005056a577630000080045000052456e40...,...,,,,,,,,,,
2,1.421927e+09,149.171.126.5,53.0,59.166.0.5,3593.0,udp,30.0,82.0,cooked linux,000000010006001b17059e1c0000080045000052456e40...,...,,,,,,,,,,
3,1.421927e+09,149.171.126.0,53.0,59.166.0.3,49664.0,udp,29.0,89.0,cooked linux,000400010006005056a577630000080045000059f17a40...,...,,,,,,,,,,
4,1.421927e+09,149.171.126.4,53.0,59.166.0.6,2142.0,udp,29.0,82.0,cooked linux,000400010006005056a577630000080045000052f17c40...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33109705,1.421957e+09,149.171.126.6,80.0,59.166.0.9,35057.0,tcp,30.0,1500.0,cooked linux,000000010006001b17059e1c00000800450005dc5cfe40...,...,,,,,,,,,,
33109706,1.421957e+09,149.171.126.3,6881.0,59.166.0.7,21827.0,tcp,29.0,1500.0,cooked linux,000400010006005056a5776300000800450005dcf55240...,...,,,,,,,,,,
33109707,1.421957e+09,149.171.126.3,6881.0,59.166.0.7,21827.0,tcp,30.0,1500.0,cooked linux,000000010006001b17059e1c00000800450005dcf55240...,...,,,,,,,,,,
33109708,1.421957e+09,149.171.126.3,6881.0,59.166.0.7,21827.0,tcp,29.0,1500.0,cooked linux,000400010006005056a5776300000800450005dcf55040...,...,,,,,,,,,,


In [4]:
# Restrict the frame to TCP packets.
is_tcp = df['protocol_m'] == 'tcp'
df = df[is_tcp]

# Columns that hold a value in fewer than 1000 rows are treated as rare fields.
non_null_per_column = df.notna().sum()
columns_to_remove = df.columns[non_null_per_column < 1000]

# Rows that carry a value in any rare column are discarded together with
# those columns.
uses_rare_field = df[columns_to_remove].notna().any(axis=1)
rows_to_remove = df[uses_rare_field].index
df = df.drop(columns=columns_to_remove).drop(index=rows_to_remove)

# Finally keep only packets that have a 'Raw load' value.
df = df.dropna(subset=['Raw load'])

In [5]:
df['attack_cat'].value_counts()

normal            31677346
exploits            253237
dos                  78131
generic              30859
fuzzers              25975
reconnaissance        2039
worms                 1530
backdoor               252
analysis               234
shellcode              220
Name: attack_cat, dtype: int64

In [6]:
# Maximum number of rows to keep per attack category. The sampling cell keeps a
# whole category when its cap is not below the number of rows available, so a
# cap of 50000 on the small classes effectively means "keep everything".
# Insertion order is the order in which categories are appended to the sample.
dic = dict(
    normal=18000,
    exploits=10000,
    dos=10000,
    generic=5000,
    fuzzers=5000,
    reconnaissance=50000,
    worms=50000,
    shellcode=50000,
    analysis=50000,
    backdoor=50000,
)

In [7]:
# Build `fdf` by down-sampling each attack category to its cap from `dic`;
# categories that do not exceed their cap are taken whole.
# NOTE(review): the sampling is unseeded, so every run draws different rows;
# pass random_state=... to .sample() if the output must be reproducible.
val = df['attack_cat'].value_counts().to_dict()
samples = []
for key, cap in dic.items():
    category_rows = df[df['attack_cat']==key]
    # .get(): a category listed in `dic` but absent from this frame used to
    # raise KeyError; it now simply contributes no rows.
    if cap < val.get(key, 0):
        sdf = category_rows.sample(cap, ignore_index=True)
    else:
        sdf = category_rows
    samples.append(sdf)

# Concatenate once instead of regrowing `fdf` on every iteration.
fdf = pd.concat(samples, ignore_index=True) if samples else pd.DataFrame()
del samples

In [11]:
# Select every row labelled 'analysis' (no down-sampling) into `sdf`.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours and it overwrites the `sdf` left by the sampling loop; it looks
# like a leftover scratch step — confirm it is still needed.
sdf = df[df['attack_cat']=='analysis']

In [27]:
# Draw 1500 random 'normal' rows (unseeded) into `sdf`, renumbering the index from 0.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; it looks like a manual top-up step — confirm it belongs in a
# top-to-bottom run.
sdf = df[df['attack_cat']=='normal'].sample(1500, ignore_index=True)

In [28]:
# Append the current `sdf` to `fdf`, renumbering the index.
# NOTE(review): on a top-to-bottom run this adds the rows held in `sdf` on top
# of the categories already sampled into `fdf`; the value_counts output shown
# further down reports 18000 'normal' rows, i.e. the state without this append —
# presumably a leftover step, confirm before re-running.
fdf = pd.concat([fdf, sdf], ignore_index=True)

In [8]:
fdf

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,TCP urgptr,TCP options,Padding load,SMB Negotiate Extended Security Response (SMB) load,Skinny len,Skinny res,Skinny msg,PPTP len,PPTP type,PPTP data
0,1.421933e+09,149.171.126.2,143.0,59.166.0.6,65090.0,tcp,30.0,73.0,cooked linux,000000010006001b17059e1c0000080045000049a5ae40...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
1,1.421928e+09,149.171.126.6,80.0,59.166.0.7,47518.0,tcp,30.0,1500.0,cooked linux,000000010006001b17059e1c00000800450005dc590f40...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
2,1.421940e+09,149.171.126.4,5190.0,59.166.0.0,40335.0,tcp,29.0,76.0,cooked linux,000400010006005056a57763000008004500004ca61f40...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
3,1.421933e+09,149.171.126.0,6881.0,59.166.0.4,55114.0,tcp,29.0,1500.0,cooked linux,000400010006005056a5776300000800450005dcb9a540...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
4,1.421943e+09,149.171.126.8,6881.0,59.166.0.1,34569.0,tcp,29.0,521.0,cooked linux,000400010006005056a577630000080045000209344240...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52270,1.421934e+09,175.45.176.0,2137.0,149.171.126.15,80.0,tcp,255.0,241.0,cooked linux,000000010006021ac500000000000800450000f167d300...,...,0.0,[],,,,,,,,
52271,1.421934e+09,175.45.176.1,37794.0,149.171.126.10,9999.0,tcp,254.0,52.0,cooked linux,000400010006005056a524c20000080045000034b99d00...,...,0.0,[],,,,,,,,
52272,1.421934e+09,175.45.176.1,37794.0,149.171.126.10,9999.0,tcp,255.0,52.0,cooked linux,000000010006021ac50000000000080045000034b99d00...,...,0.0,[],,,,,,,,
52273,1.421934e+09,149.171.126.10,9999.0,175.45.176.1,37794.0,tcp,252.0,1481.0,cooked linux,000400010006005056a5776300000800450005c91ae800...,...,0.0,[],,,,,,,,


In [9]:
fdf['attack_cat'].value_counts()

normal            18000
exploits          10000
dos               10000
generic            5000
fuzzers            5000
reconnaissance     2039
worms              1530
backdoor            252
analysis            234
shellcode           220
Name: attack_cat, dtype: int64

In [10]:
# Persist the sampled frame (row index omitted). The target file number is
# hard-coded; the path needs no f-string because it has no placeholders.
fdf.to_csv('./UNSW/UNSW-3/output1.csv', index=False)

# Combining CSV Files

In [11]:
# low_memory=False infers each column's dtype from the whole file rather than
# per chunk, avoiding mixed-type columns and the warning this cell emitted.
df1 = pd.read_csv('./UNSW/UNSW-3/output1.csv', low_memory=False)

  df1 = pd.read_csv('./UNSW/UNSW-3/output1.csv')


In [12]:
df1

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,TCP urgptr,TCP options,Padding load,SMB Negotiate Extended Security Response (SMB) load,Skinny len,Skinny res,Skinny msg,PPTP len,PPTP type,PPTP data
0,1.421933e+09,149.171.126.2,143.0,59.166.0.6,65090.0,tcp,30.0,73.0,cooked linux,000000010006001b17059e1c0000080045000049a5ae40...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
1,1.421928e+09,149.171.126.6,80.0,59.166.0.7,47518.0,tcp,30.0,1500.0,cooked linux,000000010006001b17059e1c00000800450005dc590f40...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
2,1.421940e+09,149.171.126.4,5190.0,59.166.0.0,40335.0,tcp,29.0,76.0,cooked linux,000400010006005056a57763000008004500004ca61f40...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
3,1.421933e+09,149.171.126.0,6881.0,59.166.0.4,55114.0,tcp,29.0,1500.0,cooked linux,000400010006005056a5776300000800450005dcb9a540...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
4,1.421943e+09,149.171.126.8,6881.0,59.166.0.1,34569.0,tcp,29.0,521.0,cooked linux,000400010006005056a577630000080045000209344240...,...,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52270,1.421934e+09,175.45.176.0,2137.0,149.171.126.15,80.0,tcp,255.0,241.0,cooked linux,000000010006021ac500000000000800450000f167d300...,...,0.0,[],,,,,,,,
52271,1.421934e+09,175.45.176.1,37794.0,149.171.126.10,9999.0,tcp,254.0,52.0,cooked linux,000400010006005056a524c20000080045000034b99d00...,...,0.0,[],,,,,,,,
52272,1.421934e+09,175.45.176.1,37794.0,149.171.126.10,9999.0,tcp,255.0,52.0,cooked linux,000000010006021ac50000000000080045000034b99d00...,...,0.0,[],,,,,,,,
52273,1.421934e+09,149.171.126.10,9999.0,175.45.176.1,37794.0,tcp,252.0,1481.0,cooked linux,000400010006005056a5776300000800450005c91ae800...,...,0.0,[],,,,,,,,


In [13]:
# low_memory=False infers each column's dtype from the whole file rather than
# per chunk, avoiding mixed-type columns and the warning this cell emitted.
df2 = pd.read_csv('./UNSW/UNSW-3/output2.csv', low_memory=False)

  df2 = pd.read_csv('./UNSW/UNSW-3/output2.csv')


In [14]:
df2

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,TCP window,TCP chksum,TCP urgptr,TCP options,Raw load,PPTP len,PPTP type,PPTP data,Padding load,SMB Negotiate Extended Security Response (SMB) load
0,1.421966e+09,149.171.126.6,37893.0,59.166.0.9,12183.0,tcp,29.0,1352.0,cooked linux,000400010006005056a577630000080045000548ac8a40...,...,14480.0,0xa276,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",'\x0e%\\xaa\x12\x08\\xc6\'\\xe0\\xcb^\\x9e\\xd...,,,,,
1,1.424228e+09,59.166.0.0,34710.0,149.171.126.7,40734.0,tcp,31.0,74.0,cooked linux,000400010006005056a524c2000008004500004a381340...,...,8688.0,0xa3a6,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",'\\xe3\x11\x00\x00\x00O[\x7f\x1ả\\xb1\\\\x92\...,,,,,
2,1.421968e+09,149.171.126.4,80.0,59.166.0.1,10602.0,tcp,30.0,1500.0,cooked linux,000000010006001b17059e1c00000800450005dcd96340...,...,7240.0,0xb790,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",'S\\xdaI]\x18\\xc4\\xd6>s\\xf5\\xc5\\xe25\\x8d...,,,,,
3,1.421972e+09,59.166.0.4,36158.0,149.171.126.0,143.0,tcp,31.0,68.0,cooked linux,000400010006005056a524c20000080045000044415a40...,...,18824.0,0x6fdf,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...","'7 lsub """" ""~*""\r\n'",,,,,
4,1.421968e+09,149.171.126.5,80.0,59.166.0.2,39224.0,tcp,30.0,1500.0,cooked linux,000000010006001b17059e1c00000800450005dc589640...,...,7240.0,0x5143,0.0,"[('NOP', None), ('NOP', None), ('Timestamp', (...",'ɑ\x10\\xbd(#\\\\xe8\\xb5K\\x97\\x93\\xb1\x05\...,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82673,1.424234e+09,149.171.126.11,9999.0,175.45.176.0,65359.0,tcp,253.0,1481.0,cooked linux,000000010006001b17059e1c00000800450005c9a0f400...,...,16383.0,0xc052,0.0,[],'ForCed EnTrY 1.49.2\r\n\r\n\r\nConnection Sta...,,,,,
82674,1.424234e+09,175.45.176.0,65359.0,149.171.126.11,9999.0,tcp,255.0,52.0,cooked linux,000000010006021ac5000000000008004500003474d000...,...,16383.0,0x76c,0.0,[],'GETOSVERSION',,,,,
82675,1.424234e+09,175.45.176.0,65359.0,149.171.126.11,9999.0,tcp,254.0,52.0,cooked linux,000400010006005056a524c2000008004500003474d000...,...,16383.0,0x76c,0.0,[],'GETOSVERSION',,,,,
82676,1.424234e+09,149.171.126.12,33471.0,175.45.176.3,35817.0,tcp,253.0,128.0,cooked linux,000000010006001b17059e1c0000080045000080f85600...,...,16383.0,0x4c8d,0.0,[],'Microsoft Windows XP [Version 5.1.2600]\r\n(C...,,,,,


In [15]:
# low_memory=False infers each column's dtype from the whole file rather than
# per chunk, avoiding mixed-type columns and the warning this cell emitted.
df3 = pd.read_csv('./UNSW/UNSW-3/output3.csv', low_memory=False)

  df3 = pd.read_csv('./UNSW/UNSW-3/output3.csv')


In [16]:
df3

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,DCE/RPC v5 ptype,DCE/RPC v5 pfc_flags,DCE/RPC v5 endian,DCE/RPC v5 encoding,DCE/RPC v5 float,DCE/RPC v5 frag_len,DCE/RPC v5 auth_len,DCE/RPC v5 call_id,DCE/RPC v5 - Bind reserved,DCE/RPC v5 - Bind if_uuid
0,1.424242e+09,149.171.126.1,6881.0,59.166.0.7,6394.0,tcp,30.0,1500.0,cooked linux,000000010006001b17059e1c00000800450005dcb06f40...,...,,,,,,,,,,
1,1.424257e+09,59.166.0.0,53858.0,149.171.126.5,143.0,tcp,32.0,71.0,cooked linux,000000010006021ac50000000000080045000047cca140...,...,,,,,,,,,,
2,1.424250e+09,149.171.126.9,6881.0,59.166.0.0,56762.0,tcp,29.0,1500.0,cooked linux,000400010006005056a5776300000800450005dcc63540...,...,,,,,,,,,,
3,1.424254e+09,149.171.126.0,6881.0,59.166.0.2,52183.0,tcp,29.0,1500.0,cooked linux,000400010006005056a5776300000800450005dcb7a640...,...,,,,,,,,,,
4,1.424248e+09,149.171.126.8,35358.0,59.166.0.1,7838.0,tcp,29.0,405.0,cooked linux,000400010006005056a5776300000800450001953c2440...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152811,1.424262e+09,175.45.176.2,20503.0,149.171.126.10,33532.0,tcp,254.0,313.0,cooked linux,000400010006005056a524c20000080045000139a19e00...,...,,,,,,,,,,
152812,1.424262e+09,149.171.126.12,1014.0,175.45.176.0,60251.0,tcp,253.0,148.0,cooked linux,000000010006001b17059e1c0000080045000094043500...,...,,,,,,,,,,
152813,1.424262e+09,149.171.126.12,1014.0,175.45.176.0,60251.0,tcp,252.0,148.0,cooked linux,000400010006005056a577630000080045000094043500...,...,,,,,,,,,,
152814,1.424262e+09,175.45.176.0,60251.0,149.171.126.12,1014.0,tcp,255.0,54.0,cooked linux,000000010006021ac5000000000008004500003603ad00...,...,,,,,,,,,,


In [17]:
# Stack the three sampled parts into one frame with a fresh 0..n-1 index.
# The parts do not share an identical column set (their displayed headers
# differ); pd.concat's default outer join keeps the union of columns and leaves
# NaN where a part lacks a column.
df = pd.concat([df1, df2, df3], ignore_index=True)

In [18]:
df

Unnamed: 0,stime,srcip,sport,dstip,dsport,protocol_m,sttl,total_len,first_layer,packet,...,DCE/RPC v5 ptype,DCE/RPC v5 pfc_flags,DCE/RPC v5 endian,DCE/RPC v5 encoding,DCE/RPC v5 float,DCE/RPC v5 frag_len,DCE/RPC v5 auth_len,DCE/RPC v5 call_id,DCE/RPC v5 - Bind reserved,DCE/RPC v5 - Bind if_uuid
0,1.421933e+09,149.171.126.2,143.0,59.166.0.6,65090.0,tcp,30.0,73.0,cooked linux,000000010006001b17059e1c0000080045000049a5ae40...,...,,,,,,,,,,
1,1.421928e+09,149.171.126.6,80.0,59.166.0.7,47518.0,tcp,30.0,1500.0,cooked linux,000000010006001b17059e1c00000800450005dc590f40...,...,,,,,,,,,,
2,1.421940e+09,149.171.126.4,5190.0,59.166.0.0,40335.0,tcp,29.0,76.0,cooked linux,000400010006005056a57763000008004500004ca61f40...,...,,,,,,,,,,
3,1.421933e+09,149.171.126.0,6881.0,59.166.0.4,55114.0,tcp,29.0,1500.0,cooked linux,000400010006005056a5776300000800450005dcb9a540...,...,,,,,,,,,,
4,1.421943e+09,149.171.126.8,6881.0,59.166.0.1,34569.0,tcp,29.0,521.0,cooked linux,000400010006005056a577630000080045000209344240...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
287764,1.424262e+09,175.45.176.2,20503.0,149.171.126.10,33532.0,tcp,254.0,313.0,cooked linux,000400010006005056a524c20000080045000139a19e00...,...,,,,,,,,,,
287765,1.424262e+09,149.171.126.12,1014.0,175.45.176.0,60251.0,tcp,253.0,148.0,cooked linux,000000010006001b17059e1c0000080045000094043500...,...,,,,,,,,,,
287766,1.424262e+09,149.171.126.12,1014.0,175.45.176.0,60251.0,tcp,252.0,148.0,cooked linux,000400010006005056a577630000080045000094043500...,...,,,,,,,,,,
287767,1.424262e+09,175.45.176.0,60251.0,149.171.126.12,1014.0,tcp,255.0,54.0,cooked linux,000000010006021ac5000000000008004500003603ad00...,...,,,,,,,,,,


In [20]:
df.to_csv('./UNSW/UNSW-4/output.csv', index=False)

In [2]:
# low_memory=False infers each column's dtype from the whole file rather than
# per chunk, avoiding mixed-type columns and the warning this cell emitted.
df = pd.read_csv('./UNSW/UNSW-4/output.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [19]:
print(df['attack_cat'].value_counts())

normal            50000
exploits          50000
dos               50000
generic           50000
fuzzers           50000
reconnaissance    22571
worms              9970
analysis           1896
backdoor           1834
shellcode          1498
Name: attack_cat, dtype: int64


# EDA: Merge Raw CSV Files into Combined Chunks

In [None]:
def _write_eda_chunk(frames, chunk_no):
    """Concatenate `frames`, save them as EDA chunk `chunk_no` and print its label counts."""
    chunk = pd.concat(frames, ignore_index=True)
    chunk.to_csv(f'./UNSW/UNSW-EDA/output{chunk_no}.csv', index=False)
    print(chunk['attack_cat'].value_counts())


# Merge the raw per-file CSVs into larger chunks: files are accumulated and a
# chunk is written each time an existing file with an index divisible by six
# has been read.
k = 1
df = pd.DataFrame()
frames = []  # collected and concatenated once per chunk instead of on every read
for i in range(0,20):
    csv_path = f'./UNSW/output{i}.csv'
    if not os.path.isfile(csv_path):
        continue
    write_log(f'<<<<<<<<----- Started Reading CSV File {i} ----->>>>>>>>')
    frames.append(pd.read_csv(csv_path))
    write_log(f'------------ CSV File {i} added to DataFrame ------------')
    if i%6 == 0:
        _write_eda_chunk(frames, k)
        frames = []
        k = k+1

# Files read after the last flush (e.g. index 19) were previously accumulated
# but never written; save them as a final, smaller chunk.
if frames:
    _write_eda_chunk(frames, k)
    frames = []
    k = k+1

  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')
  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')
  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')
  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')
  temp_df = pd.read_csv(f'./UNSW/output{i}.csv')


# Exporting Files

In [None]:
def hex_to_dec(hex_str):
    """Convert a hex string into a list of integer byte values.

    The string is read two characters at a time and each pair is parsed as a
    base-16 number; an odd trailing character is parsed on its own.

    Args:
        hex_str: Hex-encoded text such as ``'00ff10'``. Missing values
            (``None``, NaN or any other non-text cell) are accepted.

    Returns:
        A list of ints, one per character pair; an empty list for an empty
        string or a missing value.

    Raises:
        ValueError: If a pair is not valid hexadecimal.
    """
    # Empty CSV cells come back from pandas as float NaN, which has no len();
    # treat any non-text value as "no bytes" instead of crashing the .apply().
    if not isinstance(hex_str, (str, bytes, bytearray)):
        return []
    return [int(hex_str[i:i+2], 16) for i in range(0, len(hex_str), 2)]

# First packet_id to hand out; the export loop advances it file by file.
start_value = 1

# Load the flow table and keep only the flow id plus the address/port/protocol
# columns used to match packets to flows.
flow_columns = ['flow_id', 'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol']
flow1 = pd.read_csv('./UNSW/Export/UNSW_Flow.csv')[flow_columns]

# flow2 is the same table with source and destination swapped, so a packet
# travelling in either direction matches the same flow_id.
swapped_endpoints = {
    'source_ip': 'destination_ip',
    'destination_ip': 'source_ip',
    'source_port': 'destination_port',
    'destination_port': 'source_port',
}
flow2 = flow1.rename(columns=swapped_endpoints)

# Stack both directions and keep the first row for each distinct key
# (every column except flow_id).
flow = pd.concat([flow1, flow2])
flow_key_columns = flow.columns.difference(['flow_id'])
flow = flow.drop_duplicates(subset=flow_key_columns)

# For every raw CSV that exists: attach flow ids and globally numbered packet
# ids, then write three exports per file — payload bytes, packet bytes and
# packet fields — de-duplicating each on everything except the two id columns.
for i in range(20):

    csv_path = f'./UNSW/output{i}.csv'
    if not os.path.isfile(csv_path):
        continue
    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>\n')
    df = pd.read_csv(csv_path)

    write_log(f'------------ BEGIN PACKET FLOW MERGE ------------')
    df.rename(columns={'packet': 'packet_hex', 'payload': 'payload_hex', 'srcip': 'source_ip', 'dstip': 'destination_ip', 'sport': 'source_port', 'dsport': 'destination_port', 'protocol_m': 'protocol'}, inplace=True)
    df['source_port'] = df['source_port'].astype(int)
    df['destination_port'] = df['destination_port'].astype(int)
    columns_to_match = ['source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol']
    # Left merge: packets without a matching flow keep a missing flow_id.
    df = df.merge(flow, on=columns_to_match, how='left')
    flow_id = df.pop('flow_id')
    df.insert(0, 'flow_id', flow_id)
    # One id per row of this file, continuing from the previous file.
    end_value = start_value + len(df)
    df['packet_id'] = range(start_value, end_value)
    # Two hex characters per byte; non-text (missing) payloads count as length 0.
    df['payload_length'] = df.payload_hex.apply(lambda x: len(x)//2 if isinstance(x, str) else 0)
    write_log(f'------------ END PACKET FLOW MERGE ------------')

    # Identification columns shared by both byte-level exports.
    id_columns = ['packet_id', 'flow_id', 'source_ip', 'source_port', 'destination_ip', 'destination_port', 'protocol', 'payload_length']

    write_log(f'------------ BEGIN PAYLOAD BYTES PROCESSING ------------')
    # Only packets with a 'Raw load' value are exported as payload bytes. The
    # index is reset so the rows line up with the positional byte matrix below.
    df1 = df.dropna(subset=['Raw load']).reset_index(drop=True)
    dec_data = df1['payload_hex'].apply(hex_to_dec)
    max_len = dec_data.apply(len).max()
    # Shorter payloads are padded with missing values up to the longest one.
    df_final = pd.DataFrame(dec_data.tolist(), columns=[f'payload_byte_{n}' for n in range(1,max_len+1)])
    df_final = pd.concat([df1[id_columns], df_final], axis=1)
    df_final['attack_label'] = df1['attack_cat']
    df_final.drop_duplicates(subset=df_final.columns.difference(['packet_id', 'flow_id']), inplace=True)
    write_log(f'------------ END PAYLOAD BYTES PROCESSING ------------')

    df_final.to_csv(f'./UNSW/Export/Payload-Bytes/Payload_Bytes_File_{i}.csv', index=False)
    write_log(f'------------ CSV File {i} Saved for Payload Bytes ------------')

    # Free the wide byte matrices before building the next one.
    del dec_data
    del df_final
    del df1

    write_log(f'------------ BEGIN PACKET BYTES PROCESSING ------------')
    dec_data = df['packet_hex'].apply(hex_to_dec)
    max_len = dec_data.apply(len).max()
    df_final = pd.DataFrame(dec_data.tolist(), columns=[f'packet_byte_{n}' for n in range(1,max_len+1)])
    df_final = pd.concat([df[id_columns], df_final], axis=1)
    df_final['attack_label'] = df['attack_cat']
    df_final.drop_duplicates(subset=df_final.columns.difference(['packet_id', 'flow_id']), inplace=True)
    write_log(f'------------ END PACKET BYTES PROCESSING ------------')

    df_final.to_csv(f'./UNSW/Export/Packet-Bytes/Packet_Bytes_File_{i}.csv', index=False)
    write_log(f'------------ CSV File {i} Saved for Packet Bytes ------------')

    del dec_data
    del df_final

    write_log(f'------------ BEGIN PACKET FIELDS PROCESSING ------------')
    df.drop(['stime', 'sttl', 'total_len', 'first_layer', 't_delta', 'stime_flow', 'label', 'ltime_max', 'payload_length'], axis=1, inplace=True)
    # Move the label to the last column and packet_id to the first.
    attack_label = df.pop('attack_cat')
    df.insert(len(df.columns), 'attack_label', attack_label)
    packet_id = df.pop('packet_id')
    df.insert(0, 'packet_id', packet_id)
    df.drop_duplicates(subset=df.columns.difference(['packet_id', 'flow_id']), inplace=True)
    # Ids were issued for every row *before* de-duplication, so the next file
    # must continue from end_value. Advancing by the de-duplicated length made
    # the next file reuse ids already assigned in this one.
    start_value = end_value
    counts = '\n'.join([f'{key}:{value}' for key, value in df.attack_label.value_counts().to_dict().items()])
    # `with` closes the summary file even if a write fails.
    with open("UNSW_INFO.txt", "a") as f:
        f.write(f'TOTAL PACKETS IN CSV FILE {i}: {df.shape}\n')
        f.write(f'ATTACK LABELS IN CSV FILE {i}\n' + counts + '\n\n')
    write_log(f'------------ END PACKET FIELDS PROCESSING ------------')

    df.to_csv(f'./UNSW/Export/Packet-Fields/Packet_Fields_File_{i}.csv', index=False)
    write_log(f'------------ CSV File {i} Saved for Packet Fields ------------\n')

    del df

  df = pd.read_csv(f'./UNSW/output{i}.csv')
IOStream.flush timed out
  df.insert(0, 'flow_id', flow_id)
  df['packet_id'] = range(start_value, end_value)
  df['payload_length'] = df.payload_hex.apply(lambda x: len(x)//2 if isinstance(x, str) else 0)


In [None]:
for i in range(2,19):
    file_name_1 = f"./UNSW/Export/Payload-Bytes/Payload_Bytes_File_{i}.csv"
    zip_file_name_1 = f"./UNSW/Export/Payload-Bytes/Payload_Bytes_File_{i}"
    !zip {zip_file_name_1} {file_name_1}
    
    file_name_2 = f"./UNSW/Export/Packet-Bytes/Packet_Bytes_File_{i}.csv"
    zip_file_name_2 = f"./UNSW/Export/Packet-Bytes/Packet_Bytes_File_{i}"
    !zip {zip_file_name_2} {file_name_2}
    
    file_name_3 = f"./UNSW/Export/Packet-Fields/Packet_Fields_File_{i}.csv"
    zip_file_name_3 = f"./UNSW/Export/Packet-Fields/Packet_Fields_File_{i}"
    !zip {zip_file_name_3} {file_name_3}
    
    write_log(f"FILE {i} ZIPPED SUCCESSFULLY")

  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_2.csv (deflated 84%)
  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_2.csv (deflated 90%)
  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_2.csv (deflated 82%)
  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_3.csv (deflated 85%)
  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_3.csv (deflated 90%)
  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_3.csv (deflated 82%)
  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_4.csv (deflated 84%)
  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_4.csv (deflated 89%)
  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_4.csv (deflated 81%)
  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_5.csv (deflated 84%)
  adding: UNSW/Export/Packet-Bytes/Packet_Bytes_File_5.csv (deflated 90%)
  adding: UNSW/Export/Packet-Fields/Packet_Fields_File_5.csv (deflated 81%)
  adding: UNSW/Export/Payload-Bytes/Payload_Bytes_File_6.csv (deflated 84%)
  adding: UNSW/Expor

In [None]:
for i in range(2,19):
    file_name_1 = f"./UNSW/Export/Payload-Bytes/UNSW/Export/Payload-Bytes/Payload_Bytes_File_{i}.zip"
    !unzip {file_name_1}
    
    file_name_2 = f"./UNSW/Export/Packet-Bytes/Packet_Bytes_File_{i}.zip"
    !unzip {file_name_2}
    
    file_name_3 = f"./UNSW/Export/Packet-Fields/Packet_Fields_File_{i}.zip"
    !unzip {file_name_3}
    
    write_log(f"FILE {i} UNZIPPED SUCCESSFULLY")

# Upload Files to Hugging Face

In [16]:
from huggingface_hub import login, HfApi

In [17]:
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [18]:
api = HfApi()

In [19]:
# Push the parquet files of the local Payload-Bytes export folder to the
# `Payload-Bytes/` directory of the Hugging Face dataset repository.
# NOTE(review): only Payload-Bytes is uploaded here; Packet-Bytes and
# Packet-Fields are presumably uploaded by editing and re-running this cell — confirm.
api.upload_folder(
            folder_path="./UNSW/Export/Payload-Bytes",
            repo_id="rdpahalavan/UNSW-NB15",
            repo_type="dataset",
            path_in_repo="Payload-Bytes/",
            allow_patterns="*.parquet",  # only parquet files are considered for upload
            delete_patterns="*.parquet",  # presumably removes remote parquet files that are missing locally — confirm in the hub docs
            multi_commits=True,  # upload as several commits on a pull request (the cell output reports 18 addition commits)
            multi_commits_verbose=True
        )

Will create 0 deletion commit(s) and 18 addition commit(s), totalling 18 atomic operations.
Multi-commits strategy with ID c0d23037c50ff24f17fe6205fe8d24953f66f9bb1a5fe08d9c3f554df126b183.
New PR created: https://huggingface.co/datasets/rdpahalavan/UNSW-NB15/discussions/6
  step e5d7d65f25be74ce89fff63a6b96b95782c5dbdf47e8e1716009d478c7b572ce completed (still 17 to go).


Payload_Bytes_File_10.parquet:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

  step 2ef2628c74ccc33972d258e1cac6b6c02cab17b2401fd61743e613580847d5a6 completed (still 16 to go).


Payload_Bytes_File_11.parquet:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

  step 84c65ef7a0d01daf9689d92167c7318f734c6230063c33511379028c40cd0d5b completed (still 15 to go).


Payload_Bytes_File_12.parquet:   0%|          | 0.00/2.71G [00:00<?, ?B/s]

  step 745c5b4c477e36bc4055be82d170b531c43d4e2634d46917a9936586833c708e completed (still 14 to go).
  step 6b97d17426eb7cf353e0a6bfe9e24b94e74cbcb2c416a41bae3f2d27145acbbd completed (still 13 to go).


Payload_Bytes_File_14.parquet:   0%|          | 0.00/2.72G [00:00<?, ?B/s]

  step 5d85060e987bade56ba3efc8cbfa7376c1a9665fe5979125ec1d8a62a21c37ef completed (still 12 to go).


Payload_Bytes_File_15.parquet:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

  step c4228cf15809d9ca0d0017481294defe69f379fec642044b7b17f1c7ffb60547 completed (still 11 to go).


Payload_Bytes_File_16.parquet:   0%|          | 0.00/2.73G [00:00<?, ?B/s]

  step aff20752c897767cbf949734c348a3e4f622cfec16cbee08129c9826481d885d completed (still 10 to go).
  step db23a788bdb0f1a78e74c43073860dedf3cc7ba4d9953cb027ad05f04c6c7d7a completed (still 9 to go).


Payload_Bytes_File_18.parquet:   0%|          | 0.00/2.60G [00:00<?, ?B/s]

  step ea971f8d3ba3a48b67a63354b6471bb644e76dac7055303715cf2053f3479b3c completed (still 8 to go).


Payload_Bytes_File_2.parquet:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

  step 869d5af7aea9bc530e236623fef592d76329f2dd3341f312e1bc5dd05418aabd completed (still 7 to go).


Payload_Bytes_File_3.parquet:   0%|          | 0.00/2.62G [00:00<?, ?B/s]

  step 89802d83893758ff601ee9bb4a71b5149aac0f835c952d6dbf360f7098cd3185 completed (still 6 to go).


Payload_Bytes_File_4.parquet:   0%|          | 0.00/2.69G [00:00<?, ?B/s]

  step 2cbaec6226fc8e1e720fae55fb60994cfce8d4314c2ff104990b8d38d745d3d8 completed (still 5 to go).


Payload_Bytes_File_5.parquet:   0%|          | 0.00/2.66G [00:00<?, ?B/s]

  step 57e026ef17fd2747ac6003ea775feba275357e16c1d5026ee9515cf426f92c3f completed (still 4 to go).


Payload_Bytes_File_6.parquet:   0%|          | 0.00/2.59G [00:00<?, ?B/s]

  step adde2d0a4e91adf93eede7be3dd4f8a29b10753f4cd9d3b273fa43029553c955 completed (still 3 to go).


Payload_Bytes_File_7.parquet:   0%|          | 0.00/2.56G [00:00<?, ?B/s]

  step 251846617191479bb706501f1e467ef30a880304d0fa503a7b2176eb2e3b260d completed (still 2 to go).


Payload_Bytes_File_8.parquet:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

  step b9de4efa248aee15ae581c33e1ce9d02cbc06d5501588535d686be17d6c58c54 completed (still 1 to go).


Payload_Bytes_File_9.parquet:   0%|          | 0.00/2.57G [00:00<?, ?B/s]

  step 7db7d90a32aae5af665f96af3682f59ba3c9281763c7d92402a12d9433e052ab completed (still 0 to go).
All commits have been pushed.
PR is now open for reviews.
PR has been automatically merged (`merge_pr=True` was passed).


'https://huggingface.co/datasets/rdpahalavan/UNSW-NB15/tree/main/Payload-Bytes/'

# CSV to Parquet

In [None]:
# (folder, file stem, log label) of the three export flavours.
parquet_targets = (
    ('Payload-Bytes', 'Payload_Bytes_File', 'Payload Bytes'),
    ('Packet-Bytes', 'Packet_Bytes_File', 'Packet Bytes'),
    ('Packet-Fields', 'Packet_Fields_File', 'Packet Fields'),
)

# Convert each export CSV to a Parquet file next to it, after switching the
# columns to pandas' nullable dtypes.
# NOTE(review): the range starts at file 3 although later cells read the
# Parquet files 1..18 — files 1 and 2 were presumably converted in an earlier
# run; confirm before relying on a top-to-bottom execution.
for i in range(3,19):
    write_log(f'<<<<<<<<----- Started Processing CSV File {i} ----->>>>>>>>\n')
    for folder, stem, label in parquet_targets:
        base_path = f'./UNSW/Export/{folder}/{stem}_{i}'
        df = pd.read_csv(f'{base_path}.csv', low_memory=False).convert_dtypes()
        df.to_parquet(f'{base_path}.parquet', index=False)
        # Drop the frame before loading the next file to limit peak memory.
        del df
        write_log(f'------------ {label} Exported ------------')

In [8]:
import pandas as pd
import numpy as np

# Gather every column name that appears in any of the 18 Payload-Bytes CSVs.
all_columns = set()

for i in range(1, 19):
    csv_path = f'./UNSW/Export/Payload-Bytes/Payload_Bytes_File_{i}.csv'
    # nrows=0 parses only the header row, so no data rows are loaded.
    df = pd.read_csv(csv_path, nrows=0)
    all_columns.update(df.columns)

In [9]:
# Number of distinct column names collected in the all_columns set.
len(all_columns)

1485

In [11]:
# Pad each Payload-Bytes parquet file so that all 18 files carry every column
# name collected in `all_columns`. Files that already have the full set are
# left untouched on disk.
for i in range(1, 19):
    parquet_path = f'./UNSW/Export/Payload-Bytes/Payload_Bytes_File_{i}.parquet'
    df = pd.read_parquet(parquet_path)

    # Sort the missing names: iterating the raw set difference yields an
    # arbitrary order that can differ between files and between runs, which
    # would leave the appended columns in an unpredictable order.
    missing_cols = sorted(all_columns - set(df.columns))

    if missing_cols:
        # Build the all-missing columns on the same index so concat aligns
        # row-for-row, and convert them to NA-aware dtypes like the rest.
        df_missing_cols = pd.DataFrame(
            {col: np.nan for col in missing_cols}, index=df.index
        ).convert_dtypes()
        df = pd.concat([df, df_missing_cols], axis=1)
        df.to_parquet(parquet_path, index=False)
        del df_missing_cols

    # Release the frame on both paths before the next file is loaded.
    del df
    print(f"Completed: {i}")

Completed: 1
Completed: 2
Completed: 3
Completed: 4
Completed: 5
Completed: 6
Completed: 7
Completed: 8
Completed: 9
Completed: 10
Completed: 11
Completed: 12
Completed: 13
Completed: 14
Completed: 15
Completed: 16
Completed: 17
Completed: 18


In [12]:
# Load the first Payload-Bytes parquet file into memory for inspection.
df = pd.read_parquet('UNSW/Export/Payload-Bytes/Payload_Bytes_File_1.parquet')

In [13]:
# Display the loaded frame; the notebook renders a truncated preview.
df

Unnamed: 0,packet_id,flow_id,source_ip,source_port,destination_ip,destination_port,protocol,payload_length,payload_byte_1,payload_byte_2,...,payload_byte_1468,payload_byte_1469,payload_byte_1470,payload_byte_1471,payload_byte_1472,payload_byte_1473,payload_byte_1474,payload_byte_1475,payload_byte_1476,attack_label
0,1,117,10.40.182.1,0,224.0.0.5,0,ospf,44,2,1,...,,,,,,,,,,normal
1,2,118,10.40.85.1,0,224.0.0.5,0,ospf,44,2,1,...,,,,,,,,,,normal
2,33,6,149.171.126.9,111,59.166.0.0,32119,udp,48,54,239,...,,,,,,,,,,normal
3,36,2,149.171.126.9,1024,59.166.0.0,33661,udp,72,87,198,...,,,,,,,,,,normal
4,37,2,149.171.126.9,1024,59.166.0.0,33661,udp,24,33,153,...,,,,,,,,,,normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2503011,9999929,115904,59.166.0.8,51456,149.171.126.3,6881,tcp,17,0,0,...,,,,,,,,,,normal
2503012,9999932,115900,59.166.0.3,31666,149.171.126.5,22,tcp,48,38,26,...,,,,,,,,,,normal
2503013,9999946,115900,59.166.0.3,31666,149.171.126.5,22,tcp,48,178,44,...,,,,,,,,,,normal
2503014,9999987,115922,59.166.0.8,4793,149.171.126.9,22,tcp,616,0,0,...,,,,,,,,,,normal


In [15]:
# Column count of the loaded frame, for comparison with len(all_columns).
len(df.columns)

1485

In [1]:
import pandas as pd
import pyarrow.parquet as pq
from collections import defaultdict

# Names of the 18 Packet-Fields parquet files to inspect.
files = ["Packet_Fields_File_%d.parquet" % number for number in range(1, 19)]

# column name -> list of the dtypes observed for that column, one per file
# in which the column is present.
column_types = defaultdict(list)

for file in files:
    # The frame is bound to `df` so that the next load replaces the previous one.
    df = pd.read_parquet(f"./UNSW/Export/Packet-Fields/{file}")
    for column, dtype in zip(df.columns, df.dtypes):
        column_types[column].append(dtype)
    print(f"Completed: {file}")

Completed: Packet_Fields_File_1.parquet
Completed: Packet_Fields_File_2.parquet
Completed: Packet_Fields_File_3.parquet
Completed: Packet_Fields_File_4.parquet
Completed: Packet_Fields_File_5.parquet
Completed: Packet_Fields_File_6.parquet
Completed: Packet_Fields_File_7.parquet
Completed: Packet_Fields_File_8.parquet
Completed: Packet_Fields_File_9.parquet
Completed: Packet_Fields_File_10.parquet
Completed: Packet_Fields_File_11.parquet
Completed: Packet_Fields_File_12.parquet
Completed: Packet_Fields_File_13.parquet
Completed: Packet_Fields_File_14.parquet
Completed: Packet_Fields_File_15.parquet
Completed: Packet_Fields_File_16.parquet
Completed: Packet_Fields_File_17.parquet
Completed: Packet_Fields_File_18.parquet


In [2]:
from collections import Counter

# For every column keep the dtype that was observed most often across files.
# most_common(1) returns a one-element list of (dtype, count); on a tie the
# dtype encountered first wins.
majority_column_types = {
    column: Counter(types).most_common(1)[0][0]
    for column, types in column_types.items()
}

In [7]:
# Rewrite each Packet-Fields parquet file so every column uses the dtype
# recorded for it in majority_column_types.
for i in range(1, 19):
    parquet_path = f'./UNSW/Export/Packet-Fields/Packet_Fields_File_{i}.parquet'
    df = pd.read_parquet(parquet_path)

    # Identify the columns whose dtype in this file differs from the majority.
    mismatched = [
        name
        for name, current in df.dtypes.items()
        if current != majority_column_types[name]
    ]
    # Cast one column at a time rather than the whole frame at once.
    for name in mismatched:
        df[name] = df[name].astype(majority_column_types[name])

    # The file is written back even when nothing needed casting.
    df.to_parquet(parquet_path, index=False)
    del df
    print(f"Completed: {i}")

Completed: 1
Completed: 2
Completed: 3
Completed: 4
Completed: 5
Completed: 6


IOStream.flush timed out


Completed: 7
Completed: 8
Completed: 9
Completed: 10
Completed: 11
Completed: 12
Completed: 13
Completed: 14
Completed: 15


IOStream.flush timed out


Completed: 16
Completed: 17
Completed: 18
