In [None]:
import pandas as pd
import numpy as np
import os

In [None]:
# Root directory that is searched recursively for .csv and .parquet files.
datalocation = './datasets.uwf.edu/data/UWF-ZeekData22/'

# Column names applied to every loaded file. They are assigned by position,
# so each file is assumed to store its columns in exactly this order
# (TODO confirm against the dataset schema).
cols = [
    'resp_pkts',
    'service',
    'orig_ip_bytes',
    'local_resp',
    'missed_bytes',
    'protocol',
    'duration',
    'conn_state',
    'dest_ip',
    'orig_pkts',
    'community_id',
    'resp_ip_bytes',
    'dest_port',
    'orig_bytes',
    'local_orig',
    'datetime',
    'history',
    'resp_bytes',
    'uid',
    'src_port',
    'ts',
    'src_ip',
    'mitre_attack_tactics',
]

# Gather one frame per file and concatenate once after the walk: rebuilding
# the combined frame inside the loop copies all previously loaded rows for
# every new file.
frames = []
for root, dirs, files in os.walk(datalocation):
    for name in files:
        path = os.path.join(root, name)
        if name.endswith('.csv'):
            dfi = pd.read_csv(path)
        elif name.endswith('.parquet'):
            dfi = pd.read_parquet(path, engine='pyarrow')
        else:
            # Not a data file this notebook knows how to read.
            continue
        if dfi.shape[1] != len(cols):
            # Skip files whose column count does not match the expected layout.
            print("wrong shape: ", name)
            continue
        print(name)
        dfi.columns = cols
        frames.append(dfi)

# Fall back to an empty, correctly labelled frame when nothing was loaded.
if frames:
    df = pd.concat(frames, ignore_index=True)
else:
    df = pd.DataFrame(columns=cols)
print(df.shape)
        

In [None]:
# Replace missing orig_bytes with the sentinel -10000 before binning
# (presumably lower than any real value, so missing rows land in the lowest
# bin instead of staying NaN). The fill is done on a new Series rather than
# through chained assignment (df[col].loc[mask] = v), which writes to a
# temporary object and leaves df unchanged when Copy-on-Write is active.
orig_bytes_filled = df["orig_bytes"].where(df["orig_bytes"].notnull(), -10000)

# Discretise into up to five quantile bins; duplicates='drop' merges bins
# whose edges coincide, so fewer than five bins may result.
df['orig_bytes'] = pd.qcut(
    orig_bytes_filled.astype(float),
    q=5,
    duplicates='drop')

In [None]:
# Replace missing orig_pkts with the sentinel -10000 before binning
# (presumably lower than any real value, so missing rows land in the lowest
# bin instead of staying NaN). The fill is done on a new Series rather than
# through chained assignment (df[col].loc[mask] = v), which writes to a
# temporary object and leaves df unchanged when Copy-on-Write is active.
orig_pkts_filled = df["orig_pkts"].where(df["orig_pkts"].notnull(), -10000)

# Discretise into up to five quantile bins; duplicates='drop' merges bins
# whose edges coincide, so fewer than five bins may result.
df['orig_pkts'] = pd.qcut(
    orig_pkts_filled.astype(float),
    q=5,
    duplicates='drop')

IP bins (IPv4 addresses are binned by first octet, IPv6 addresses by their leading group):
- Class A: first octet 0–126;
- Class B: first octet 128–191;
- Class C: first octet 192–223;
- Class D: first octet 224–239;
- Class E: first octet 240–254;
- IPv6 link-local (`fe80:`);
- IPv6 global internet (`2001:`); multicast addresses starting with `ff02:` are grouped into this bin as well.

In [None]:
# Reduce every destination address to its leading component: the text before
# the first '.' (IPv4 first octet) and then before the first ':' (IPv6
# leading group). n=1 stops after the first separator since only the first
# piece is needed.
leading_component = (
    df['dest_ip']
    .str.split('.', n=1).str[0]
    .str.split(':', n=1).str[0]
)

# Map the IPv6 prefixes onto numeric stand-ins so the column can be cast to
# float: 'ff02' is merged with '2001', and 'fe80' becomes 999. The result is
# assigned back instead of using replace(inplace=True) on a selected column,
# which does not update df when Copy-on-Write is active.
df['dest_ip_class'] = leading_component.replace({'ff02': '2001', 'fe80': '999'})

# Right-closed bins: (-inf, 127], (127, 191], (191, 223], (223, 239],
# (239, 255], (255, 1000] (holds the 999 stand-in) and (1000, inf)
# (holds 2001).
first_component_breaks = [float('-inf'), 127, 191, 223, 239, 255, 1000, float('inf')]
edges = pd.IntervalIndex.from_breaks(first_component_breaks)

# Overwrite dest_ip with the interval each address falls into.
df['dest_ip'] = pd.cut(df['dest_ip_class'].astype(float), edges)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that are replaced by integer codes, including the binned ones.
label_encoded_columns = [
    'service',
    'protocol',
    'history',
    'mitre_attack_tactics',
    'orig_pkts',
    'orig_bytes',
    'dest_ip',
]

# One encoder instance is refitted for each column in turn, so after the loop
# it only holds the classes of the last column processed.
le = LabelEncoder()
for column in label_encoded_columns:
    encoded = le.fit_transform(df[column])
    df[column] = encoded

In [None]:
# Columns written to disk, in output order.
export_columns = [
    'orig_bytes',
    'orig_pkts',
    'history',
    'protocol',
    'service',
    'dest_ip',
    'mitre_attack_tactics',
]

# Every exported value is converted to a string before being written to
# Parquet.
preprocessed = df[export_columns].astype(str)
preprocessed.to_parquet('preprocessed_df.parquet')