In [80]:
# Mount Drive, Data is saved on my google drive account
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import pickle
import os
import ipaddress
import hashlib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Initialize some colors
R = "\033[91m"  # Red text
W = "\033[0m"   # Reset to default
G = "\033[92m"  # Green text

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [81]:
# Initialize Root directory and path to pickle
# Using visualzation data since dtypes are already configured
PROJECT_ROOT = '/content/drive/MyDrive/IDS'
DATA_PATH = os.path.join(PROJECT_ROOT, 'data/data.pkl')
print(DATA_PATH)

/content/drive/MyDrive/IDS/data/data.pkl


In [82]:
# Load Data
try:
  data = pd.read_pickle(DATA_PATH)
  print(f'{G} Data Loaded! {W}')
except:
  print(f'{R} Data Not Found! {W}')
data.info()

[92m Data Loaded! [0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589530 entries, 0 to 589529
Data columns (total 22 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   ts              589530 non-null  float64
 1   uid             589530 non-null  object 
 2   src_ip          589530 non-null  object 
 3   src_p           589530 non-null  int64  
 4   dst_ip          589530 non-null  object 
 5   dst_p           589530 non-null  int64  
 6   proto           589530 non-null  object 
 7   service         589530 non-null  object 
 8   duration        589530 non-null  object 
 9   src_bytes       589530 non-null  object 
 10  dst_bytes       589530 non-null  object 
 11  conn_state      589530 non-null  object 
 12  local_orig      589530 non-null  object 
 13  local_resp      589530 non-null  object 
 14  missed_bytes    589530 non-null  int64  
 15  history         589530 non-null  object 
 16  orig_pkts       589530 non-null 

# Feature Engineering
- we noticed in our EDA that orig_bytes and orig_pkts, etc.., where found highly corelatted thus to reduce the complexity of our model we will combine them


## Pre Processing
- one hot encode everything but UID, and History
- we will hash UID and History

In [83]:
 # encoded data
 # Missed Bytes is mostly 0's we will drop it
 # f_data['missed_bytes'].value_counts()

 def preprocessing(df):

    # SO IMPORTANT so pandas resets index whenever it wants i guess...

  # Data Type Assignment
  df['duration'] = pd.to_numeric(df['duration'], errors='coerce')  # Convert duration to numeric (coerce invalid values to NaN)
  df['src_bytes'] = pd.to_numeric(df['src_bytes'], errors='coerce')  # Convert orig_bytes to numeric
  df['dst_bytes'] = pd.to_numeric(df['dst_bytes'], errors='coerce')  # Convert resp_bytes to numeric
  df['orig_pkts'] = pd.to_numeric(df['orig_pkts'], errors='coerce')  # Convert orig_pkts to numeric
  df['resp_pkts'] = pd.to_numeric(df['resp_pkts'], errors='coerce')  # Convert resp_pkts to numeric
  df['orig_ip_bytes'] = pd.to_numeric(df['orig_ip_bytes'], errors='coerce')  # Convert orig_ip_bytes to numeric
  df['resp_ip_bytes'] = pd.to_numeric(df['resp_ip_bytes'], errors='coerce')  # Convert resp_ip_bytes to numeric
  df['duration'] = pd.to_numeric(df['duration'], errors='coerce')  # Convert duration to numeric


  # Feature Engineering
  f_data = df.copy()
  f_data['src_bytes/src_pkts'] = f_data['src_bytes'] + f_data['orig_pkts']
  f_data['dst_bytes/dst_pkts'] = f_data['dst_bytes'] + f_data['resp_pkts']
  f_data.drop(['src_bytes', 'orig_pkts', 'dst_bytes', 'resp_pkts', 'orig_ip_bytes', 'resp_ip_bytes'], axis=1, inplace=True)
  # Everything Looks good!
  # Drop Missed_bytes
  fv2_data = df.drop(columns=['missed_bytes'])
  e_data = f_data.copy()


  # Set ip datatype
  temp1 = []
  for x in e_data['src_ip']:
    try:
      temp1.append(int(ipaddress.IPv4Address(x)) % 65535)
    except:
      temp1.append(int(ipaddress.IPv6Address(x)) % 4294967295 )
  e_data['src_ip'] = pd.Series(temp1)

  temp = []
  for x in e_data['dst_ip']:
    try:
      temp.append(int(ipaddress.IPv4Address(x)) % 65535 )
    except:
      temp.append(int(ipaddress.IPv6Address(x)) % 4294967295 )


    # One Hot Encode
  e_data = pd.get_dummies(e_data, columns=['proto', 'service', 'conn_state', 'local_orig', 'local_resp' ], drop_first=True, dtype=float)

  e_data['dst_ip'] = pd.Series(temp)
  e_data['uid'] = hash(str(e_data['uid'])) % 100000
  e_data['history'] = hash(str(e_data['history'])) % 100000
  e_data['src_ip'] = e_data['src_ip'].astype('uint64')
  e_data['dst_ip'] = e_data['dst_ip'].astype('uint64')
  e_data.drop(columns=['ts', 'tunnel_parents'],inplace=True)
  e_data = e_data.fillna(0)
  return e_data


e_data = preprocessing(data)
e_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 589530 entries, 0 to 589529
Data columns (total 53 columns):
 #   Column                           Non-Null Count   Dtype  
---  ------                           --------------   -----  
 0   uid                              589530 non-null  int64  
 1   src_ip                           589530 non-null  uint64 
 2   src_p                            589530 non-null  int64  
 3   dst_ip                           589530 non-null  uint64 
 4   dst_p                            589530 non-null  int64  
 5   duration                         589530 non-null  float64
 6   missed_bytes                     589530 non-null  int64  
 7   history                          589530 non-null  int64  
 8   ip_proto                         589530 non-null  int64  
 9   src_bytes/src_pkts               589530 non-null  float64
 10  dst_bytes/dst_pkts               589530 non-null  float64
 11  proto_tcp                        589530 non-null  float64
 12  pr

In [84]:
e_data

Unnamed: 0,uid,src_ip,src_p,dst_ip,dst_p,duration,missed_bytes,history,ip_proto,src_bytes/src_pkts,...,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR,local_orig_T,local_resp_T
0,32758,2565,65430,52995,443,0.176656,0,39353,6,34.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,32758,2565,65289,64620,443,0.089289,0,39353,6,30.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,32758,2565,65430,52995,443,0.213978,0,39353,6,1573.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,32758,4265057297,52768,518915418,53,0.019330,0,39353,17,34.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,32758,4265057297,55558,518915418,53,0.017523,0,39353,17,34.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589525,32758,3998337667,1,2887303159,0,156.840116,0,39353,58,8387.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
589526,32758,3800188861,135,4157567228,136,838.535921,0,39353,58,875.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
589527,32758,3800188861,136,4157567228,135,871.438452,0,39353,58,612.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
589528,32758,32814,3,2565,1,892.440025,0,39353,1,124311.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [85]:
PICKLE_FILE_PATH = os.path.join(PATH, "p_data.pkl")
print(f"\nSaving Dataframe to {PICKLE_FILE_PATH}")
e_data.to_pickle(PICKLE_FILE_PATH)
print("\n Pickled Saved!")
FUNCTION_FILE_PATH = os.path.join(PATH, "preprocess_function.pkl")
with open(FUNCTION_FILE_PATH, 'wb') as f:
    pickle.dump(preprocessing, f)


Saving Dataframe to /content/drive/MyDrive/IDS/pkl/p_data.pkl

 Pickled Saved!
