In [25]:
# Mount Drive, Data is saved on my google drive account
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import pickle
import os
import ipaddress
import hashlib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler

# Initialize some colors
R = "\033[91m"  # Red text
W = "\033[0m"   # Reset to default
G = "\033[92m"  # Green text

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# Initialize Root directory and path to pickle
# Using visualzation data since dtypes are already configured
PROJECT_ROOT = '/content/drive/MyDrive/IDS'
DATA_PATH = os.path.join(PROJECT_ROOT, 'data/data.pkl')
print(DATA_PATH)

/content/drive/MyDrive/IDS/data/data.pkl


In [27]:
# Load Data
try:
  data = pd.read_pickle(DATA_PATH)
  print(f'{G} Data Loaded! {W}')
except:
  print(f'{R} Data Not Found! {W}')
data.info()

[92m Data Loaded! [0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243021 entries, 0 to 1243020
Data columns (total 22 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   ts              1243021 non-null  float64
 1   uid             1243021 non-null  object 
 2   src_ip          1243021 non-null  object 
 3   src_p           1243021 non-null  int64  
 4   dst_ip          1243021 non-null  object 
 5   dst_p           1243021 non-null  int64  
 6   proto           1243021 non-null  object 
 7   service         1243021 non-null  object 
 8   duration        1243021 non-null  object 
 9   src_bytes       1243021 non-null  object 
 10  dst_bytes       1243021 non-null  object 
 11  conn_state      1243021 non-null  object 
 12  local_orig      1243021 non-null  object 
 13  local_resp      1243021 non-null  object 
 14  missed_bytes    1243021 non-null  int64  
 15  history         1243021 non-null  object 
 16  orig_pkts   

# Feature Engineering
- we noticed in our EDA that orig_bytes and orig_pkts, etc.., where found highly corelatted thus to reduce the complexity of our model we will combine them


## Pre Processing
- one hot encode everything but UID, and History
- we will hash UID and History

In [28]:
 # encoded data
 # Missed Bytes is mostly 0's we will drop it
 # f_data['missed_bytes'].value_counts()

 def preprocessing(df):

    # so pandas resets index whenever it wants i guess...

  # Data Type Assignment
  df['duration'] = pd.to_numeric(df['duration'], errors='coerce')  # Convert duration to numeric (coerce invalid values to NaN)
  df['src_bytes'] = pd.to_numeric(df['src_bytes'], errors='coerce')  # Convert orig_bytes to numeric
  df['dst_bytes'] = pd.to_numeric(df['dst_bytes'], errors='coerce')  # Convert resp_bytes to numeric
  df['orig_pkts'] = pd.to_numeric(df['orig_pkts'], errors='coerce')  # Convert orig_pkts to numeric
  df['resp_pkts'] = pd.to_numeric(df['resp_pkts'], errors='coerce')  # Convert resp_pkts to numeric
  df['orig_ip_bytes'] = pd.to_numeric(df['orig_ip_bytes'], errors='coerce')  # Convert orig_ip_bytes to numeric
  df['resp_ip_bytes'] = pd.to_numeric(df['resp_ip_bytes'], errors='coerce')  # Convert resp_ip_bytes to numeric
  df['duration'] = pd.to_numeric(df['duration'], errors='coerce')  # Convert duration to numeric


  # Feature Engineering
  # correlation with other features doesn't determine usefulness for your target variable.
  f_data = df.copy()
# Create your engineered features
  f_data['dst_bytes_per_orig_pkt'] = (f_data['dst_bytes'] / f_data['orig_pkts']).replace(0, 1)
  f_data['dst_bytes_per_resp_pkt'] = (f_data['dst_bytes'] / f_data['resp_pkts']).replace(0, 1)


    # Replace infinite values with a reasonable fallback
  f_data['dst_bytes_per_orig_pkt'] = f_data['dst_bytes_per_orig_pkt'].replace([np.inf, -np.inf], 0)
    # Drop the component features you've engineered from, but keep src_bytes
  f_data.drop(['orig_pkts', 'dst_bytes', 'resp_pkts', 'orig_ip_bytes', 'resp_ip_bytes'], axis=1, inplace=True)
  # Drop Missed_bytes
  f_data = f_data.drop(columns=['missed_bytes'], )
  e_data = f_data.copy()


  # Set ip datatype
  temp1 = []
  for x in e_data['src_ip']:
    try:
      temp1.append(int(ipaddress.IPv4Address(x)) % 65535)
    except:
      temp1.append(int(ipaddress.IPv6Address(x)) % 4294967295 )
  e_data['src_ip'] = pd.Series(temp1)

  temp = []
  for x in e_data['dst_ip']:
    try:
      temp.append(int(ipaddress.IPv4Address(x)) % 65535 )
    except:
      temp.append(int(ipaddress.IPv6Address(x)) % 4294967295 )


    # One Hot Encode
  e_data = pd.get_dummies(e_data, columns=['proto', 'service', 'conn_state', 'local_orig', 'local_resp' ], drop_first=True, dtype=float)

  e_data['dst_ip'] = pd.Series(temp)
  e_data['uid'] = hash(str(e_data['uid'])) % 100000
  e_data['history'] = hash(str(e_data['history'])) % 100000
  e_data['src_ip'] = e_data['src_ip'].astype('uint64')
  e_data['dst_ip'] = e_data['dst_ip'].astype('uint64')
  e_data.drop(columns=['ts', 'tunnel_parents'],inplace=True)
  e_data = e_data.fillna(0)
  return e_data


e_data = preprocessing(data)
e_data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1243021 entries, 0 to 1243020
Data columns (total 55 columns):
 #   Column                           Non-Null Count    Dtype  
---  ------                           --------------    -----  
 0   uid                              1243021 non-null  int64  
 1   src_ip                           1243021 non-null  uint64 
 2   src_p                            1243021 non-null  int64  
 3   dst_ip                           1243021 non-null  uint64 
 4   dst_p                            1243021 non-null  int64  
 5   duration                         1243021 non-null  float64
 6   src_bytes                        1243021 non-null  float64
 7   history                          1243021 non-null  int64  
 8   ip_proto                         1243021 non-null  int64  
 9   dst_bytes_per_orig_pkt           1243021 non-null  float64
 10  dst_bytes_per_resp_pkt           1243021 non-null  float64
 11  proto_tcp                        1243021 non-null 

In [29]:
e_data

Unnamed: 0,uid,src_ip,src_p,dst_ip,dst_p,duration,src_bytes,history,ip_proto,dst_bytes_per_orig_pkt,...,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_S3,conn_state_SF,conn_state_SH,conn_state_SHR,local_orig_T,local_resp_T
0,46111,2565,65430,52995,443,0.176656,0.0,94291,6,6717.882353,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,46111,2565,65289,64620,443,0.089289,24.0,94291,6,10.833333,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,46111,2565,65430,52995,443,0.213978,1544.0,94291,6,5407.344828,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,46111,4265057297,52768,518915418,53,0.019330,33.0,94291,17,83.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,46111,4265057297,55558,518915418,53,0.017523,33.0,94291,17,61.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1243016,46111,2803929263,55210,1997376201,53,0.018985,32.0,94291,17,115.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1243017,46111,2803929263,52624,1997376201,53,0.018985,32.0,94291,17,77.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1243018,46111,2803929263,58622,1997376201,53,0.011106,36.0,94291,17,96.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1243019,46111,2803929263,50954,1997376201,53,0.015250,36.0,94291,17,96.000000,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [30]:
PATH = os.path.join(PROJECT_ROOT, "pkl")
PICKLE_FILE_PATH = os.path.join(PATH, "p_data.pkl")
print(f"\nSaving Dataframe to {PICKLE_FILE_PATH}")
e_data.to_pickle(PICKLE_FILE_PATH)
print("\n Pickled Saved!")
FUNCTION_FILE_PATH = os.path.join(PATH, "preprocess_function.pkl")
with open(FUNCTION_FILE_PATH, 'wb') as f:
    pickle.dump(preprocessing, f)


Saving Dataframe to /content/drive/MyDrive/IDS/pkl/p_data.pkl

 Pickled Saved!
