# Exploratory Data Analysis

In [76]:
# Mount Google Drive inside Colab; the dataset is saved on the author's Google Drive account
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
import os
# Intended for UTC -> PDT date-time conversion.
# NOTE(review): pytz is not referenced in the visible cells (the conversion is done with
# pandas tz methods) -- confirm it is still needed.
import pytz

# ANSI escape sequences used to color the status messages printed by later cells
R = "\033[91m"  # Red text
W = "\033[0m"   # Reset to default
G = "\033[92m"  # Green text

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [77]:
# Initialize the project root directory and the paths derived from it
PROJECT_ROOT = '/content/drive/MyDrive/IDS'  # project folder on the mounted Drive
# Pickle directory. NOTE(review): PATH is not referenced in the visible cells -- confirm use.
PATH = os.path.join(PROJECT_ROOT, "pkl")
# Pickled DataFrame that the load cell reads with pd.read_pickle
DATA_PATH = os.path.join(PROJECT_ROOT, 'data/data.pkl')

In [78]:
# Load Data
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so DATA_PATH
# must only ever point at a trusted file.
try:
  data = pd.read_pickle(DATA_PATH)
except FileNotFoundError:
  # Only a missing file is reported as "not found". Re-raise so the notebook stops here
  # instead of failing on the next line with a misleading NameError on `data`; any other
  # error (corrupt pickle, permission problem, ...) propagates with its real traceback.
  print(f'{R} Data Not Found! {W}')
  raise
else:
  print(f'{G} Data Loaded! {W}')
data

[92m Data Loaded! [0m


Unnamed: 0,ts,uid,src_ip,src_p,dst_ip,dst_p,proto,service,duration,src_bytes,...,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,tunnel_parents,ip_proto
0,1.740197e+09,Covuo43LyFTNEu1Cp7,10.0.0.5,65430,99.181.107.78,443,tcp,-,0.176656,0,...,T,F,0,^dAtt,34,1876,169,237196,-,6
1,1.740197e+09,CGgOtJ35xc6GxibRug,10.0.0.5,65289,140.82.112.26,443,tcp,-,0.089289,24,...,T,F,0,^dADFaRfR,6,300,5,325,-,6
2,1.740197e+09,CX6Ki51nhyiL2A6494,10.0.0.5,65430,99.181.107.78,443,tcp,-,0.213978,1544,...,T,F,0,^dAttDa,29,3088,120,161605,-,6
3,1.740197e+09,CTSvKvuTLQBocHrOj,2601:647:cf01:5340:6d8b:e871:9ba9:5a18,52768,2001:558:feed::1,53,udp,dns,0.019330,33,...,F,F,0,Dd,1,81,1,131,-,17
4,1.740197e+09,CtmXoQ1bMnUBBi68Pa,2601:647:cf01:5340:6d8b:e871:9ba9:5a18,55558,2001:558:feed::1,53,udp,dns,0.017523,33,...,F,F,0,Dd,1,81,1,109,-,17
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589525,1.741251e+09,Cl9V7o3d5gUzwxmAmg,2601:647:cf01:5340:fa79:aff:fed6:65fb,1,2601:647:cf01:5340:210d:b639:9608:b036,0,icmp,-,156.840116,8290,...,F,F,0,-,97,12946,0,0,-,58
589526,1.741251e+09,CHc7xm1HIc1M2STFq7,fe80::833:d877:dbce:6f45,135,fe80::fa79:aff:fed6:65fb,136,icmp,-,838.535921,840,...,T,T,0,-,35,2520,35,2240,-,58
589527,1.741251e+09,C9j7yS2Mqpn8AqAIVc,fe80::833:d877:dbce:6f45,136,fe80::fa79:aff:fed6:65fb,135,icmp,-,871.438452,576,...,T,T,0,-,36,2304,16,1152,-,58
589528,1.741251e+09,ChOLqy4F73Wl6qIVsj,73.92.54.210,3,10.0.0.5,1,icmp,-,892.440025,122608,...,F,T,0,-,1703,170292,0,0,-,1


In [79]:
# Quick structural check of the loaded frame: leading index values, index dtype,
# the raw `ts` column with its dtype, and the column labels.
print("Original index values:", data.index[:5], sep="\n")
print("Index dtype:", data.index.dtype)
print(data['ts'], data['ts'].dtype, data.columns, sep="\n")

Original index values:
RangeIndex(start=0, stop=5, step=1)
Index dtype: int64
0         1.740197e+09
1         1.740197e+09
2         1.740197e+09
3         1.740197e+09
4         1.740197e+09
              ...     
589525    1.741251e+09
589526    1.741251e+09
589527    1.741251e+09
589528    1.741251e+09
589529    1.741251e+09
Name: ts, Length: 589530, dtype: float64
float64
Index(['ts', 'uid', 'src_ip', 'src_p', 'dst_ip', 'dst_p', 'proto', 'service',
       'duration', 'src_bytes', 'dst_bytes', 'conn_state', 'local_orig',
       'local_resp', 'missed_bytes', 'history', 'orig_pkts', 'orig_ip_bytes',
       'resp_pkts', 'resp_ip_bytes', 'tunnel_parents', 'ip_proto'],
      dtype='object')


In [80]:
# Simple cleaning
# data.isnull().sum() # Zero is null
# data.isna().sum() # Zero is na
def visualization_clean(df):
    """Return a new frame indexed by its ``ts`` timestamps expressed in US/Pacific time.

    The input frame is never modified, and the function is idempotent: passing a frame
    that was already cleaned (``ts`` is the index) yields an equivalent frame instead of
    raising.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding ``ts`` either as a column or as the index (index named ``'ts'``).
        Numeric values are interpreted as Unix epoch seconds; datetime values are
        treated as UTC when naive and converted when already timezone-aware. When
        ``ts`` is present as a column, the column is used.

    Returns
    -------
    pandas.DataFrame
        New frame whose index is the timezone-aware ``ts`` (US/Pacific); ``ts`` is no
        longer a column.

    Raises
    ------
    KeyError
        If ``ts`` is neither a column nor the name of the index.
    """
    if 'ts' in df.columns:
        frame = df
    elif df.index.name == 'ts':
        # Already indexed by ts (e.g. the result of a previous call): move it back to
        # a column. reset_index returns a new frame, the caller's frame is untouched.
        frame = df.reset_index()
    else:
        raise KeyError("'ts' not found in the columns or as the index name")

    ts = frame['ts']
    if pd.api.types.is_numeric_dtype(ts):
        # Raw epoch seconds -> timezone-aware UTC timestamps.
        ts_utc = pd.to_datetime(ts, unit='s', utc=True)
    else:
        # Already datetime-like: utc=True localizes naive values as UTC and converts
        # aware ones, so tz_convert below never sees a naive series.
        ts_utc = pd.to_datetime(ts, utc=True)

    # assign/set_index both build new frames, so nothing is modified in place.
    return frame.assign(ts=ts_utc.dt.tz_convert('US/Pacific')).set_index('ts')
# Keep the cleaned frame explicitly: rebinding `data` to the returned value means later
# cells do not depend on the argument having been modified as a side effect of a print.
data = visualization_clean(data)
print(data)

                                                    uid  \
ts                                                        
2025-02-21 19:55:46.358164072-08:00  Covuo43LyFTNEu1Cp7   
2025-02-21 19:55:46.358163118-08:00  CGgOtJ35xc6GxibRug   
2025-02-21 19:56:08.851763010-08:00  CX6Ki51nhyiL2A6494   
2025-02-21 19:57:40.816343069-08:00   CTSvKvuTLQBocHrOj   
2025-02-21 19:57:40.816483021-08:00  CtmXoQ1bMnUBBi68Pa   
...                                                 ...   
2025-03-06 00:57:15.893676996-08:00  Cl9V7o3d5gUzwxmAmg   
2025-03-06 00:45:46.176140070-08:00  CHc7xm1HIc1M2STFq7   
2025-03-06 00:45:26.384061098-08:00  C9j7yS2Mqpn8AqAIVc   
2025-03-06 00:45:18.768332958-08:00  ChOLqy4F73Wl6qIVsj   
2025-03-06 00:45:15.891634941-08:00   CwIQmx09UCOubp9Gb   

                                                                     src_ip  \
ts                                                                            
2025-02-21 19:55:46.358164072-08:00                                10.0.0.