In [37]:
import pandas as pd
import os
import dotenv
import ipaddress

# Read a .env file (if one is found) into the process environment so that
# later os.getenv lookups such as DATA_DIR can be configured without editing
# the notebook.
dotenv.load_dotenv()


True

In [4]:
# Root folder of the dataset; taken from the DATA_DIR environment variable,
# falling back to a ./data folder relative to the working directory.
data_dir = os.environ.get("DATA_DIR", './data')

# Data Loading

In [55]:
# The three RFC 1918 private IPv4 ranges, in the order they are checked.
private_networks = [
    ipaddress.ip_network(cidr)
    for cidr in ('192.168.0.0/16', '10.0.0.0/8', '172.16.0.0/12')
]

def is_private_ip(ip):
    """Tell whether an address lies inside one of ``private_networks``.

    Parameters
    ----------
    ip : str or int
        Anything accepted by ``ipaddress.ip_address``.

    Returns
    -------
    bool
        True when the address belongs to at least one listed network,
        False otherwise. Loopback, link-local and other special ranges are
        not in the list, so they yield False.

    Raises
    ------
    ValueError
        If ``ip`` cannot be parsed as an IPv4 or IPv6 address.
    """
    address = ipaddress.ip_address(ip)
    for network in private_networks:
        if address in network:
            return True
    return False

def load_label_flow_df(filename):
    """Load one labelled-flow CSV and flag private source/destination IPs.

    The file is read from
    ``<data_dir>/GeneratedLabelledFlows/TrafficLabelling/<filename>``, where
    ``data_dir`` is the module-level setting defined earlier in the notebook.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside the ``TrafficLabelling`` folder.

    Returns
    -------
    pandas.DataFrame
        The CSV contents with whitespace stripped from the column names and
        two added columns, ``'Source IP is Private'`` and
        ``'Destination IP is Private'``, holding the result of
        ``is_private_ip`` for the corresponding address column.

    Raises
    ------
    ValueError
        Propagated from ``is_private_ip`` when an address column holds a
        value that is not a valid IP address.
    """
    label_flow_path = os.path.join(data_dir, 'GeneratedLabelledFlows', 'TrafficLabelling', filename)
    label_flow_df = pd.read_csv(label_flow_path)
    # Strip stray whitespace from the header names so the plain column names
    # used below can be looked up.
    label_flow_df.columns = label_flow_df.columns.str.strip()

    for ip_column in ('Source IP', 'Destination IP'):
        addresses = label_flow_df[ip_column]
        # The same address repeats across many rows, so classify each
        # distinct value once and broadcast the answer back, instead of
        # re-parsing the address string on every row.
        privacy_by_address = {
            address: is_private_ip(address) for address in addresses.unique()
        }
        label_flow_df[ip_column + ' is Private'] = addresses.map(privacy_by_address)

    return label_flow_df

In [56]:
# Load the Monday capture into the frame used by every exploration cell below.
label_flow_df = load_label_flow_df('Monday-WorkingHours.pcap_ISCX.csv')

# Data Exploration

In [None]:
# Number of rows per distinct source address, most frequent first.
label_flow_df['Source IP'].value_counts()

Source IP
192.168.10.3     67774
192.168.10.25    58199
192.168.10.5     48656
192.168.10.12    36517
192.168.10.9     34341
                 ...  
192.243.232.9        1
65.52.108.11         1
52.84.143.21         1
91.189.88.149        1
149.56.76.152        1
Name: count, Length: 8239, dtype: int64

In [58]:
# Number of rows per distinct destination address, most frequent first.
label_flow_df['Destination IP'].value_counts()

Destination IP
192.168.10.3       152194
192.168.10.1        67016
192.168.10.25       38834
192.168.10.5        14034
192.168.10.12        8772
                    ...  
209.15.238.230          1
5.79.102.231            1
23.208.157.252          1
17.253.20.125           1
151.101.192.133         1
Name: count, Length: 9698, dtype: int64

In [59]:
# Rows whose source address falls inside private_networks (True) versus outside (False).
label_flow_df['Source IP is Private'].value_counts()

Source IP is Private
True     426184
False    103734
Name: count, dtype: int64

In [60]:
# Rows whose destination address falls inside private_networks (True) versus outside (False).
label_flow_df['Destination IP is Private'].value_counts()

Destination IP is Private
True     331766
False    198152
Name: count, dtype: int64