In [2]:
import pandas as pd

# --- Sample dataset: one tuple per captured packet (9 packets, 3 sessions) ---
_COLUMNS = ('session_id', 'packet_size', 'dest_ip', 'protocol')
_PACKETS = [
    (1, 500, '192.168.0.10', 'TCP'),
    (1, 750, '192.168.0.10', 'TCP'),
    (1, 600, '8.8.8.8', 'UDP'),
    (2, 1500, '10.0.0.5', 'TCP'),
    (2, 400, '10.0.0.5', 'UDP'),
    (3, 300, '192.168.0.12', 'UDP'),
    (3, 350, '192.168.0.13', 'UDP'),
    (3, 320, '192.168.0.13', 'UDP'),
    (3, 900, '8.8.4.4', 'TCP'),
]

# Transpose the rows into the column-oriented dict of equal-length lists
# that the DataFrame is built from.
data = {
    column: list(values)
    for column, values in zip(_COLUMNS, zip(*_PACKETS))
}

df = pd.DataFrame(data)

# Every per-session feature below aggregates over the same grouping.
by_session = df.groupby('session_id')

# --- Feature: mean packet size within each session ---
mean_size = by_session['packet_size'].mean()
df_avg_packet = mean_size.rename('avg_packet_size').reset_index()

# --- Feature: number of distinct destination IPs within each session ---
distinct_ips = by_session['dest_ip'].nunique()
df_unique_ips = distinct_ips.rename('unique_dest_ips').reset_index()

# --- Feature: packets per protocol within each session ---
# Pivot the (session, protocol) tallies so each protocol becomes a column;
# a protocol a session never used is reported as 0 rather than NaN.
protocol_tally = df.groupby(['session_id', 'protocol']).size()
df_protocol_counts = protocol_tally.unstack(fill_value=0).reset_index()

# --- Combine the three feature tables into one row per session ---
session_summary = pd.merge(df_avg_packet, df_unique_ips, on='session_id')
features = pd.merge(session_summary, df_protocol_counts, on='session_id')

print(features)

   session_id  avg_packet_size  unique_dest_ips  TCP  UDP
0           1       616.666667                2    2    1
1           2       950.000000                1    1    1
2           3       467.500000                3    1    3
