In [1]:
import pandas as pd
import numpy as np
import ipaddress

In [2]:
# Read the CSV file into the working DataFrame used by every later cell.
df = pd.read_csv(filepath_or_buffer="CloudWatch_Traffic_Web_Attack.csv")

In [3]:
df.head()

Unnamed: 0,bytes_in,bytes_out,creation_time,end_time,src_ip,src_ip_country_code,protocol,response.code,dst_port,dst_ip,rule_names,observation_name,source.meta,source.name,time,detection_types
0,5602,12990,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,147.161.161.82,AE,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
1,30912,18186,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.33.6,US,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
2,28506,13468,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.212.255,CA,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
3,30546,14278,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,136.226.64.114,US,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule
4,6526,13892,2024-04-25T23:00:00Z,2024-04-25T23:10:00Z,165.225.240.79,NL,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25T23:00:00Z,waf_rule


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282 entries, 0 to 281
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   bytes_in             282 non-null    int64 
 1   bytes_out            282 non-null    int64 
 2   creation_time        282 non-null    object
 3   end_time             282 non-null    object
 4   src_ip               282 non-null    object
 5   src_ip_country_code  282 non-null    object
 6   protocol             282 non-null    object
 7   response.code        282 non-null    int64 
 8   dst_port             282 non-null    int64 
 9   dst_ip               282 non-null    object
 10  rule_names           282 non-null    object
 11  observation_name     282 non-null    object
 12  source.meta          282 non-null    object
 13  source.name          282 non-null    object
 14  time                 282 non-null    object
 15  detection_types      282 non-null    object
dtypes: int64

## Basic cleaning

In [11]:
# Remove exact duplicate rows in place, then parse the three timestamp
# columns into timezone-aware (UTC) datetimes.
df.drop_duplicates(inplace=True)
df['creation_time'] = df['creation_time'].pipe(pd.to_datetime, utc=True)
df['end_time'] = df['end_time'].pipe(pd.to_datetime, utc=True)
df['time'] = df['time'].pipe(pd.to_datetime, utc=True)

In [12]:
# Let pandas infer the best-fitting nullable extension dtype for each column.
df = df.convert_dtypes()

## Validate IPs

In [15]:
def is_valid_ip(ip):
    """Return True if ``ip`` parses as an IPv4 or IPv6 address, else False.

    Parsing is delegated to ``ipaddress.ip_address``. Only the parse failures
    it signals (``ValueError``, plus ``TypeError`` for unsupported inputs) are
    treated as "invalid"; anything else, such as ``KeyboardInterrupt``, is
    allowed to propagate instead of being silently reported as a bad address.
    """
    try:
        ipaddress.ip_address(ip)
    except (ValueError, TypeError):
        return False
    return True

In [17]:
# Keep only the rows whose source AND destination addresses both pass
# is_valid_ip; every other row is discarded.
df = df.loc[
    df['src_ip'].apply(is_valid_ip)
    & df['dst_ip'].apply(is_valid_ip)
]

In [19]:
# Count missing values per column (taken BEFORE any rows are dropped), then
# drop rows where a critical identifier or timestamp is missing.
df_nulls = df.isnull().sum()
# Reassign rather than mutate in place so the cell follows the same
# `df = ...` pattern as the surrounding cleaning steps.
df = df.dropna(subset=['src_ip', 'dst_ip', 'creation_time', 'end_time'])
# Last expression of the cell: actually show the per-column null counts.
df_nulls

In [20]:
# Impute missing byte counts with the column median.
for col in ['bytes_in', 'bytes_out']:
    if df[col].isna().any():
        # Assign the filled Series back to the frame. Calling
        # fillna(..., inplace=True) on df[col] is a chained in-place update:
        # it emits a FutureWarning on pandas 2.x and leaves df unchanged
        # under Copy-on-Write.
        # NOTE(review): if the column is nullable Int64 and the median is
        # fractional, the fill may fail to cast — confirm whether rounding
        # is wanted.
        df[col] = df[col].fillna(df[col].median())

In [23]:
# Normalise the categorical text columns the same way: trim surrounding
# whitespace, then upper-case, so variants such as ' us' and 'US' end up
# as a single value.
if 'src_ip_country_code' in df.columns:
    df['src_ip_country_code'] = df['src_ip_country_code'].str.strip().str.upper()
if 'protocol' in df.columns:
    df['protocol'] = df['protocol'].str.strip().str.upper()

## Feature Engineering

In [24]:
# Session length in minutes: time elapsed between creation_time and end_time.
df['session_duration_mins'] = (
    df['end_time'].sub(df['creation_time']).dt.total_seconds().div(60)
)

In [25]:
# log1p-transform the byte counters (log(1 + x), so zero maps to zero).
df['log_bytes_in'] = df['bytes_in'].pipe(np.log1p)
df['log_bytes_out'] = df['bytes_out'].pipe(np.log1p)

In [26]:
# Port classification
def categorize_port(port):
    """Label a port number with a coarse range category.

    Returns 'standard' for 80 and 443, 'well_known' for any other port below
    1024, 'registered' for ports below 49152, and 'dynamic' otherwise.
    """
    if port in (80, 443):
        return 'standard'
    if port < 1024:
        return 'well_known'
    return 'registered' if port < 49152 else 'dynamic'
# Add the port-range label for each row, but only when the frame actually
# has a destination-port column.
if 'dst_port' in df:
    df['port_type'] = df['dst_port'].apply(categorize_port)

In [29]:
# Flag rows whose destination port is 80 or 443 as 1, everything else as 0.
# The vectorised isin() replaces a per-row Python lambda; a missing port now
# yields 0 instead of raising on the ambiguous truth value of pd.NA.
df['is_standard_port'] = df['dst_port'].isin([80, 443]).astype('int64')

In [27]:
df.head()

Unnamed: 0,bytes_in,bytes_out,creation_time,end_time,src_ip,src_ip_country_code,protocol,response.code,dst_port,dst_ip,rule_names,observation_name,source.meta,source.name,time,detection_types,session_duration_mins,log_bytes_in,log_bytes_out,port_type
0,5602,12990,2024-04-25 23:00:00+00:00,2024-04-25 23:10:00+00:00,147.161.161.82,AE,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25 23:00:00+00:00,waf_rule,10.0,8.631057,9.472012,standard
1,30912,18186,2024-04-25 23:00:00+00:00,2024-04-25 23:10:00+00:00,165.225.33.6,US,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25 23:00:00+00:00,waf_rule,10.0,10.338932,9.808462,standard
2,28506,13468,2024-04-25 23:00:00+00:00,2024-04-25 23:10:00+00:00,165.225.212.255,CA,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25 23:00:00+00:00,waf_rule,10.0,10.257905,9.508146,standard
3,30546,14278,2024-04-25 23:00:00+00:00,2024-04-25 23:10:00+00:00,136.226.64.114,US,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25 23:00:00+00:00,waf_rule,10.0,10.327022,9.566545,standard
4,6526,13892,2024-04-25 23:00:00+00:00,2024-04-25 23:10:00+00:00,165.225.240.79,NL,HTTPS,200,443,10.138.69.97,Suspicious Web Traffic,Adversary Infrastructure Interaction,AWS_VPC_Flow,prod_webserver,2024-04-25 23:00:00+00:00,waf_rule,10.0,8.783703,9.53914,standard


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 282 entries, 0 to 281
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype              
---  ------                 --------------  -----              
 0   bytes_in               282 non-null    Int64              
 1   bytes_out              282 non-null    Int64              
 2   creation_time          282 non-null    datetime64[ns, UTC]
 3   end_time               282 non-null    datetime64[ns, UTC]
 4   src_ip                 282 non-null    string             
 5   src_ip_country_code    282 non-null    string             
 6   protocol               282 non-null    string             
 7   response.code          282 non-null    Int64              
 8   dst_port               282 non-null    Int64              
 9   dst_ip                 282 non-null    string             
 10  rule_names             282 non-null    string             
 11  observation_name       282 non-null    string             