# Data Processing 

## Load Dataset from S3 Bucket 

In [27]:
import boto3
import pandas as pd
from io import StringIO

# Bucket and object keys of the raw CSV files to load.
s3_bucket_name = "network-anomaly-dataset-001aefd6"  # Replace with your actual bucket name
file_names = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv"
]

s3 = boto3.client("s3")
dfs = {}  # file name -> DataFrame with whitespace-stripped column names

for file_name in file_names:
    print(f"Loading {file_name} from S3...")

    obj = s3.get_object(Bucket=s3_bucket_name, Key=file_name)

    # The response body is a one-shot stream: read it exactly once so the
    # fallback decode below operates on the same bytes. Calling read() a
    # second time would return b"" and make read_csv fail on empty input.
    raw_bytes = obj['Body'].read()

    # Try UTF-8 first, then fall back to ISO-8859-1 (which accepts any byte)
    try:
        csv_text = raw_bytes.decode('utf-8')
    except UnicodeDecodeError:
        csv_text = raw_bytes.decode('ISO-8859-1')

    df = pd.read_csv(StringIO(csv_text), sep=',', engine='python')

    # Release the raw copies; only the parsed frame is needed from here on
    del raw_bytes, csv_text

    # Strip spaces from column names
    df.columns = df.columns.str.strip()

    # Store cleaned dataframe
    dfs[file_name] = df
    print(f"{file_name} loaded successfully!")

# Check if "Label" column now contains valid values
for name, df in dfs.items():
    print(f"\n{name} - Unique Labels After Reloading:")
    print(df["Label"].unique() if "Label" in df.columns else "WARNING: Label column missing!")


Loading Monday-WorkingHours.pcap_ISCX.csv from S3...
Monday-WorkingHours.pcap_ISCX.csv loaded successfully!
Loading Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv from S3...
Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv loaded successfully!
Loading Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv from S3...
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv loaded successfully!
Loading Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv from S3...
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv loaded successfully!
Loading Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv from S3...
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv loaded successfully!

Monday-WorkingHours.pcap_ISCX.csv - Unique Labels After Reloading:
['BENIGN' None]

Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv - Unique Labels After Reloading:
['BENIGN' 'DDoS' None]

Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv - Unique Labels After Reloading:
['BENIGN' 'PortScan' 

The cell above loads each CSV file from S3 into a pandas DataFrame and stores the DataFrames in a dictionary keyed by file name for easy access.

## Data Clean-Up and Label Encoding 

In [29]:
def normalize_labels(labels: pd.Series) -> pd.Series:
    """Return a cleaned copy of a raw "Label" column.

    Steps, in order:
      1. Missing labels (None / NaN) are replaced with "BENIGN".
      2. Values are cast to str, stripped of surrounding whitespace and
         upper-cased.
      3. Every label that starts with "WEB ATTACK" is collapsed to the single
         class "WEB ATTACK".

    Step 3 uses a pattern instead of exact strings because the separator
    character inside the raw web-attack labels is mis-encoded; matching on the
    prefix does not depend on how that character was decoded and also covers
    web-attack sub-types that were not listed explicitly.

    NOTE(review): step 1 treats rows without a label as benign traffic.
    Confirm these rows are real flows and not blank padding rows.
    """
    return (
        labels
        .fillna("BENIGN")
        .astype(str)
        .str.strip()
        .str.upper()
        .str.replace(r"^WEB ATTACK\b.*$", "WEB ATTACK", regex=True)
    )


# assign() returns a new DataFrame, so the frames produced by the loading
# step are not mutated in place and this cell can be re-run safely.
for name in dfs:
    dfs[name] = dfs[name].assign(Label=normalize_labels(dfs[name]["Label"]))

# Verify the label normalization worked
for name, df in dfs.items():
    print(f"\n{name} - Unique Labels After Cleaning:")
    print(df["Label"].unique())



Monday-WorkingHours.pcap_ISCX.csv - Unique Labels After Cleaning:
['BENIGN']

Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv - Unique Labels After Cleaning:
['BENIGN' 'DDOS']

Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv - Unique Labels After Cleaning:
['BENIGN' 'PORTSCAN']

Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv - Unique Labels After Cleaning:
['BENIGN' 'WEB ATTACK']

Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv - Unique Labels After Cleaning:
['BENIGN' 'INFILTRATION']


### Encoding Labels into Numbers

In [30]:
# Define label encoding mapping (normalized label text -> integer class id)
label_mapping = {
    "BENIGN": 0,
    "DDOS": 1,
    "PORTSCAN": 2,
    "WEB ATTACK": 3,
    "INFILTRATION": 4
}

# Apply encoding to the "Label" column
for name in dfs:
    labels = dfs[name]["Label"]

    # Already encoded (e.g. this cell was run twice): mapping integers through
    # the string-keyed dict would turn every label into NaN, so skip.
    if pd.api.types.is_integer_dtype(labels):
        continue

    encoded = labels.map(label_mapping)

    # Series.map yields NaN for values that are not keys of the mapping.
    # Fail loudly instead of silently passing NaN labels downstream.
    unmapped = labels[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"{name}: labels missing from label_mapping: {list(unmapped)}"
        )

    dfs[name]["Label"] = encoded.astype("int64")

# Verify encoding
for name, df in dfs.items():
    print(f"\n{name} - Unique Encoded Label Values: {df['Label'].unique()}")



Monday-WorkingHours.pcap_ISCX.csv - Unique Encoded Label Values: [0]

Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv - Unique Encoded Label Values: [0 1]

Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv - Unique Encoded Label Values: [0 2]

Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv - Unique Encoded Label Values: [0 3]

Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv - Unique Encoded Label Values: [0 4]


## Save Processed Data for Feature Engineering 

In [31]:
# Write every processed frame to a local CSV named "processed_<original name>",
# then upload that file to the same bucket under the "processed/" prefix.
for name, df in dfs.items():
    processed_file = f"processed_{name}"
    s3_key = f"processed/{processed_file}"

    # index=False keeps the DataFrame index out of the written CSV
    df.to_csv(processed_file, index=False)

    s3.upload_file(
        Filename=processed_file,
        Bucket=s3_bucket_name,
        Key=s3_key,
    )
    print(f"Uploaded processed file: {processed_file} to S3.")


Uploaded processed file: processed_Monday-WorkingHours.pcap_ISCX.csv to S3.
Uploaded processed file: processed_Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv to S3.
Uploaded processed file: processed_Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv to S3.
Uploaded processed file: processed_Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv to S3.
Uploaded processed file: processed_Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv to S3.
