# Data Processing 

## Load Dataset from S3 Bucket and Preprocess

In [22]:
import boto3
import pandas as pd
from io import StringIO

# Load every raw CSV listed below from S3 into `dfs` (keyed by file name),
# then clean the "Label" column: fill missing labels with "BENIGN" and
# collapse the web-attack variants into a single "WEB ATTACK" label.

s3_bucket_name = "network-anomaly-dataset-001aefd6"  # Replace with your actual bucket name
file_names = [
    "Monday-WorkingHours.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
    "Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
    "Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
    "Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv"
]

s3 = boto3.client("s3")
dfs = {}

for file_name in file_names:
    print(f"Loading {file_name} from S3...")

    obj = s3.get_object(Bucket=s3_bucket_name, Key=file_name)

    # Read the streaming body exactly once. A second read() on the already
    # consumed stream returns no data, so decoding must work from the bytes
    # kept here rather than from a fresh read() in the fallback branch.
    raw_bytes = obj['Body'].read()

    # Prefer UTF-8; fall back to ISO-8859-1, which accepts any byte sequence.
    try:
        csv_text = raw_bytes.decode('utf-8')
    except UnicodeDecodeError:
        csv_text = raw_bytes.decode('ISO-8859-1')

    df = pd.read_csv(StringIO(csv_text), sep=',', engine='python')

    # Strip spaces from column names
    df.columns = df.columns.str.strip()

    # A missing "Label" column is only reported; the frame is still stored.
    if "Label" not in df.columns:
        print(f"⚠ WARNING: {file_name} is missing the 'Label' column!")

    # Store cleaned dataframe
    dfs[file_name] = df
    print(f"{file_name} loaded successfully!")

# Fill missing labels with "BENIGN"
for name, df in dfs.items():
    if "Label" in df.columns:
        df.loc[:, "Label"] = df["Label"].fillna("BENIGN")

# Collapse mis-encoded web-attack labels into one class.
# NOTE(review): the keys below contain the replacement character exactly as it
# appears in this notebook. Text decoded through the ISO-8859-1 fallback cannot
# contain that character, so such labels would presumably not match - confirm
# against the actual files.
label_cleaning = {
    "Web Attack � Brute Force": "WEB ATTACK",
    "Web Attack � XSS": "WEB ATTACK"
}

for name, df in dfs.items():
    if "Label" in df.columns:
        df.loc[:, "Label"] = df["Label"].replace(label_cleaning)

# Print unique labels after cleaning
for name, df in dfs.items():
    print(f"\n{name} - Unique Labels After Cleaning:")
    print(df["Label"].unique() if "Label" in df.columns else "WARNING: Label column missing!")


Loading Monday-WorkingHours.pcap_ISCX.csv from S3...
Monday-WorkingHours.pcap_ISCX.csv loaded successfully!
Loading Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv from S3...
Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv loaded successfully!
Loading Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv from S3...
Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv loaded successfully!
Loading Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv from S3...
Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv loaded successfully!
Loading Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv from S3...
Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv loaded successfully!

Monday-WorkingHours.pcap_ISCX.csv - Unique Labels After Cleaning:
['BENIGN']

Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv - Unique Labels After Cleaning:
['BENIGN' 'DDoS']

Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv - Unique Labels After Cleaning:
['BENIGN' 'PortScan']

Thursday-Wo

## Label Encoding

In [23]:
# Define Label Encoding Mapping (text label -> integer class id)
label_mapping = {
    "BENIGN": 0,
    "DDoS": 1,
    "PortScan": 2,
    "WEB ATTACK": 3,
    "Infiltration": 4
}

# Values an already-encoded "Label" column would contain.
encoded_values = set(label_mapping.values())

# Apply Encoding
for name, df in dfs.items():
    if "Label" not in df.columns:
        continue

    labels = df["Label"]

    # Skip frames that are already encoded: mapping the integer codes a second
    # time finds no matching keys and would turn every label into NaN, which
    # makes re-running this cell destructive.
    if len(labels) > 0 and labels.isin(encoded_values).all():
        continue

    # Series.map yields NaN for values absent from the mapping, so report them
    # instead of letting them disappear silently.
    unmapped = labels[~labels.isin(list(label_mapping))].dropna().unique()
    if len(unmapped) > 0:
        print(f"⚠ WARNING: {name} has labels not in label_mapping (encoded as NaN): {list(unmapped)}")

    df["Label"] = labels.map(label_mapping)

# Verify Encoding (frames without a "Label" column are reported, not indexed)
for name, df in dfs.items():
    if "Label" in df.columns:
        shown = df["Label"].unique()
    else:
        shown = "WARNING: Label column missing!"
    print(f"\n{name} - Encoded Label Values: {shown}")



Monday-WorkingHours.pcap_ISCX.csv - Encoded Label Values: [0]

Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv - Encoded Label Values: [0 1]

Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv - Encoded Label Values: [0 2]

Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv - Encoded Label Values: [0 3]

Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv - Encoded Label Values: [0 4]


In [24]:
# Sanity check: show the distinct encoded labels per file and count NaNs,
# which would indicate labels that had no entry in the encoding mapping.
for name, df in dfs.items():
    # Frames loaded without a "Label" column are reported instead of raising
    # a KeyError on the lookups below.
    if "Label" not in df.columns:
        print(f"\n{name} - WARNING: Label column missing!")
        continue

    print(f"\n{name} - Unique Encoded Label Values: {df['Label'].unique()}")
    print(f"Missing values in Label column: {df['Label'].isna().sum()}")



Monday-WorkingHours.pcap_ISCX.csv - Unique Encoded Label Values: [0]
Missing values in Label column: 0

Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv - Unique Encoded Label Values: [0 1]
Missing values in Label column: 0

Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv - Unique Encoded Label Values: [0 2]
Missing values in Label column: 0

Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv - Unique Encoded Label Values: [0 3]
Missing values in Label column: 0

Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv - Unique Encoded Label Values: [0 4]
Missing values in Label column: 0


## Save Processed Data for Feature Engineering 

In [25]:
# Persist each processed frame as a local CSV, then push that file to the
# "processed/" prefix of the same S3 bucket.
for source_name, frame in dfs.items():
    local_csv = f"processed_{source_name}"
    s3_key = f"processed/{local_csv}"

    # Write without the index so the CSV holds only the data columns.
    frame.to_csv(local_csv, index=False)

    s3.upload_file(local_csv, s3_bucket_name, s3_key)
    print(f"✅ Uploaded: {local_csv} to S3.")


✅ Uploaded: processed_Monday-WorkingHours.pcap_ISCX.csv to S3.
✅ Uploaded: processed_Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv to S3.
✅ Uploaded: processed_Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv to S3.
✅ Uploaded: processed_Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv to S3.
✅ Uploaded: processed_Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv to S3.
