In [1]:
# Data Preprocessing & Cleaning for AWS CloudWatch Time Series (NAB Dataset)

import sys
import os

sys.path.append(os.path.abspath("../src"))

import pandas as pd
import numpy as np
from data_ingestion.loader import load_aws_cloudwatch_data

# Load data
data_dir = "../data/raw/aws_cloudwatch_data"
data = load_aws_cloudwatch_data(data_dir)

# ----- 1. Handle Missing Values -----
for name, df in data.items():
    print(f"\n{name} missing values before:")
    print(df.isnull().sum())
    # Example: Forward fill, then backward fill for any remaining
    df = df.sort_values('timestamp')
    df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
    # Optionally drop rows still missing values
    df = df.dropna(subset=['value'])
    data[name] = df
    print(f"{name} missing values after:\n{df.isnull().sum()}")

# ----- 2. Ensure Consistent Timestamps -----
for name, df in data.items():
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # Remove duplicates
    df = df.drop_duplicates(subset=['timestamp'])
    df = df.sort_values('timestamp')
    # Optional: Resample to uniform interval (e.g., 5 min) if needed
    # df = df.set_index('timestamp').resample('5T').mean().reset_index()
    data[name] = df

# ----- 3. Data Type Consistency -----
for name, df in data.items():
    df['value'] = pd.to_numeric(df['value'], errors='coerce')
    data[name] = df

# ----- 4. Outlier Handling (Optional) -----
# Example: clip values to 1st and 99th percentile
for name, df in data.items():
    lower = df['value'].quantile(0.01)
    upper = df['value'].quantile(0.99)
    df['value'] = df['value'].clip(lower, upper)
    data[name] = df

# ----- 5. Feature Engineering (Optional) -----
for name, df in data.items():
    df['hour'] = df['timestamp'].dt.hour
    df['dayofweek'] = df['timestamp'].dt.dayofweek
    data[name] = df

# ----- 6. Save Cleaned Data -----
output_dir = "../data/processed/aws_cloudwatch_data"
os.makedirs(output_dir, exist_ok=True)
for name, df in data.items():
    outfile = os.path.join(output_dir, f"{name}_cleaned.csv")
    df.to_csv(outfile, index=False)
    print(f"Saved cleaned data to {outfile}")


ec2_cpu_utilization_77c1ca missing values before:
timestamp    0
value        0
dtype: int64
ec2_cpu_utilization_77c1ca missing values after:
timestamp    0
value        0
dtype: int64

rds_cpu_utilization_e47b3b missing values before:
timestamp    0
value        0
dtype: int64
rds_cpu_utilization_e47b3b missing values after:
timestamp    0
value        0
dtype: int64

elb_request_count_8c0756 missing values before:
timestamp    0
value        0
dtype: int64
elb_request_count_8c0756 missing values after:
timestamp    0
value        0
dtype: int64

ec2_cpu_utilization_c6585a missing values before:
timestamp    0
value        0
dtype: int64
ec2_cpu_utilization_c6585a missing values after:
timestamp    0
value        0
dtype: int64

grok_asg_anomaly missing values before:
timestamp    0
value        0
dtype: int64
grok_asg_anomaly missing values after:
timestamp    0
value        0
dtype: int64

ec2_network_in_5abac7 missing values before:
timestamp    0
value        0
dtype: int64
ec2_n

  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(method='ffill').fillna(method='bfill')
  df['value'] = df['value'].fillna(met

Saved cleaned data to ../data/processed/aws_cloudwatch_data/elb_request_count_8c0756_cleaned.csv
Saved cleaned data to ../data/processed/aws_cloudwatch_data/ec2_cpu_utilization_c6585a_cleaned.csv
Saved cleaned data to ../data/processed/aws_cloudwatch_data/grok_asg_anomaly_cleaned.csv
Saved cleaned data to ../data/processed/aws_cloudwatch_data/ec2_network_in_5abac7_cleaned.csv
Saved cleaned data to ../data/processed/aws_cloudwatch_data/ec2_disk_write_bytes_1ef3de_cleaned.csv
Saved cleaned data to ../data/processed/aws_cloudwatch_data/rds_cpu_utilization_cc0c53_cleaned.csv
Saved cleaned data to ../data/processed/aws_cloudwatch_data/ec2_cpu_utilization_24ae8d_cleaned.csv
Saved cleaned data to ../data/processed/aws_cloudwatch_data/ec2_disk_write_bytes_c0d644_cleaned.csv
Saved cleaned data to ../data/processed/aws_cloudwatch_data/iio_us-east-1_i-a2eb1cd9_NetworkIn_cleaned.csv
Saved cleaned data to ../data/processed/aws_cloudwatch_data/ec2_network_in_257a54_cleaned.csv
Saved cleaned data to 