# Notebook 02: ETL & Feature Engineering

**Goal**: Prepare the raw metrics data for Machine Learning models.

**Steps**:
1. Load raw data.
2. Clean missing values/duplicates.
3. Generate features (Lags, Rolling windows, Date parts).
4. Save processed dataset.

In [4]:
import sys
import os
from pathlib import Path

# Put the parent of the current working directory on the import path so the
# `src` package can be imported below.
# NOTE(review): assumes the kernel starts in a folder directly under the
# project root -- confirm before running from elsewhere.
project_root = Path.cwd().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    # Appended only once, so re-running this cell does not add duplicates.
    sys.path.append(root_entry)

from src.etl_pipeline import ETLPipeline
import pandas as pd

## 1. Run Pipeline

In [5]:
# Input file and output directory, both resolved relative to the project root.
raw_data = project_root.joinpath("data", "scratch", "server_metrics.csv.gz")
output_dir = project_root.joinpath("data", "processed")

# The pipeline is handed plain string paths rather than Path objects.
pipeline = ETLPipeline(raw_data_path=str(raw_data), output_path=str(output_dir))

# Run the pipeline and keep its return value for inspection in the next section.
df_processed = pipeline.run()

Sample raw timestamps: ['2022-01-01', '2022-01-02', '2022-01-03', '2022-01-04', '2022-01-05']
Sample parsed timestamps: [Timestamp('2022-01-01 00:00:00'), Timestamp('2022-01-02 00:00:00'), Timestamp('2022-01-03 00:00:00'), Timestamp('2022-01-04 00:00:00'), Timestamp('2022-01-05 00:00:00')]
Timestamp after cleaning: [Timestamp('2022-01-01 00:00:00'), Timestamp('2022-01-02 00:00:00'), Timestamp('2022-01-03 00:00:00'), Timestamp('2022-01-04 00:00:00'), Timestamp('2022-01-05 00:00:00')]
Shape before dropna: (175320, 33)
Shape after dropna: (171720, 33)


## 2. Inspect Output

In [3]:
# Summarise the pipeline result: dimensions, column names, then the first rows.
# NOTE(review): the saved execution count of this cell is lower than the
# pipeline cell's -- re-run with Restart & Run All so the output is not stale.
# NOTE(review): the saved preview shows app_name / business_unit / criticality
# as 0.0, and the column list contains both 'dayofweek' and 'day_of_week' --
# confirm both are intended by the pipeline.
frame_shape = df_processed.shape
column_names = df_processed.columns.tolist()
print("Processed Data Shape:", frame_shape)
print("Columns:", column_names)
df_processed.head()

Processed Data Shape: (171720, 33)
Columns: ['timestamp', 'server_id', 'cpu_p95', 'mem_p95', 'disk_p95', 'net_in_p95', 'net_out_p95', 'app_name', 'business_unit', 'criticality', 'region', 'server_type', 'year', 'month', 'quarter', 'dayofweek', 'is_weekend', 'is_eoq', 'is_holiday', 'day_of_week', 'day_of_year', 'cpu_p95_lag_1', 'cpu_p95_lag_7', 'cpu_p95_lag_30', 'cpu_p95_roll_mean_7', 'cpu_p95_roll_std_7', 'cpu_p95_roll_mean_30', 'mem_p95_lag_1', 'mem_p95_lag_7', 'mem_p95_lag_30', 'mem_p95_roll_mean_7', 'mem_p95_roll_std_7', 'mem_p95_roll_mean_30']


Unnamed: 0,timestamp,server_id,cpu_p95,mem_p95,disk_p95,net_in_p95,net_out_p95,app_name,business_unit,criticality,...,cpu_p95_lag_30,cpu_p95_roll_mean_7,cpu_p95_roll_std_7,cpu_p95_roll_mean_30,mem_p95_lag_1,mem_p95_lag_7,mem_p95_lag_30,mem_p95_roll_mean_7,mem_p95_roll_std_7,mem_p95_roll_mean_30
0,2022-01-31,server_000,52.9,31.76,18.08,167.31,100.39,0.0,0.0,0.0,...,0.0,39.642857,18.634284,36.945667,20.12,54.59,3.88,33.768571,13.473705,29.722
1,2022-02-01,server_000,33.92,20.06,17.2,167.98,100.79,0.0,0.0,0.0,...,17.29,37.374286,15.1337,38.709,31.76,33.3,12.45,30.507143,9.876642,30.651333
2,2022-02-02,server_000,69.53,33.81,23.8,270.92,162.55,0.0,0.0,0.0,...,51.75,34.697143,13.554105,39.263333,20.06,33.41,38.67,28.615714,10.500703,30.905
3,2022-02-03,server_000,36.91,30.06,17.99,132.85,79.71,0.0,0.0,0.0,...,41.23,37.071429,18.002045,39.856,33.81,47.4,32.62,28.672857,10.532182,30.743
4,2022-02-04,server_000,58.12,31.76,22.76,236.7,142.02,0.0,0.0,0.0,...,41.16,37.551429,17.93718,39.712,30.06,30.25,23.4,26.195714,6.755563,30.657667
