In [3]:
import pandas as pd
import os

# --- MILESTONE 2: FEATURE ENGINEERING ---

# --- 1. Load the Merged Data ---
# Input CSVs are read from 'data/raw' and the engineered output is written to
# 'data/processed'. Both paths are relative to the current working directory.
usage_path = 'data/raw/azure_usage.csv'
# NOTE(review): the doubled '.csv.csv' extension looks like a typo -- confirm
# against the actual file name on disk before changing it.
factors_path = 'data/raw/external_factors.csv.csv'
processed_folder = 'data/processed'

# exist_ok=True makes directory creation idempotent and removes the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(processed_folder, exist_ok=True)

try:
    # Only the two reads can raise FileNotFoundError, so only they are guarded.
    usage_df = pd.read_csv(usage_path)
    factors_df = pd.read_csv(factors_path)
except FileNotFoundError as exc:
    print("ERROR: Could not find data files in 'data/raw'. Please check the file paths.")
    # Two files are read above; the exception text says which one is missing.
    print(f"Details: {exc}")
    # An empty frame signals downstream steps to skip feature engineering.
    df = pd.DataFrame()
else:
    # Inner join on 'date' (pd.merge default), then parse the key as datetimes.
    df = pd.merge(usage_df, factors_df, on='date')
    df['date'] = pd.to_datetime(df['date'])
    print("Successfully loaded and merged data.")

if not df.empty:
    # --- 2. Calendar features derived from the 'date' column ---
    date_parts = df['date'].dt
    df['month'] = date_parts.month
    df['day_of_week'] = date_parts.dayofweek  # Monday=0, Sunday=6
    df['day_of_month'] = date_parts.day
    df['week_of_year'] = date_parts.isocalendar().week

    # --- 3 & 4. Lag and rolling features per (region, resource_type) group ---
    # Rows are ordered by date inside each group so that "previous row" and
    # "trailing window" follow chronological order.
    series_keys = ['region', 'resource_type']
    df.sort_values(by=series_keys + ['date'], inplace=True)
    cpu_by_series = df.groupby(series_keys)['usage_cpu']

    # usage_cpu of the preceding row in the same group; NaN for the first row.
    previous_cpu = cpu_by_series.shift(1)
    # Mean over a window of up to 7 rows ending at (and including) the current
    # row; min_periods=1 yields a value even before 7 rows are available.
    trailing_mean = cpu_by_series.transform(
        lambda cpu: cpu.rolling(window=7, min_periods=1).mean()
    )
    df['cpu_lag_1'] = previous_cpu
    df['cpu_rolling_mean_7'] = trailing_mean

    # --- 5. Preview the engineered dataset ---
    # dropna() with no arguments removes every row that has a NaN in any
    # column, which includes the first row of each group (no lag value).
    df_features = df.dropna()

    print("\nDataset with new features (first 10 rows):")
    preview = df_features.head(10)
    print(preview)

    # --- 6. Persist the engineered dataset as CSV (index not written) ---
    featured_data_path = os.path.join(processed_folder, 'featured_dataset.csv')
    df_features.to_csv(featured_data_path, index=False)
    print(f"\nSuccessfully created new features and saved to '{featured_data_path}'")

ERROR: Could not find data files in 'data/raw'. Please check the file paths.
