In [1]:
import pandas as pd
import os
import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices; narrow the filter if those matter.
warnings.filterwarnings('ignore')

print("--- Milestone 2: Feature Engineering & Data Wrangling ---")

# --- 1. Define File Paths ---
# The project root is taken to be the parent of the current working directory,
# so the paths resolve correctly when the notebook runs from the 'notebooks' folder.
try:
    current_dir = os.getcwd()
except OSError as e:
    # Only os.getcwd() touches the filesystem here; the joins below are pure
    # string operations. Raise SystemExit directly rather than calling the
    # site-provided exit() helper, which is intended for the interactive shell
    # and is not guaranteed to exist. A non-zero status marks the failure.
    print(f"Error setting up file paths: {e}")
    raise SystemExit(1) from e

base_path = os.path.dirname(current_dir)
# Input produced by the previous milestone and output written by this one
# both live under <project root>/data/processed.
processed_data_path = os.path.join(base_path, 'data', 'processed', 'cleaned_merged.csv')
output_path = os.path.join(base_path, 'data', 'processed', 'featured_dataset.csv')
print(f"Project base path identified as: {base_path}")
print("Searching for data file at:", processed_data_path)

# --- 2. Load the Cleaned Dataset ---
# Read the cleaned CSV into `df`; the following steps all operate on this frame.
try:
    df = pd.read_csv(processed_data_path)
except FileNotFoundError:
    print(f"\nError: The file was not found at {processed_data_path}")
    print("Please ensure 'cleaned_merged.csv' exists in your 'data/processed' folder from Milestone 1.")
    # Stop here with a non-zero status so a missing input is reported as a
    # failure. SystemExit is raised directly instead of calling the
    # site-provided exit() helper, which is meant for the interactive shell.
    raise SystemExit(1) from None
else:
    # Reached only when the read succeeded.
    print(f"\nSuccessfully loaded '{os.path.basename(processed_data_path)}'.")

# --- 3. Engineer Derived Features ---
print("Creating time-based features (day of week, month, year-week)...")
# Parse 'date' first so the datetime accessor can be used for every feature below
df['date'] = pd.to_datetime(df['date'])
dates = df['date']

df['day_of_week'] = dates.dt.weekday  # 0 = Monday ... 6 = Sunday
df['month'] = dates.dt.month

# ISO calendar fields: the ISO year can differ from the calendar year for
# dates at a year boundary, which keeps year-end weeks labelled consistently
iso_cal = dates.dt.isocalendar()
iso_year_text = iso_cal['year'].astype(str)
iso_week_text = iso_cal['week'].astype(str).str.zfill(2)
# Zero-padded label such as "2023-W05", so plain string sorting is chronological
df['year_week'] = iso_year_text + '-W' + iso_week_text
# Plain ISO week number, kept alongside the label for later calculations
df['week_of_year'] = iso_cal['week']

# --- 4. Create Rolling Average (Trend) Feature ---
print("Creating 7-day rolling average for CPU usage...")
group_keys = ['region', 'resource_type']


def _trailing_mean(series):
    """Return the mean over the current row and up to six preceding rows."""
    # min_periods=1 lets the first rows of a group average whatever is available
    return series.rolling(window=7, min_periods=1).mean()


# Order rows by date within each region/resource pair so the window runs
# forward in time.
df = df.sort_values(by=[*group_keys, 'date'])

# The window counts rows, not calendar days.
# NOTE(review): this is a true 7-day average only if each region/resource pair
# has exactly one row per day - confirm against the cleaned data.
cpu_by_group = df.groupby(group_keys)['usage_cpu']
df['cpu_usage_7_day_avg'] = cpu_by_group.transform(_trailing_mean)

# --- 5. Save the Enriched, Model-Ready Dataset ---
# index=False keeps the DataFrame index out of the written CSV
df.to_csv(output_path, index=False)
print(f"\nSuccessfully created new features and saved the file to: {output_path}")

# --- Display the first few rows of the new dataset to verify ---
# Show only the date plus the columns added by this script
preview_columns = ['date', 'day_of_week', 'month', 'year_week', 'cpu_usage_7_day_avg']
print("\n--- Sample of the new 'featured_dataset.csv' with new columns: ---")
preview = df[preview_columns].head()
print(preview)


--- Milestone 2: Feature Engineering & Data Wrangling ---
Project base path identified as: c:\Users\RAVI TEJA\OneDrive\Desktop\Infosys Internship 6.0\azure-demand-forecasting
Searching for data file at: c:\Users\RAVI TEJA\OneDrive\Desktop\Infosys Internship 6.0\azure-demand-forecasting\data\processed\cleaned_merged.csv

Successfully loaded 'cleaned_merged.csv'.
Creating time-based features (day of week, month, year-week)...
Creating 7-day rolling average for CPU usage...

Successfully created new features and saved the file to: c:\Users\RAVI TEJA\OneDrive\Desktop\Infosys Internship 6.0\azure-demand-forecasting\data\processed\featured_dataset.csv

--- Sample of the new 'featured_dataset.csv' with new columns: ---
         date  day_of_week  month year_week  cpu_usage_7_day_avg
2  2023-01-01            6      1  2022-W52                 70.0
14 2023-01-02            0      1  2023-W01                 78.0
26 2023-01-03            1      1  2023-W01                 71.0
38 2023-01-04     