In [2]:
import pandas as pd
import numpy as np


In [3]:
# Load cleaned & resampled data from Module 2
hourly_df = pd.read_csv(
    "hourly_cleaned_energy.csv",
    index_col=0,
    parse_dates=True
)

hourly_df.head()


Unnamed: 0_level_0,Energy_kWh,Energy_Scaled
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 00:00:00,11.48,0.297095
2023-01-01 01:00:00,17.92,0.475242
2023-01-01 02:00:00,26.27,0.706224
2023-01-01 03:00:00,12.16,0.315906
2023-01-01 04:00:00,15.61,0.411342


In [4]:
# Extract time-based features from timestamp
hourly_df['hour'] = hourly_df.index.hour
hourly_df['day'] = hourly_df.index.day
hourly_df['month'] = hourly_df.index.month
hourly_df['day_of_week'] = hourly_df.index.dayofweek  # 0=Monday

hourly_df.head()


Unnamed: 0_level_0,Energy_kWh,Energy_Scaled,hour,day,month,day_of_week
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-01-01 00:00:00,11.48,0.297095,0,1,1,6
2023-01-01 01:00:00,17.92,0.475242,1,1,1,6
2023-01-01 02:00:00,26.27,0.706224,2,1,1,6
2023-01-01 03:00:00,12.16,0.315906,3,1,1,6
2023-01-01 04:00:00,15.61,0.411342,4,1,1,6


In [5]:
# Lag features (previous energy values)
hourly_df['lag_1'] = hourly_df['Energy_Scaled'].shift(1)
hourly_df['lag_2'] = hourly_df['Energy_Scaled'].shift(2)
hourly_df['lag_3'] = hourly_df['Energy_Scaled'].shift(3)

hourly_df.head(10)


Unnamed: 0_level_0,Energy_kWh,Energy_Scaled,hour,day,month,day_of_week,lag_1,lag_2,lag_3
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2023-01-01 00:00:00,11.48,0.297095,0,1,1,6,,,
2023-01-01 01:00:00,17.92,0.475242,1,1,1,6,0.297095,,
2023-01-01 02:00:00,26.27,0.706224,2,1,1,6,0.475242,0.297095,
2023-01-01 03:00:00,12.16,0.315906,3,1,1,6,0.706224,0.475242,0.297095
2023-01-01 04:00:00,15.61,0.411342,4,1,1,6,0.315906,0.706224,0.475242
2023-01-01 05:00:00,10.41,0.267497,5,1,1,6,0.411342,0.315906,0.706224
2023-01-01 06:00:00,5.04,0.118949,6,1,1,6,0.267497,0.411342,0.315906
2023-01-01 07:00:00,23.57,0.631535,7,1,1,6,0.118949,0.267497,0.411342
2023-01-01 08:00:00,18.72,0.497372,8,1,1,6,0.631535,0.118949,0.267497
2023-01-01 09:00:00,19.79,0.526971,9,1,1,6,0.497372,0.631535,0.118949


In [6]:
# Rolling average features
hourly_df['rolling_mean_3'] = hourly_df['Energy_Scaled'].rolling(window=3).mean()
hourly_df['rolling_mean_6'] = hourly_df['Energy_Scaled'].rolling(window=6).mean()

hourly_df.head(10)


Unnamed: 0_level_0,Energy_kWh,Energy_Scaled,hour,day,month,day_of_week,lag_1,lag_2,lag_3,rolling_mean_3,rolling_mean_6
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-01-01 00:00:00,11.48,0.297095,0,1,1,6,,,,,
2023-01-01 01:00:00,17.92,0.475242,1,1,1,6,0.297095,,,,
2023-01-01 02:00:00,26.27,0.706224,2,1,1,6,0.475242,0.297095,,0.492854,
2023-01-01 03:00:00,12.16,0.315906,3,1,1,6,0.706224,0.475242,0.297095,0.499124,
2023-01-01 04:00:00,15.61,0.411342,4,1,1,6,0.315906,0.706224,0.475242,0.477824,
2023-01-01 05:00:00,10.41,0.267497,5,1,1,6,0.411342,0.315906,0.706224,0.331581,0.412218
2023-01-01 06:00:00,5.04,0.118949,6,1,1,6,0.267497,0.411342,0.315906,0.265929,0.382527
2023-01-01 07:00:00,23.57,0.631535,7,1,1,6,0.118949,0.267497,0.411342,0.339327,0.408575
2023-01-01 08:00:00,18.72,0.497372,8,1,1,6,0.631535,0.118949,0.267497,0.415952,0.373767
2023-01-01 09:00:00,19.79,0.526971,9,1,1,6,0.497372,0.631535,0.118949,0.551959,0.408944


In [7]:
# Remove rows with NaN values (created due to lag & rolling features)
hourly_df = hourly_df.dropna()

hourly_df.head()


Unnamed: 0_level_0,Energy_kWh,Energy_Scaled,hour,day,month,day_of_week,lag_1,lag_2,lag_3,rolling_mean_3,rolling_mean_6
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2023-01-01 05:00:00,10.41,0.267497,5,1,1,6,0.411342,0.315906,0.706224,0.331581,0.412218
2023-01-01 06:00:00,5.04,0.118949,6,1,1,6,0.267497,0.411342,0.315906,0.265929,0.382527
2023-01-01 07:00:00,23.57,0.631535,7,1,1,6,0.118949,0.267497,0.411342,0.339327,0.408575
2023-01-01 08:00:00,18.72,0.497372,8,1,1,6,0.631535,0.118949,0.267497,0.415952,0.373767
2023-01-01 09:00:00,19.79,0.526971,9,1,1,6,0.497372,0.631535,0.118949,0.551959,0.408944


In [8]:
# Select feature columns
feature_columns = [
    'hour',
    'day',
    'month',
    'day_of_week',
    'lag_1',
    'lag_2',
    'lag_3',
    'rolling_mean_3',
    'rolling_mean_6'
]

# Features (X) and Target (y)
X = hourly_df[feature_columns]
y = hourly_df['Energy_Scaled']

X.head(), y.head()


(                     hour  day  month  day_of_week     lag_1     lag_2  \
 Timestamp                                                                
 2023-01-01 05:00:00     5    1      1            6  0.411342  0.315906   
 2023-01-01 06:00:00     6    1      1            6  0.267497  0.411342   
 2023-01-01 07:00:00     7    1      1            6  0.118949  0.267497   
 2023-01-01 08:00:00     8    1      1            6  0.631535  0.118949   
 2023-01-01 09:00:00     9    1      1            6  0.497372  0.631535   
 
                         lag_3  rolling_mean_3  rolling_mean_6  
 Timestamp                                                      
 2023-01-01 05:00:00  0.706224        0.331581        0.412218  
 2023-01-01 06:00:00  0.315906        0.265929        0.382527  
 2023-01-01 07:00:00  0.411342        0.339327        0.408575  
 2023-01-01 08:00:00  0.267497        0.415952        0.373767  
 2023-01-01 09:00:00  0.118949        0.551959        0.408944  ,
 Timestamp
 2023-

In [9]:
# Combine features and target
final_df = hourly_df[feature_columns + ['Energy_Scaled']]

# Save for next modules
final_df.to_csv("final_feature_engineered_data.csv")

final_df.head()


Unnamed: 0_level_0,hour,day,month,day_of_week,lag_1,lag_2,lag_3,rolling_mean_3,rolling_mean_6,Energy_Scaled
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2023-01-01 05:00:00,5,1,1,6,0.411342,0.315906,0.706224,0.331581,0.412218,0.267497
2023-01-01 06:00:00,6,1,1,6,0.267497,0.411342,0.315906,0.265929,0.382527,0.118949
2023-01-01 07:00:00,7,1,1,6,0.118949,0.267497,0.411342,0.339327,0.408575,0.631535
2023-01-01 08:00:00,8,1,1,6,0.631535,0.118949,0.267497,0.415952,0.373767,0.497372
2023-01-01 09:00:00,9,1,1,6,0.497372,0.631535,0.118949,0.551959,0.408944,0.526971
