In [1]:
# 04_feature_engineering.py

import pandas as pd
import numpy as np
import os
from google.colab import drive

# === 1. MOUNT GOOGLE DRIVE ===
# Project folder under the Drive mount point; the paths below are built from it.
root_path = '/content/drive/My Drive/load_type_prediction/'
drive.mount('/content/drive')

# === 2. LOAD TRAINING DATA ===
# First path is the raw training CSV, second is where the transformed copy is saved.
train_path, save_path = (
    os.path.join(root_path, relative)
    for relative in ('data/train/power_train.csv', 'processed_data/train_transformed.csv')
)
df = pd.read_csv(filepath_or_buffer=train_path)

# === 3. FEATURE ENGINEERING FUNCTION ===
def apply_feature_engineering(df: pd.DataFrame, is_train: bool = True):
    """
    Apply feature engineering to the cleaned dataset.

    The caller's dataframe is not modified; a transformed copy is returned.

    Arguments:
        df (pd.DataFrame): Input dataframe (train or test). Must contain
            'Date_Time', 'Leading_Current_Power_Factor', 'NSM' and the four
            columns listed in ``skewed_cols`` below.
        is_train (bool): Whether the dataset is training data (default: True).
            'Load_Type' is integer-encoded only when this is True and the
            column is present; otherwise it is left as-is.
    Returns:
        df (pd.DataFrame): Transformed dataframe with new features
            ('Month', 'Hour', 'Day' and one 'log_<name>' column per skewed
            column, which replaces the original).
    Raises:
        KeyError: If a required column is missing.
        ValueError: If ``is_train`` is True and 'Load_Type' contains a
            non-null value other than 'Light_Load', 'Medium_Load' or
            'Maximum_Load'.
    """

    # --- Step 1: Drop redundant or constant features ---
    # drop() returns a new frame, so later column assignments never touch
    # the dataframe the caller passed in.
    df = df.drop(columns=['Leading_Current_Power_Factor', 'NSM'])

    # --- Step 2: Time Feature Extraction ---
    timestamps = pd.to_datetime(df['Date_Time'])
    df['Month'] = timestamps.dt.month
    df['Hour'] = timestamps.dt.hour
    df['Day'] = timestamps.dt.dayofweek  # 0 = Monday

    # --- Step 3: Drop original timestamp ---
    df = df.drop(columns=['Date_Time'])

    # --- Step 4: Apply log1p transformation to skewed features ---
    skewed_cols = [
        'Usage_kWh',
        'Lagging_Current_Reactive.Power_kVarh',
        'Leading_Current_Reactive_Power_kVarh',
        'CO2(tCO2)'
    ]

    for col in skewed_cols:
        df[f'log_{col}'] = np.log1p(df[col])
    # Remove the raw columns in one non-mutating call instead of repeated
    # inplace drops; the resulting column order is the same.
    df = df.drop(columns=skewed_cols)

    # --- Step 5: Label encoding (only for training set) ---
    if is_train and 'Load_Type' in df.columns:
        label_map = {'Light_Load': 0, 'Medium_Load': 1, 'Maximum_Load': 2}
        labels = df['Load_Type']
        # Series.map turns any label missing from label_map into NaN without
        # warning, which would silently corrupt the training target.
        # Fail loudly instead; values that are already null are left alone.
        unknown = labels[labels.notna() & ~labels.isin(list(label_map))]
        if not unknown.empty:
            raise ValueError(
                "Unexpected Load_Type value(s): "
                f"{sorted(unknown.astype(str).unique())}; "
                f"expected one of {list(label_map)}"
            )
        df['Load_Type'] = labels.map(label_map)

    return df

# === 4. APPLY TRANSFORMATION AND SAVE ===
df_transformed = apply_feature_engineering(df, is_train=True)
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)
df_transformed.to_csv(save_path, index=False)
print(f"✅ Transformed training data saved to: {save_path}")


Mounted at /content/drive
✅ Transformed training data saved to: /content/drive/My Drive/load_type_prediction/processed_data/train_transformed.csv


In [4]:
# === 5. PRINT FIRST 10 ROWS OF TRANSFORMED TRAINING SET ===
# One print call emits the header line followed by the 10-row preview.
print("\n📄 Preview of Transformed Training Data:", df_transformed.head(10), sep="\n")



📄 Preview of Transformed Training Data:
   Lagging_Current_Power_Factor  Load_Type  Month  Hour  Day  log_Usage_kWh  \
0                         70.30          0      1     0    0       1.486140   
1                         73.21          0      1     0    0       2.277646   
2                         66.77          0      1     0    0       1.609438   
3                         70.28          0      1     0    0       1.444563   
4                         68.09          0      1     1    0       1.460938   
5                         88.19          0      1     1    0       1.572774   
6                         67.76          0      1     1    0       1.453953   
7                         65.62          0      1     1    0       1.526056   
8                         64.37          0      1     2    0       1.526056   
9                         66.94          0      1     2    0       1.935949   

   log_Lagging_Current_Reactive.Power_kVarh  \
0                                  1.49514

In [5]:
# 05_test_feature_engineering.py

import pandas as pd
import numpy as np
import os
from google.colab import drive

# === 1. MOUNT GOOGLE DRIVE ===
drive.mount('/content/drive')
root_path = '/content/drive/My Drive/load_type_prediction/'
test_path = os.path.join(root_path, 'data/test/power_test.csv')
save_path = os.path.join(root_path, 'processed_data/test_transformed.csv')

# === 2. LOAD TEST DATA ===
df = pd.read_csv(test_path)
df['Date_Time'] = pd.to_datetime(df['Date_Time'])

# === 3. DROP REDUNDANT FEATURES ===
df = df.drop(columns=['Leading_Current_Power_Factor', 'NSM'])

# === 4. TIME FEATURE EXTRACTION ===
df['Month'] = df['Date_Time'].dt.month
df['Hour'] = df['Date_Time'].dt.hour
df['Day'] = df['Date_Time'].dt.dayofweek  # 0 = Monday

# Drop original datetime
df = df.drop(columns=['Date_Time'])

# === 5. APPLY LOG TRANSFORMATIONS (same as train) ===
skewed_cols = [
    'Usage_kWh',
    'Lagging_Current_Reactive.Power_kVarh',
    'Leading_Current_Reactive_Power_kVarh',
    'CO2(tCO2)'
]

for col in skewed_cols:
    df[f'log_{col}'] = np.log1p(df[col])
# Remove the raw columns in one non-mutating call instead of repeated
# inplace drops; the resulting column order is the same.
df = df.drop(columns=skewed_cols)

# NOTE: no label encoding happens in this cell, so any Load_Type column in
# the test file is written out unchanged.

# === 6. SAVE TRANSFORMED TEST SET ===
# to_csv does not create missing parent folders, so make sure the output
# directory exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(save_path), exist_ok=True)
df.to_csv(save_path, index=False)
print(f"✅ Transformed test set saved to: {save_path}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Transformed test set saved to: /content/drive/My Drive/load_type_prediction/processed_data/test_transformed.csv


In [6]:
# === 7. PRINT FIRST 10 ROWS OF TRANSFORMED TEST SET ===
# One print call emits the header line followed by the 10-row preview.
print("\n📄 Preview of Transformed Test Data:", df.head(10), sep="\n")



📄 Preview of Transformed Test Data:
   Lagging_Current_Power_Factor   Load_Type  Month  Hour  Day  log_Usage_kWh  \
0                         80.37  Light_Load     12     0    5       1.470176   
1                         82.15  Light_Load     12     0    5       1.587192   
2                         82.27  Light_Load     12     0    5       2.075662   
3                         81.46  Light_Load     12     0    5       1.587192   
4                         88.19  Light_Load     12     1    5       1.587192   
5                         88.19  Light_Load     12     1    5       1.838961   
6                         78.99  Light_Load     12     1    5       1.587192   
7                         76.69  Light_Load     12     1    5       1.593309   
8                         74.97  Light_Load     12     2    5       1.593309   
9                         76.73  Light_Load     12     2    5       1.601406   

   log_Lagging_Current_Reactive.Power_kVarh  \
0                                  

In [None]:
# 04_feature_engineering_all.py

import pandas as pd
import numpy as np
from pathlib import Path

# === CONFIGURATION ===
# Raw inputs and processed outputs both live under one project folder.
BASE_DIR = Path("/content/drive/My Drive/load_type_prediction")
RAW_DIR = BASE_DIR.joinpath("data")
PROCESSED_DIR = BASE_DIR.joinpath("processed_data")

# Create the output folder (and any missing parents); harmless if it exists.
PROCESSED_DIR.mkdir(exist_ok=True, parents=True)

# === FEATURE ENGINEERING FUNCTION ===
def apply_feature_engineering(df: pd.DataFrame, is_train: bool = True) -> pd.DataFrame:
    """
    Build model-ready features from a raw power-usage dataframe.

    The caller's dataframe is not modified; a transformed copy is returned.

    Arguments:
        df (pd.DataFrame): Input dataframe (train or test). 'Date_Time' is
            required. 'Leading_Current_Power_Factor', 'NSM' and the columns
            in ``skewed_cols`` are optional and are skipped when absent.
        is_train (bool): Whether the dataset is training data (default: True).
            'Load_Type' is integer-encoded only when this is True and the
            column is present; otherwise it is left as-is.
    Returns:
        pd.DataFrame: Frame with 'Month', 'Hour' and 'Day' added, 'Date_Time'
            removed, and each present skewed column replaced by a
            'log_<name>' column holding log1p of its values. In <name>,
            spaces become '_' and '.' is removed, so
            'Lagging_Current_Reactive.Power_kVarh' becomes
            'log_Lagging_Current_ReactivePower_kVarh'.
    Raises:
        KeyError: If 'Date_Time' is missing.
        ValueError: If ``is_train`` is True and 'Load_Type' contains a
            non-null value other than 'Light_Load', 'Medium_Load' or
            'Maximum_Load'.
    """
    # Drop redundant or constant features.
    # drop() returns a new frame, so later column assignments never touch
    # the dataframe the caller passed in.
    df = df.drop(columns=['Leading_Current_Power_Factor', 'NSM'], errors='ignore')

    # Extract time features
    timestamps = pd.to_datetime(df['Date_Time'])
    df['Month'] = timestamps.dt.month
    df['Hour'] = timestamps.dt.hour
    df['Day'] = timestamps.dt.dayofweek  # 0 = Monday

    # Drop original timestamp
    df = df.drop(columns=['Date_Time'])

    # Apply log1p to skewed features (only those actually present)
    skewed_cols = [
        'Usage_kWh',
        'Lagging_Current_Reactive.Power_kVarh',
        'Leading_Current_Reactive_Power_kVarh',
        'CO2(tCO2)'
    ]
    present_cols = [col for col in skewed_cols if col in df.columns]
    for col in present_cols:
        log_name = f'log_{col.replace(" ", "_").replace(".", "")}'
        df[log_name] = np.log1p(df[col])
    # Remove the raw columns in one non-mutating call instead of repeated
    # inplace drops; the resulting column order is the same.
    df = df.drop(columns=present_cols)

    # Label encoding for training set
    if is_train and 'Load_Type' in df.columns:
        label_map = {'Light_Load': 0, 'Medium_Load': 1, 'Maximum_Load': 2}
        labels = df['Load_Type']
        # Series.map turns any label missing from label_map into NaN without
        # warning, which would silently corrupt the training target.
        # Fail loudly instead; values that are already null are left alone.
        unknown = labels[labels.notna() & ~labels.isin(list(label_map))]
        if not unknown.empty:
            raise ValueError(
                "Unexpected Load_Type value(s): "
                f"{sorted(unknown.astype(str).unique())}; "
                f"expected one of {list(label_map)}"
            )
        df['Load_Type'] = labels.map(label_map)

    return df

# === LOAD AND TRANSFORM TRAIN DATA ===
train_path = RAW_DIR.joinpath("train", "power_train.csv")
train_df = pd.read_csv(filepath_or_buffer=train_path)
train_transformed = apply_feature_engineering(df=train_df, is_train=True)
# Preview is printed as CSV text so every column of the first 10 rows is shown.
print("🔹 Preview of Transformed Train Data:\n")
print(train_transformed.iloc[:10].to_csv(index=False))

# SAVE TRAIN
train_transformed.to_csv(PROCESSED_DIR.joinpath("transformed_power_train.csv"), index=False)

# === LOAD AND TRANSFORM TEST DATA ===
test_path = RAW_DIR.joinpath("test", "power_test.csv")
test_df = pd.read_csv(filepath_or_buffer=test_path)
# is_train=False: the function skips label encoding for this frame.
test_transformed = apply_feature_engineering(df=test_df, is_train=False)
print("\n🔹 Preview of Transformed Test Data:\n")
print(test_transformed.iloc[:10].to_csv(index=False))

# SAVE TEST
test_transformed.to_csv(PROCESSED_DIR.joinpath("transformed_power_test.csv"), index=False)
