# 🏭 **Overview**

# 📚 **Library and Configuration**

In [None]:
# System & Environment Configuration
import sys
import importlib
sys.path.append("..")

# Ignore warning
from warnings import filterwarnings
filterwarnings("ignore")

# Core Library
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.cluster import KMeans

# Source helper
import src.preprocessing as preprocessing

# Reload shortcut
def r(module=preprocessing):
    """Reload ``module`` via ``importlib.reload`` so source edits are picked up.

    With no argument, the ``preprocessing`` helper module imported above is
    reloaded. Nothing is returned.
    """
    importlib.reload(module)

r()

# Processed train/test locations
# Folder holding both the processed inputs and the engineered outputs
# (a relative path, so it resolves against the current working directory).
PROCESSED_ROOT = Path('../data/processed/')

# Inputs read by this notebook
TRAIN_PATH_PROCESSED = PROCESSED_ROOT/'train.csv'
TEST_PATH_PROCESSED = PROCESSED_ROOT/'test.csv'

# Outputs written at the end of this notebook
TRAIN_PATH_ENGINEERED = PROCESSED_ROOT/'train_engineered.csv'
TEST_PATH_ENGINEERED = PROCESSED_ROOT/'test_engineered.csv'

print('library and configuration ready!')

library and configuration ready!


# 🗃️ **Train and Test Loading**

In [97]:
# Load the processed train and test CSVs with pandas' default parsing.
train = pd.read_csv(TRAIN_PATH_PROCESSED)
test = pd.read_csv(TEST_PATH_PROCESSED)

# Quick sanity check on row/column counts.
print('Train shape :', train.shape)
print('Test Shape  :', test.shape)

Train shape : (18942, 39)
Test Shape  : (1077, 39)


# ⚙️ **Feature Engineering**

In [98]:
# Work on copies so the original `train` / `test` frames stay untouched; the
# step-by-step cells below only demonstrate how each feature-engineering step works.
train_lore = train.copy()
test_lore = test.copy()

In [99]:
# Integer code (1-10) for each 'Cloud Type' label.
# NOTE(review): the ordering looks like an intentional ordinal ranking — confirm.
cloud_map = {
    'Unknown': 1, 'Opaque Ice': 2, 'Overlapping': 3,
    'Super-Cooled Water': 4, 'Cirrus': 5, 'Fog': 6,
    'Water': 7, 'Overshooting': 8,
    'Probably Clear': 9, 'Clear': 10
}

# Replace the labels with their codes in both working copies.
# Series.map yields NaN for any label that is not a key of cloud_map.
train_lore['Cloud Type'] = train_lore['Cloud Type'].map(cloud_map)
test_lore['Cloud Type'] = test_lore['Cloud Type'].map(cloud_map)

In [100]:
def add_time_features(df):
    """Return a copy of ``df`` sorted by time with calendar features appended.

    The ``Timestamp`` column is parsed to datetimes (unparseable values become
    ``NaT``) and the rows are sorted by it. Five columns are added:

    * ``hour_sin`` / ``hour_cos`` - cyclical encoding of the hour taken from
      the timestamp shifted back by five hours.
    * ``month_sin`` / ``month_cos`` - cyclical encoding of the month of the
      unshifted timestamp.
    * ``doy`` - day of year of the shifted timestamp.

    The input frame is not modified and the original index labels are kept.
    """
    out = df.copy()

    # Normalise the timestamp column, then order rows chronologically
    out['Timestamp'] = pd.to_datetime(out['Timestamp'], errors='coerce')
    out = out.sort_values('Timestamp')

    stamps = out['Timestamp']
    # Timestamps moved back 5h (solar noon correction)
    shifted = stamps - pd.Timedelta(hours=5)

    # Angles on the 24-hour and 12-month circles
    hour_angle = 2 * np.pi * shifted.dt.hour / 24
    month_angle = 2 * np.pi * stamps.dt.month / 12

    out['hour_sin'] = np.sin(hour_angle)
    out['hour_cos'] = np.cos(hour_angle)
    out['month_sin'] = np.sin(month_angle)
    out['month_cos'] = np.cos(month_angle)

    # Day-of-year feeds the astronomy formulas downstream
    out['doy'] = shifted.dt.dayofyear

    return out

# Run the time-feature step on both working copies.
train_time = add_time_features(train_lore)
test_time = add_time_features(test_lore)

# Report the resulting frame shapes.
print('Train shape after time features :', train_time.shape)
print('Test shape after time features  :', test_time.shape)

Train shape after time features : (18942, 44)
Test shape after time features  : (1077, 44)


In [102]:
def add_astronomy_features(df):
    """Return a copy of ``df`` with day-of-year based solar geometry columns.

    Reads the ``doy`` column and appends, in this order:
    ``solar_declination``, ``equation_of_time``,
    ``sun_earth_distance_factor`` and ``extraterrestrial_radiation``
    (1367 times the distance factor).

    NOTE(review): units are not stated in the code; the formulas look like
    the usual degree / minute / W-per-m2 approximations - confirm.
    """
    out = df.copy()
    day = out['doy']

    # Solar declination
    declination_angle = np.radians(360 * (284 + day) / 365)
    out['solar_declination'] = 23.45 * np.sin(declination_angle)

    # Equation of Time, from the seasonal angle b
    b = np.radians((day - 81) * 360 / 365)
    double_term = 9.87 * np.sin(2 * b)
    cos_term = 7.53 * np.cos(b)
    sin_term = 1.5 * np.sin(b)
    out['equation_of_time'] = double_term - cos_term - sin_term

    # Earth-Sun distance correction and the radiation scaled by it
    year_angle = np.radians(360 * day / 365)
    out['sun_earth_distance_factor'] = 1 + 0.033 * np.cos(year_angle)
    out['extraterrestrial_radiation'] = 1367 * out['sun_earth_distance_factor']

    return out

# Add the astronomy columns on top of the time-feature frames.
train_astro = add_astronomy_features(train_time)
test_astro = add_astronomy_features(test_time)

# Report the resulting frame shapes.
print('Train shape after astronomy features :', train_astro.shape)
print('Test shape after astronomy features  :', test_astro.shape)

Train shape after astronomy features : (18942, 48)
Test shape after astronomy features  : (1077, 48)


In [103]:
def add_sun_features(df):
    """Return a copy of ``df`` with daylight length and a day/night flag.

    ``sunrise`` and ``sunset`` are parsed with ``pd.to_datetime``;
    ``Timestamp`` must already be a datetime column.

    * ``sunHour`` - sunset minus sunrise. Despite the name, the value is
      stored in seconds (``total_seconds``). An existing ``sunHour`` column
      is overwritten.
    * ``is_daytime`` - 1 when the row's clock time lies between the sunrise
      and sunset clock times (both ends inclusive), otherwise 0.
    """
    out = df.copy()

    rise_ts = pd.to_datetime(out['sunrise'])
    down_ts = pd.to_datetime(out['sunset'])

    # Length of daylight
    daylight = down_ts - rise_ts
    out['sunHour'] = daylight.dt.total_seconds()

    # Compare time-of-day only, so the date part of each value is ignored
    now_clock = out['Timestamp'].dt.time
    rise_clock = rise_ts.dt.time
    down_clock = down_ts.dt.time

    flags = []
    for now, rise, down in zip(now_clock, rise_clock, down_clock):
        flags.append(int(rise <= now <= down))
    out['is_daytime'] = flags

    return out

# Add the daylight-length and day/night columns on top of the astronomy frames.
train_sun = add_sun_features(train_astro)
test_sun = add_sun_features(test_astro)

# Report the resulting frame shapes.
print('Train shape after sun features :', train_sun.shape)
print('Test shape after sun features  :', test_sun.shape)

Train shape after sun features : (18942, 49)
Test shape after sun features  : (1077, 49)


In [105]:
def add_physics_features(df):
    """Return a copy of ``df`` with three ratio features appended.

    * ``clearsky_index`` - ``GHI`` divided by ``Clearsky GHI``.
    * ``diffuse_fraction`` - ``DHI`` divided by ``GHI``.
    * ``wind_cooling_potential`` - ``windspeedKmph`` divided by ``tempC``
      shifted by 273.15 (Celsius to Kelvin).

    A small constant (1e-6) is added to the irradiance denominators so that
    a zero denominator does not raise or produce infinities.
    """
    guard = 1e-6
    kelvin_offset = 273.15

    clearsky_ratio = df['GHI'] / (df['Clearsky GHI'] + guard)
    diffuse_ratio = df['DHI'] / (df['GHI'] + guard)
    # Wind cooling uses the absolute temperature
    cooling = df['windspeedKmph'] / (df['tempC'] + kelvin_offset)

    return df.assign(
        clearsky_index=clearsky_ratio,
        diffuse_fraction=diffuse_ratio,
        wind_cooling_potential=cooling,
    )

# Add the ratio ("physics") columns on top of the sun-feature frames.
train_physics = add_physics_features(train_sun)
test_physics = add_physics_features(test_sun)

# Report the resulting frame shapes.
print('Train shape after physics features :', train_physics.shape)
print('Test shape after physics features  :', test_physics.shape)

Train shape after physics features : (18942, 52)
Test shape after physics features  : (1077, 52)


In [106]:
def add_time_dynamic_features(df):
    """Return a copy of ``df`` with one-hour lag and 3-hour rolling features.

    Added columns:

    * ``GHI_lag1`` / ``cloudcover_lag1`` - the ``GHI`` / ``cloudcover`` value
      of the row whose ``Timestamp`` is exactly one hour earlier.
    * ``GHI_rolling_mean_3h`` - mean of ``GHI`` over the trailing 3-hour
      time window (at least one observation required).

    Missing values in the three new columns are filled with 0, so "no
    observation one hour earlier" and "value was 0" are indistinguishable.

    The row count of the input is always preserved: if several rows share a
    timestamp, the last of them supplies the lag values. Without that
    de-duplication the left merge would emit one output row per matching
    lookup row and silently inflate the frame.

    ``Timestamp`` must be a datetime column. The result has a fresh
    ``RangeIndex`` because it comes out of a merge.
    NOTE(review): the time-based rolling window needs the rows ordered by
    ``Timestamp``; ``add_time_features`` sorts them - confirm for other callers.
    """
    new_cols = ['GHI_lag1', 'cloudcover_lag1', 'GHI_rolling_mean_3h']
    df = df.copy()

    # Timestamp each row should look up for its one-hour lag
    df['target_time_1h'] = df['Timestamp'] - pd.Timedelta(hours=1)

    # Lookup table keyed by timestamp; one row per timestamp keeps the
    # left merge many-to-one so it cannot multiply rows.
    lookup = df[['Timestamp', 'GHI', 'cloudcover']].copy()
    lookup.columns = ['ts_ref', 'GHI_lag1', 'cloudcover_lag1']
    lookup = lookup.drop_duplicates(subset='ts_ref', keep='last')

    df = df.merge(lookup, left_on='target_time_1h',
                  right_on='ts_ref', how='left')

    # Rolling 3-hour mean; .values drops the time index so the result is
    # assigned positionally onto the merged frame.
    by_time = df.set_index('Timestamp')
    df['GHI_rolling_mean_3h'] = (
        by_time['GHI'].rolling('3h', min_periods=1).mean().values
    )

    # Remove the merge helpers and fill gaps in the new columns only
    df = df.drop(columns=['target_time_1h', 'ts_ref'])
    df[new_cols] = df[new_cols].fillna(0)

    return df

# Add the lag and rolling-mean columns on top of the physics frames.
train_roll = add_time_dynamic_features(train_physics)
test_roll = add_time_dynamic_features(test_physics)

# Report the resulting frame shapes.
print('Train shape after rolling features :', train_roll.shape)
print('Test shape after rolling features  :', test_roll.shape)

Train shape after rolling features : (18942, 55)
Test shape after rolling features  : (1077, 55)


In [107]:
def add_gradient_features(df):
    """Return a copy of ``df`` with first- and second-difference features.

    * ``GHI_diff_1h`` / ``cloudcover_diff_1h`` - current value minus the
      matching ``*_lag1`` column.
    * ``clearsky_index_diff`` - ``clearsky_index`` minus the previous row's
      value (row order, not the lag column); the first row is 0.
    * ``GHI_acceleration`` - row-to-row change of ``GHI_diff_1h``; missing
      differences are filled with 0.
    """
    out = df.copy()

    # Differences against the one-hour lag columns
    lag_specs = (
        ('GHI_diff_1h', 'GHI', 'GHI_lag1'),
        ('cloudcover_diff_1h', 'cloudcover', 'cloudcover_lag1'),
    )
    for target, current, lagged in lag_specs:
        out[target] = out[current] - out[lagged]

    # Row-to-row differences; the last one is the 2nd derivative (acceleration)
    step_specs = (
        ('clearsky_index_diff', 'clearsky_index'),
        ('GHI_acceleration', 'GHI_diff_1h'),
    )
    for target, source in step_specs:
        out[target] = out[source].diff().fillna(0)

    return out

# Add the difference ("gradient") columns on top of the rolling-feature frames.
train_grad = add_gradient_features(train_roll)
test_grad = add_gradient_features(test_roll)

# Report the resulting frame shapes.
print('Train shape after gradient features :', train_grad.shape)
print('Test shape after gradient features  :', test_grad.shape)

Train shape after gradient features : (18942, 59)
Test shape after gradient features  : (1077, 59)


In [108]:
def fit_weather_clusters(train_df, n_clusters=4):
    """Fit a KMeans model on a fixed set of weather columns of ``train_df``.

    Missing values in those columns are replaced by 0 before fitting, and the
    columns are used as-is (no scaling is applied here). The random state is
    fixed at 42.

    Returns a ``(model, features)`` tuple: the fitted KMeans instance and the
    list of column names it was trained on, in training order.
    """
    features = [
        'cloudcover_lag1',
        'humidity',
        'windspeedKmph',
        'GHI_rolling_mean_3h',
        'clearsky_index',
        'diffuse_fraction'
    ]

    design = train_df[features].fillna(0)
    # KMeans.fit returns the estimator itself, so this is the fitted model
    model = KMeans(n_clusters=n_clusters, random_state=42).fit(design)

    return model, features

def apply_weather_clusters(df, model, features):
    """Return a copy of ``df`` with a ``weather_cluster`` label column.

    The label is ``model.predict`` evaluated on the ``features`` columns of
    ``df`` after their missing values are replaced by 0.
    """
    design = df[features].fillna(0)
    return df.assign(weather_cluster=model.predict(design))


# Fit the clustering model on the training frame only ...
cluster_model, cluster_feats = fit_weather_clusters(train_grad)

# ... then label both frames with that same fitted model.
train_cluster = apply_weather_clusters(train_grad, cluster_model, cluster_feats)
test_cluster  = apply_weather_clusters(test_grad, cluster_model, cluster_feats)

# Report the resulting frame shapes.
print('Train shape after weather clustering :', train_cluster.shape)
print('Test shape after weather clustering  :', test_cluster.shape)

Train shape after weather clustering : (18942, 60)
Test shape after weather clustering  : (1077, 60)


In [109]:
def fit_stl_decomposition(train_df, period=24):
    """Decompose training ``GHI`` with STL and fit a linear model per component.

    ``GHI`` (missing values replaced by 0) is decomposed with the given
    ``period``. Each of the trend, seasonal and residual components is then
    regressed on the row position ``0 .. len(train_df) - 1`` so that the
    pattern can be approximated for other frames.

    Returns ``(trend_model, seasonal_model, resid_model)``.

    NOTE(review): ``STL`` and ``LinearRegression`` are not imported in the
    visible import cell; they must be in scope (presumably
    ``statsmodels.tsa.seasonal.STL`` and
    ``sklearn.linear_model.LinearRegression``) or this raises ``NameError``.
    """
    # Fit STL on train
    ghi = train_df['GHI'].fillna(0)
    decomposition = STL(ghi, period=period).fit()
    components = (
        decomposition.trend,
        decomposition.seasonal,
        decomposition.resid,
    )

    # Row positions as a single-column design matrix
    steps = np.arange(len(train_df)).reshape(-1, 1)

    # One linear approximation per component, in trend/seasonal/resid order
    trend_model, seasonal_model, resid_model = (
        LinearRegression().fit(steps, component) for component in components
    )

    return trend_model, seasonal_model, resid_model

def apply_stl_features(df, trend_model, seasonal_model, resid_model):
    """Return a copy of ``df`` with model-predicted STL component columns.

    Each model's ``predict`` is called on the row positions
    ``0 .. len(df) - 1`` (a single-column array) and the results are stored as
    ``GHI_trend``, ``GHI_seasonal`` and ``GHI_residual``. Positions restart at
    0 for every frame passed in.
    """
    positions = np.arange(len(df)).reshape(-1, 1)
    out = df.copy()

    column_models = (
        ('GHI_trend', trend_model),
        ('GHI_seasonal', seasonal_model),
        ('GHI_residual', resid_model),
    )
    for column, model in column_models:
        out[column] = model.predict(positions)

    return out

# Fit the STL-based component models on the training frame only ...
trend_m, season_m, resid_m = fit_stl_decomposition(train_cluster)

# ... then add the predicted component columns to both frames.
train_stl = apply_stl_features(train_cluster, trend_m, season_m, resid_m)
test_stl  = apply_stl_features(test_cluster, trend_m, season_m, resid_m)

# Report the resulting frame shapes.
print('Train shape after STL features :', train_stl.shape)
print('Test shape after STL features  :', test_stl.shape)

Train shape after STL features : (18942, 63)
Test shape after STL features  : (1077, 63)


# 🛠️ **Feature Engineering Pipeline**

In [111]:
# Packaged pipeline from src.preprocessing, configured with 4 clusters and an
# STL period of 24.
# NOTE(review): presumably it mirrors the step-by-step cells above — confirm.
fe = preprocessing.FeatureEngineering(n_clusters=4, stl_period=24)

# Fit on the original (un-copied) training frame only.
fe.fit(train)

# Transform both frames with the fitted pipeline.
train_engineered = fe.transform(train)
test_engineered  = fe.transform(test)

print('Train engineered shape :', train_engineered.shape)
print('Test engineered shape  :', test_engineered.shape)

Train engineered shape : (18942, 63)
Test engineered shape  : (1077, 63)


In [None]:
# Per-column NaN counts, keeping only the columns that have at least one NaN
train_engineered.isna().sum().loc[lambda counts: counts > 0]

moonrise             629
moonset              639
DHI                 1044
DNI                 1044
GHI                 1044
Clearsky DHI        1044
Clearsky DNI        1044
Clearsky GHI        1044
clearsky_index      1044
diffuse_fraction    1044
GHI_diff_1h         1044
dtype: int64

In [113]:
# Persist the engineered frames as CSV; index=False leaves the row index out of the file.
train_engineered.to_csv(TRAIN_PATH_ENGINEERED, index=False)
test_engineered.to_csv(TEST_PATH_ENGINEERED, index=False)

print(f'Train and Test engineered saved to {TRAIN_PATH_ENGINEERED} and {TEST_PATH_ENGINEERED}')

Train and Test engineered saved to ..\data\processed\train_engineered.csv and ..\data\processed\test_engineered.csv


# 📦 **Preprocessor**

In [112]:
# def get_feature_groups(
#     df, target_col, cyclical_cols, manual_drop_cols=None
# ):
#     """
#     Automatically split the columns into Numerical, Ordinal, and Cyclical groups.
#     """
#     if manual_drop_cols is None:
#         manual_drop_cols = ['tempC', 'DewPointC', 'Pressure'] # Default drop

#     # Columns that are always dropped (target + raw cyclical + manual drops)
#     features_to_drop = [target_col] + cyclical_cols + manual_drop_cols

#     # Numerical: take every numeric column, then remove those in the drop list
#     num_features = df.select_dtypes(np.number).columns.difference(features_to_drop).tolist()
#     # Ordinal: hard-coded according to the chosen strategy
#     ord_features = ['Cloud Type']
#     # Cyclical: as given in the input
#     cyc_features = cyclical_cols

#     # Combine everything so X can be selected later
#     all_selected = list(set(num_features + ord_features + cyc_features))

#     print(f"📊 Features Summary:")
#     print(f"   - Numerical : {len(num_features)}")
#     print(f"   -   Ordinal : {len(ord_features)}")
#     print(f"   -  Cyclical : {len(cyc_features)}")
#     print(f"   -     TOTAL : {len(all_selected)}")

#     return num_features, ord_features, cyc_features, all_selected

# def build_preprocessor(num_feat, ord_feat, cyc_feat):
#     """
#     Assemble the ColumnTransformer in a tidy way.
#     """
#     # Sub-pipeline for cyclical features
#     cyclical_pipeline = Pipeline([
#         ('cyclic_encoding', SolarLunarTransformer(features=cyc_feat))
#     ])

#     # Sub-pipeline for numerical features
#     numerical_pipeline = Pipeline([
#         ('numeric_imputer', SimpleImputer(strategy='mean')),
#         ('numeric_scaler', StandardScaler())
#     ])

#     # Main Transformer
#     preprocessor = ColumnTransformer(transformers=[
#         ('ord', 'passthrough', ord_feat),
#         ('cyc', cyclical_pipeline, cyc_feat),
#         ('num', numerical_pipeline, num_feat),
#     ])

#     return preprocessor

# # Setup Data & Features
# solar_lunar = ['moonrise', 'moonset', 'sunrise', 'sunset']

# num_cols, ord_cols, cyc_cols, sel_features = get_feature_groups(
#     train_engineered, '% Baseline', solar_lunar
# )