In [None]:
# Cleaner Utility
import src.preprocessing as preprocessing

# Reload shortcut
def r(module=preprocessing):
    """Reload ``module`` so that edits made to its source file take effect.

    Parameters
    ----------
    module : module, default ``preprocessing``
        The already-imported module object to reload.

    Returns
    -------
    None
        The module is refreshed in place by ``importlib.reload``.
    """
    # Imported locally so the shortcut works regardless of which notebook
    # cells have been executed before this one.
    import importlib

    importlib.reload(module)

In [None]:
# Work on copies so the raw frames stay untouched by later cells.
train_df = train_raw.copy()
test_df = test_raw.copy()

# Quick sanity check: one shape per line, train first and test second.
print(train_df.shape, test_df.shape, sep="\n")

def feature_engineering(df, solar_offset_hours=5):
    """Derive time, astronomical, ratio and lag features from a raw frame.

    The input is copied, its ``Timestamp`` column is parsed and the rows are
    sorted chronologically. The returned frame is therefore in ``Timestamp``
    order with a fresh ``RangeIndex`` (the lag merge resets the index), not
    in the caller's original row order.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Timestamp``, ``Cloud Type``, ``sunrise``,
        ``sunset`` (both formatted like ``"05:30 AM"``), ``GHI``,
        ``Clearsky GHI``, ``DHI``, ``windspeedKmph``, ``tempC`` and
        ``cloudcover``.
    solar_offset_hours : int or float, default 5
        Hours subtracted from ``Timestamp`` to obtain the "solar time" used
        for the hour-of-day and day-of-year features.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same number of rows as ``df`` plus the
        engineered columns.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    required = [
        'Timestamp', 'Cloud Type', 'sunrise', 'sunset', 'GHI',
        'Clearsky GHI', 'DHI', 'windspeedKmph', 'tempC', 'cloudcover',
    ]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"feature_engineering: missing required columns {missing}")

    df = df.copy()

    # Standardize timestamp. A stable sort keeps rows that share a timestamp
    # in their incoming relative order, so the output is deterministic.
    df['Timestamp'] = pd.to_datetime(df['Timestamp'])
    df = df.sort_values('Timestamp', kind='mergesort')

    # --- CLOUD MAPPING ---
    # Labels that are not in the map (including missing values) become 0,
    # the same code as 'Unknown'.
    cloud_map = {
        'Unknown': 0, 'Opaque Ice': 1, 'Overlapping': 2, 'Super-Cooled Water': 3,
        'Cirrus': 4, 'Fog': 5, 'Water': 6, 'Overshooting': 7,
        'Probably Clear': 8, 'Clear': 9
    }
    df['Cloud Type'] = df['Cloud Type'].map(cloud_map).fillna(0)

    # --- PHYSICS CORRECTION (CRITICAL) ---
    # Observation (from the original author): solar peak occurs at 17:00 in
    # the raw data. Shifting by -solar_offset_hours (default 5) aligns solar
    # noon with 12:00 so the model sees the true "shape" of the solar day.
    solar_time = df['Timestamp'] - pd.Timedelta(hours=solar_offset_hours)
    solar_hour = solar_time.dt.hour
    doy = solar_time.dt.dayofyear

    # --- CYCLICAL FEATURES ---
    # Hour-of-day is computed on solar time for better alignment with physics.
    df['hour_sin'] = np.sin(2 * np.pi * solar_hour / 24)
    df['hour_cos'] = np.cos(2 * np.pi * solar_hour / 24)

    # Monthly cycles (taken from the raw, unshifted timestamp).
    month = df['Timestamp'].dt.month
    df['month_sin'] = np.sin(2 * np.pi * month / 12)
    df['month_cos'] = np.cos(2 * np.pi * month / 12)

    # --- ASTRONOMICAL FEATURES ---
    # Solar Declination (Cooper 1969)
    df['solar_declination'] = 23.45 * np.sin(np.radians(360 * (284 + doy) / 365))

    # Equation of Time (EoT)
    B = np.radians((doy - 81) * 360 / 365)
    df['equation_of_time'] = 9.87 * np.sin(2*B) - 7.53 * np.cos(B) - 1.5 * np.sin(B)

    # Earth-Sun Distance Factor (Eccentricity)
    df['sun_earth_distance_factor'] = 1 + 0.033 * np.cos(np.radians(360 * doy / 365))
    df['extraterrestrial_radiation'] = 1367 * df['sun_earth_distance_factor']

    # --- SUNRISE / SUNSET LOGIC ---
    # Parsing raw time strings (no date part, so pandas fills in a dummy date).
    sunrise_dt = pd.to_datetime(df['sunrise'], format='%I:%M %p')
    sunset_dt = pd.to_datetime(df['sunset'], format='%I:%M %p')

    # Duration of daylight. NOTE: despite the column name, the value is
    # expressed in seconds.
    df['sunHour'] = (sunset_dt - sunrise_dt).dt.total_seconds()

    # Daytime flag: compare time-of-day only, as offsets from midnight.
    # Vectorised replacement for a per-row Python loop; rows with an
    # unparseable/missing time compare as False and are flagged 0.
    now_tod = df['Timestamp'] - df['Timestamp'].dt.normalize()
    rise_tod = sunrise_dt - sunrise_dt.dt.normalize()
    set_tod = sunset_dt - sunset_dt.dt.normalize()
    df['is_daytime'] = ((rise_tod <= now_tod) & (now_tod <= set_tod)).astype(int)

    # --- SOLAR PHYSICS RATIOS ---
    # Adding epsilon to avoid division by zero
    epsilon = 1e-6
    df['clearsky_index'] = df['GHI'] / (df['Clearsky GHI'] + epsilon)
    df['diffuse_fraction'] = df['DHI'] / (df['GHI'] + epsilon)

    # Wind Cooling Potential (temperature converted to Kelvin)
    df['wind_cooling_potential'] = df['windspeedKmph'] / (df['tempC'] + 273.15)

    # --- TIME AWARE FEATURES (SMART MERGE) ---
    # Lag features are looked up by timestamp (exactly one hour earlier)
    # instead of by row shift, so gaps/jumps in the data yield "no match"
    # rather than a value from the wrong hour.
    df['target_time_1h'] = df['Timestamp'] - pd.Timedelta(hours=1)

    # Keep one lookup row per timestamp: with duplicates the self-merge would
    # be many-to-many and silently multiply rows of the output.
    lookup = (
        df[['Timestamp', 'GHI', 'cloudcover']]
        .drop_duplicates(subset='Timestamp', keep='last')
        .rename(columns={
            'Timestamp': 'ts_ref',
            'GHI': 'GHI_lag1',
            'cloudcover': 'cloudcover_lag1',
        })
    )

    df = df.merge(lookup, left_on='target_time_1h', right_on='ts_ref', how='left')

    # Rolling stats (3h time-based window ending at, and including, each row).
    # The left merge preserved row order, so positional assignment is safe.
    ghi_by_time = df.set_index('Timestamp')['GHI']
    df['GHI_rolling_mean_3h'] = ghi_by_time.rolling('3h', min_periods=1).mean().to_numpy()

    # Cleanup of the merge helper columns
    df.drop(columns=['target_time_1h', 'ts_ref'], inplace=True)

    # Rows with no observation one hour earlier get 0 for the lag features.
    features_to_fill = ['GHI_lag1', 'cloudcover_lag1', 'GHI_rolling_mean_3h']
    df[features_to_fill] = df[features_to_fill].fillna(0)

    return df

# Run the same feature pipeline over both splits, replacing each frame
# with its engineered version (train first, then test).
train_df = feature_engineering(df=train_df)
test_df = feature_engineering(df=test_df)

In [None]:
def get_feature_groups(
    df, target_col, cyclical_cols, manual_drop_cols=None
):
    """Split the columns of ``df`` into numerical, ordinal and cyclical groups.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose numeric columns are candidates for the numerical group.
    target_col : str
        Name of the target column; never returned as a feature.
    cyclical_cols : iterable of str
        Raw cyclical columns. They are returned as the cyclical group and
        excluded from the numerical group.
    manual_drop_cols : list of str, optional
        Extra columns to exclude from the numerical group. Defaults to
        ``['tempC', 'DewPointC', 'Pressure']``.

    Returns
    -------
    tuple
        ``(num_features, ord_features, cyc_features, all_selected)``.
        The groups are disjoint from the numerical one, and ``all_selected``
        is their de-duplicated concatenation in numerical, ordinal, cyclical
        order. A summary is printed as a side effect.
    """
    if manual_drop_cols is None:
        manual_drop_cols = ['tempC', 'DewPointC', 'Pressure']  # default drop list

    # Ordinal: hard-coded to match the modelling strategy.
    ord_features = ['Cloud Type']
    # Cyclical: as supplied, copied so the caller's sequence is never aliased.
    cyc_features = list(cyclical_cols)

    # Columns that must not be treated as numerical: the target, the raw
    # cyclical columns, the manual drops AND the ordinal columns. Without the
    # last item a numerically-encoded 'Cloud Type' would land in both the
    # numerical and the ordinal group and be fed to the model twice.
    features_to_drop = [target_col, *cyc_features, *manual_drop_cols, *ord_features]

    # Numerical: every numeric column except the excluded ones (sorted by name).
    num_features = (
        df.select_dtypes(include=np.number)
        .columns.difference(features_to_drop)
        .tolist()
    )

    # Combined list used later to select X; dict.fromkeys removes duplicates
    # while keeping a deterministic order (a set would not).
    all_selected = list(dict.fromkeys(num_features + ord_features + cyc_features))

    print(f"ðŸ“Š Features Summary:")
    print(f"   - Numerical : {len(num_features)}")
    print(f"   -   Ordinal : {len(ord_features)}")
    print(f"   -  Cyclical : {len(cyc_features)}")
    print(f"   -     TOTAL : {len(all_selected)}")

    return num_features, ord_features, cyc_features, all_selected

def build_preprocessor(num_feat, ord_feat, cyc_feat):
    """Assemble the ColumnTransformer that routes each feature group.

    Parameters
    ----------
    num_feat : list of str
        Columns that are mean-imputed and then standardised.
    ord_feat : list of str
        Columns passed through unchanged.
    cyc_feat : list of str
        Columns handed to ``SolarLunarTransformer`` (defined elsewhere in the
        project; its encoding is not visible here).

    Returns
    -------
    ColumnTransformer
        Unfitted transformer with the branches ``'ord'``, ``'cyc'`` and
        ``'num'``, in that order.
    """
    # Cyclical branch: a single encoding step.
    cyc_steps = [
        ('cyclic_encoding', SolarLunarTransformer(features=cyc_feat)),
    ]

    # Numerical branch: fill gaps with the column mean, then scale.
    num_steps = [
        ('numeric_imputer', SimpleImputer(strategy='mean')),
        ('numeric_scaler', StandardScaler()),
    ]

    # Branch order is ord, cyc, num.
    branches = [
        ('ord', 'passthrough', ord_feat),
        ('cyc', Pipeline(cyc_steps), cyc_feat),
        ('num', Pipeline(num_steps), num_feat),
    ]
    return ColumnTransformer(transformers=branches)

In [None]:
# HYPERPARAMETER TUNING
# optuna.logging.set_verbosity(optuna.logging.ERROR)

# def objective(trial):
#     param_grid = {
#         'n_estimators': trial.suggest_int('n_estimators', 1000, 3000, step=100),
#         'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
#         'max_depth': trial.suggest_int('max_depth', 5, 12),
#         'min_child_weight': trial.suggest_int('min_child_weight', 1, 7),
#         'subsample': trial.suggest_float('subsample', 0.6, 0.9),
#         'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 0.9),
#         'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 1.0, log=True), # L1 Regularization
#         'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True), # L2 Regularization

#         'tree_method': 'hist',
#         'random_state': SEED,
#         'n_jobs': -1,
#         'device': 'cuda'
#     }

#     # Buat Model dengan parameter dari trial
#     model = XGBRegressor(**param_grid)

#     # Masukkan ke Pipeline
#     pipeline_optuna = Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('model', model)
#     ])

#     # Cross Validation
#     cv = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
#     scores = cross_val_score(pipeline_optuna, X, y, cv=cv, scoring='neg_root_mean_squared_error', n_jobs=1)

#     rmse = np.sqrt(-scores.mean())
#     return -scores.mean()

# study = optuna.create_study(direction='minimize') # Minimize error
# study.optimize(objective, n_trials=50, show_progress_bar=True)

# print('Best hyperparameters:', study.best_params)
# print('Best RMSE:', study.best_value)