In [None]:
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)]
(https://colab.research.google.com/github/richochandra0805/ml-rainhour/blob/main/Rainhour_Prediction.ipynb)

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Regresi Rainhour dengan Auto Model Selection

In [7]:
# ============================================
# 📦 1. INSTALL LIBRARY
# ============================================
!pip install scikit-learn xgboost openpyxl joblib -q

# ============================================
# 📥 2. LOAD DATA
# ============================================
import pandas as pd

df = pd.read_excel("/content/drive/MyDrive/12822034-AcademicsLibrary/Kerja-Praktik/PT-Adaro-Indonesia-(2025)/4.Kerjaan/TOPIK1-ANNUALRAINHOURFORECAST/Data/Data_Rain_Manual1998-2025.xlsx")
df['Tanggal'] = pd.to_datetime(df['Tanggal'])
df = df.dropna()

# Feature Engineering
df['dayofyear'] = df['Tanggal'].dt.dayofyear
df['month'] = df['Tanggal'].dt.month

X = df[['Rainfall', 'LostHour', 'Nino-3.4', 'dayofyear', 'month']]
y = df['Rainhour']

# ============================================
# 🔀 3. SPLIT DATA
# ============================================
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ============================================
# 🤖 4. TRAINING & AUTO-SELECTION MODEL
# ============================================
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_absolute_error, r2_score

models = {
    "rf": Pipeline([
        ('scaler', StandardScaler()),
        ('model', RandomForestRegressor(random_state=42))
    ]),
    "xgb": Pipeline([
        ('scaler', StandardScaler()),
        ('model', XGBRegressor(objective='reg:squarederror', random_state=42))
    ]),
    "mlp": Pipeline([
        ('scaler', StandardScaler()),
        ('model', MLPRegressor(random_state=42, max_iter=1000))
    ])
}

params = {
    "rf": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [5, 10, None]
    },
    "xgb": {
        "model__n_estimators": [100, 200],
        "model__max_depth": [3, 5, 7]
    },
    "mlp": {
        "model__hidden_layer_sizes": [(64,), (64, 32)],
        "model__activation": ['relu', 'tanh']
    }
}

results = {}
for name in models:
    print(f"Training {name.upper()}...")
    search = GridSearchCV(models[name], params[name], cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
    search.fit(X_train, y_train)
    y_pred = search.predict(X_test)
    results[name] = {
        "best_score": -search.best_score_,
        "test_mae": mean_absolute_error(y_test, y_pred),
        "r2": r2_score(y_test, y_pred),
        "best_params": search.best_params_,
        "model": search.best_estimator_
    }

# Tampilkan hasil semua model
import pprint
pprint.pprint({k: {x: results[k][x] for x in ['best_score', 'test_mae', 'r2']} for k in results})

# Ambil model terbaik berdasarkan test_mae
best_name = min(results, key=lambda x: results[x]['test_mae'])
best_model = results[best_name]['model']
print(f"\nModel terbaik: {best_name.upper()} dengan MAE {results[best_name]['test_mae']:.2f}")

# ============================================
# 💾 5. SIMPAN MODEL
# ============================================
import joblib
joblib.dump(best_model, f"model_rainhour_best_{best_name}.pkl")


Training RF...
Training XGB...
Training MLP...
{'mlp': {'best_score': np.float64(0.4914696613024194),
         'r2': 0.9067183954724032,
         'test_mae': 0.5050554195126451},
 'rf': {'best_score': np.float64(0.4786446207437391),
        'r2': 0.9027242255683187,
        'test_mae': 0.48777123992268434},
 'xgb': {'best_score': np.float64(0.49468497195640265),
         'r2': 0.899333271145261,
         'test_mae': 0.5003213937731403}}

Model terbaik: RF dengan MAE 0.49


['model_rainhour_best_rf.pkl']

Klasifikasi Multi Output

In [8]:
# ============================================
# 📦 1. INSTALL LIBRARY
# ============================================
!pip install scikit-learn xgboost joblib openpyxl -q

# ============================================
# 📥 2. LOAD DAN PERSIAPKAN DATA
# ============================================
import pandas as pd
import numpy as np

df = pd.read_excel("/content/drive/MyDrive/12822034-AcademicsLibrary/Kerja-Praktik/PT-Adaro-Indonesia-(2025)/4.Kerjaan/TOPIK1-ANNUALRAINHOURFORECAST/Data/Data_Rain_Manual1998-2025.xlsx")
df['Tanggal'] = pd.to_datetime(df['Tanggal'])
df = df.dropna()

# Buat fitur waktu
df['dayofyear'] = df['Tanggal'].dt.dayofyear
df['month'] = df['Tanggal'].dt.month

# ============================================
# 🧪 3. REKONSTRUKSI PROBABILITAS JAM HUJAN
# ============================================
# Strategi sederhana: asumsi hujan dimulai jam 14:00, durasi = Rainhour
def rain_hours_binary(rainhour, start_hour=14):
    hours = np.zeros(24)
    dur = int(round(rainhour))
    for i in range(dur):
        h = (start_hour + i) % 24
        hours[h] = 1
    return hours

rain_matrix = df['Rainhour'].apply(lambda rh: rain_hours_binary(rh)).to_list()
rain_array = np.vstack(rain_matrix).astype(int)

# Buat dataframe target multi-output
df_rainhour_bin = pd.DataFrame(rain_array, columns=[f"Rain_at_hour_{i}" for i in range(24)])
df_final = pd.concat([df, df_rainhour_bin], axis=1)

# ============================================
# 🎯 4. SPLIT FITUR DAN TARGET
# ============================================
from sklearn.model_selection import train_test_split

features = ['Rainfall', 'LostHour', 'Nino-3.4', 'dayofyear', 'month']
targets = [f"Rain_at_hour_{i}" for i in range(24)]

X = df_final[features]
Y = df_final[targets]

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# ============================================
# 🤖 5. TRAINING MULTIOUTPUT CLASSIFIER
# ============================================
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, log_loss, roc_auc_score

models = {
    "rf": Pipeline([
        ('scaler', StandardScaler()),
        ('model', MultiOutputClassifier(RandomForestClassifier(random_state=42)))
    ]),
    "xgb": Pipeline([
        ('scaler', StandardScaler()),
        ('model', MultiOutputClassifier(XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)))
    ]),
    "mlp": Pipeline([
        ('scaler', StandardScaler()),
        ('model', MultiOutputClassifier(MLPClassifier(max_iter=500, random_state=42)))
    ])
}

params = {
    "rf": {
        "model__estimator__n_estimators": [100],
        "model__estimator__max_depth": [5, 10]
    },
    "xgb": {
        "model__estimator__n_estimators": [100],
        "model__estimator__max_depth": [3, 5]
    },
    "mlp": {
        "model__estimator__hidden_layer_sizes": [(64,), (64,32)],
        "model__estimator__activation": ['relu']
    }
}

results = {}
for name in models:
    print(f"\nTraining {name.upper()}...")
    search = GridSearchCV(models[name], params[name], cv=3, scoring='accuracy', n_jobs=-1)
    search.fit(X_train, Y_train)
    Y_pred = search.predict(X_test)
    results[name] = {
        "best_score": search.best_score_,
        "test_accuracy": np.mean(Y_pred == Y_test.values),
        "best_params": search.best_params_,
        "model": search.best_estimator_
    }

# ============================================
# 📈 6. EVALUASI HASIL
# ============================================
for name in results:
    print(f"\n{name.upper()} Accuracy: {results[name]['test_accuracy']:.3f} | Best CV Score: {results[name]['best_score']:.3f}")

# Ambil model terbaik
best_name = max(results, key=lambda x: results[x]['test_accuracy'])
best_model = results[best_name]['model']
print(f"\n✅ Model terbaik: {best_name.upper()}")

# ============================================
# 💾 7. SIMPAN MODEL
# ============================================
import joblib
joblib.dump(best_model, f"model_prob_rainhour_best_{best_name}.pkl")



Training RF...

Training XGB...


2 fits failed out of a total of 6.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.11/dist-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/sklearn/pipeline.py", line 662, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/usr/local/lib/python3.11/dist-packages/sklearn/multioutput.py", line 543, 


Training MLP...

RF Accuracy: 0.979 | Best CV Score: 0.669

XGB Accuracy: 0.979 | Best CV Score: nan

MLP Accuracy: 0.979 | Best CV Score: 0.659

✅ Model terbaik: RF


['model_prob_rainhour_best_rf.pkl']