In [34]:
!pip install lightgbm scikit-learn



In [35]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
import sys
from io import StringIO

# Colab-specific: mount Google Drive at /content/drive so the pickled inputs
# referenced by the path variables below can be read.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): this silences every Python warning for the rest of the session,
# including NumPy RuntimeWarnings (divide-by-zero, invalid value) raised by
# numeric code later in the notebook. Consider filtering specific categories.
warnings.filterwarnings('ignore')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# Directory on the mounted Google Drive that holds the pickled inputs.
# Change this single constant to point the notebook at another data location.
DATA_DIR = '/content/drive/My Drive/Data'

# Pickle files read by the data-loading cell (features and target respectively).
combined_features_path = f'{DATA_DIR}/c_features.pkl'
y_df_path = f'{DATA_DIR}/c_target.pkl'

In [37]:
def preprocess_features(X):
    """Scale each feature column and replace missing/infinite entries with 0.

    The data is standardised with ``StandardScaler`` and then rescaled with
    ``MinMaxScaler`` (default range, so the fitted data ends up in [0, 1]).
    Both scalers are fitted on exactly the rows passed in.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix (e.g. a DataFrame or ndarray). Not modified.

    Returns
    -------
    numpy.ndarray
        Scaled array with the same shape as ``X``. Entries that were NaN or
        +/-inf in the input are 0 in the output.

    Notes
    -----
    NOTE(review): because the scalers are fitted on whatever is passed in,
    calling this on a full dataset before cross-validation means every fold is
    scaled with statistics from all rows.
    """
    # scikit-learn scalers ignore NaN while fitting but raise ValueError on
    # +/-inf, so infinities must be neutralised *before* scaling. Mapping them
    # to NaN keeps them out of the fitted statistics; they are zero-filled below.
    X_finite = np.where(np.isinf(X), np.nan, X)

    scaler = StandardScaler()
    minmax_scaler = MinMaxScaler()

    X_scaled = scaler.fit_transform(X_finite)
    X_scaled = minmax_scaler.fit_transform(X_scaled)

    # NaN passes through both scalers untouched; replace every non-finite
    # entry (originally missing or infinite) with 0.
    X_scaled[~np.isfinite(X_scaled)] = 0

    return X_scaled

def calculate_mape(actual, forecast):
    """Mean absolute percentage error, expressed in percent.

    Computes ``mean(|actual - forecast| / |actual|) * 100``.

    Parameters
    ----------
    actual : array-like
        Observed values (list, ndarray, Series, ...).
    forecast : array-like
        Predicted values, broadcastable against ``actual``.

    Returns
    -------
    float
        MAPE in percent. Entries whose actual value is exactly 0 are skipped
        because the percentage error is undefined there; ``nan`` is returned
        when no entry remains. NaN inputs are not skipped and yield ``nan``.
    """
    # Convert up front so plain Python lists work (list - list is a TypeError).
    actual, forecast = np.broadcast_arrays(
        np.asarray(actual, dtype=float), np.asarray(forecast, dtype=float)
    )

    # A single zero actual would otherwise turn the whole mean into inf/NaN.
    nonzero = actual != 0
    if not nonzero.any():
        return np.nan

    pct_error = np.abs((actual[nonzero] - forecast[nonzero]) / actual[nonzero])
    return np.mean(pct_error) * 100

def calculate_log_loss(actual, forecast):
    """Mean squared log-ratio between actual and forecast values.

    Computes ``mean(log(actual / forecast) ** 2)``. Despite the name this is
    not the classification log-loss (cross-entropy).

    Parameters
    ----------
    actual : array-like
        Observed values (list, ndarray, Series, ...).
    forecast : array-like
        Predicted values, broadcastable against ``actual``.

    Returns
    -------
    float
        Mean squared log-ratio. Pairs where either value is 0 or the two
        values have opposite signs are skipped, since the logarithm of their
        ratio is not a finite real number; ``nan`` is returned when no pair
        remains. NaN inputs are not skipped and yield ``nan``.
    """
    # Convert up front so plain Python lists work (list / list is a TypeError).
    actual, forecast = np.broadcast_arrays(
        np.asarray(actual, dtype=float), np.asarray(forecast, dtype=float)
    )

    # log(actual / forecast) needs a strictly positive, finite ratio. One zero
    # or sign-flipped pair would otherwise make the whole mean inf/NaN.
    undefined = (
        (actual == 0)
        | (forecast == 0)
        | (np.sign(actual) * np.sign(forecast) < 0)
    )
    usable = ~undefined
    if not usable.any():
        return np.nan

    return np.mean(np.log(actual[usable] / forecast[usable]) ** 2)

class SuppressStdoutStderr:
    """Context manager that captures stdout/stderr and prints a short summary.

    While the ``with`` block is active, ``sys.stdout`` and ``sys.stderr`` are
    replaced by in-memory buffers. On exit the previous streams are restored
    and, for each stream that received output, the first captured line is
    printed followed by ``...(suppressed N lines)`` where N is the number of
    remaining lines.

    Only writes that go through ``sys.stdout`` / ``sys.stderr`` are captured.
    Exceptions raised inside the block are not swallowed.

    An instance can be reused: each entry starts with fresh buffers and
    restores the streams that were active at entry time.
    """

    def __init__(self):
        # Initial values kept for attribute compatibility; they are refreshed
        # on every __enter__ so a reused or early-constructed instance never
        # works with stale streams or leftover captured text.
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr
        self.mystdout = StringIO()
        self.mystderr = StringIO()

    def __enter__(self):
        # Remember whatever streams are active *now*, not at construction.
        self.old_stdout, self.old_stderr = sys.stdout, sys.stderr
        # Fresh buffers so output from a previous use is not reported again.
        self.mystdout, self.mystderr = StringIO(), StringIO()
        sys.stdout, sys.stderr = self.mystdout, self.mystderr
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore first so the summary below reaches the real stdout.
        sys.stdout, sys.stderr = self.old_stdout, self.old_stderr

        for captured in (self.mystdout, self.mystderr):
            lines = captured.getvalue().splitlines()
            if lines:
                print(lines[0])
                print(f"...(suppressed {len(lines)-1} lines)")

        # Falsy return: any exception from the block propagates to the caller.
        return False

In [38]:
# Load the pickled feature table and target from Google Drive.
# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
combined_features = pd.read_pickle(combined_features_path)
y_df = pd.read_pickle(y_df_path)

# Select the feature rows by the target's index labels, in the target's order
# (.loc raises KeyError if a target label is absent from the features).
combined_features = combined_features.loc[y_df.index]
# Flatten the target to a 1-D array.
# Assumes y_df holds a single column, so len(y) == number of rows — TODO confirm.
y = y_df.values.flatten()

# NOTE(review): the scalers are fitted here on all rows, before any fold is
# created, so every training fold is scaled with statistics that include rows
# from its test period.
X_scaled = preprocess_features(combined_features)

# 5 expanding-window folds: each fold trains on earlier rows and tests on the
# rows that follow. Assumes rows are in chronological order — TODO confirm.
splits = TimeSeriesSplit(n_splits=5)

In [39]:
# Hyper-parameter candidates: 2 * 3 * 3 * 2 * 3 = 108 combinations.
param_grid = dict(
    num_leaves=[31, 50],
    learning_rate=[0.1, 0.01, 0.005],
    n_estimators=[100, 200, 500],
    min_child_samples=[20, 50],
    min_split_gain=[0.0, 0.1, 0.01],
)

lgb_model = lgb.LGBMRegressor()

# GridSearchCV maximises its score, so MAE is registered as "lower is better";
# the stored scores are therefore negated MAE values.
mae_scorer = make_scorer(mean_absolute_error, greater_is_better=False)

grid_search = GridSearchCV(
    estimator=lgb_model,
    param_grid=param_grid,
    scoring=mae_scorer,
    cv=splits,
    verbose=2,
)

# Only the fit produces console output; capture it and show a one-line summary.
with SuppressStdoutStderr():
    grid_search.fit(X_scaled, y)

best_params = grid_search.best_params_

print("Best parameters found: ", best_params)
# Flip the sign back to report a positive MAE.
print("Best MAE score: ", -grid_search.best_score_)

# Fresh, unfitted regressor configured with the winning parameters.
final_model = lgb.LGBMRegressor(**best_params)

Fitting 5 folds for each of 108 candidates, totalling 540 fits
...(suppressed 150112 lines)
Best parameters found:  {'learning_rate': 0.1, 'min_child_samples': 20, 'min_split_gain': 0.01, 'n_estimators': 500, 'num_leaves': 31}
Best MAE score:  8.767671352496158


In [40]:
# Per-fold accumulators: out-of-fold predictions with their row positions,
# plus one score per fold for each metric.
forecasts, forecast_indices = [], []
mae_scores, mape_scores, log_loss_scores = [], [], []

# Refit the tuned model on every time-series fold and score it on the rows
# that follow. NOTE(review): these are the same folds used by the grid search,
# so the averaged MAE reproduces the selected CV score rather than an
# independent hold-out estimate.
with SuppressStdoutStderr():
    for train_idx, test_idx in splits.split(X_scaled):
        X_train, y_train = X_scaled[train_idx], y[train_idx]
        X_test, y_test = X_scaled[test_idx], y[test_idx]

        forecast = final_model.fit(X_train, y_train).predict(X_test)

        mae = mean_absolute_error(y_test, forecast)
        mape = calculate_mape(y_test, forecast)
        log_loss = calculate_log_loss(y_test, forecast)

        mae_scores.append(mae)
        mape_scores.append(mape)
        log_loss_scores.append(log_loss)

        forecasts.extend(forecast)
        forecast_indices.extend(test_idx)

# Unweighted mean of the per-fold scores.
average_mae, average_mape, average_log_loss = (
    np.mean(fold_scores)
    for fold_scores in (mae_scores, mape_scores, log_loss_scores)
)

print(f"Average MAE: {average_mae}")
print(f"Average MAPE: {average_mape}")
print(f"Average Log Loss: {average_log_loss}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000088 seconds.
...(suppressed 2661 lines)
Average MAE: 8.767671352496158
Average MAPE: 17.475629604825443
Average Log Loss: 0.050657289296926346
