# Notes

- Overall wmape hesapla, plant ve month dimensionı olmadan, hem cnn için hem lgb için

In [1]:
# !pip install lightgbm

In [2]:
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd

# suppress warnings
# NOTE(review): this installs a blanket "ignore" filter, so warnings raised by
# later cells (pandas, LightGBM, ...) are hidden too — consider narrowing it.
import warnings
warnings.filterwarnings("ignore")

In [41]:
# Load the processed dataset and keep only the timestamp, plant id, target and
# the weather columns whose names start with "UGRD" or "VGRD".
df = pd.read_parquet("../data/processed/outlier_removed.parquet")
weather_cols = [col for col in df.columns if col.startswith(("UGRD", "VGRD"))]
# .copy() makes df an independent frame: later cells assign into it in place
# (scaling, derived wind features, blanking test targets), which on a column
# selection without a copy is the pattern pandas flags as SettingWithCopy.
df = df[["forecast_dt", "rt_plant_id", "production", *weather_cols]].copy()

In [42]:
from sklearn.preprocessing import MinMaxScaler as scaler_

# Min-max scale the target and the weather columns separately for every plant.
# The fitted scaler of each plant is stored in `scalers`, keyed by plant id.
# NOTE(review): each scaler is fitted on ALL rows of its plant, including rows
# that later cells assign to the validation and test periods — fit on the
# training period only if that leakage matters for the reported scores.
scalers = {}
lower_bound = 1e-8  # scaled values are clipped into [lower_bound, 1 - lower_bound]
cols = ["production", *weather_cols]

for plant in df["rt_plant_id"].unique():
    plant_mask = df["rt_plant_id"] == plant  # computed once, reused for read and write
    scalers[plant] = scaler_()
    scaled = scalers[plant].fit_transform(df.loc[plant_mask, cols])
    df.loc[plant_mask, cols] = np.clip(scaled, lower_bound, 1 - lower_bound)

In [43]:
# Chronological split boundaries. The split cell uses them as half-open
# intervals: train is forecast_dt < TRAIN_END, validation and test are
# start <= forecast_dt < end.
TRAIN_END = pd.Timestamp("2020-10-01")
VALID_START = TRAIN_END
VALID_END = VALID_START + pd.Timedelta(days=92)  # 2021-01-01
TEST_START = VALID_END
TEST_END = TEST_START + pd.Timedelta(days=365)  # 2022-01-01

In [44]:
# Derive a magnitude ("speed") and a direction ("angle") feature for each of the
# four grid boxes from its UGRD/VGRD 80 m columns.
# NOTE(review): the UGRD/VGRD columns were already min-max scaled per plant in
# the scaling cell, so these are functions of the scaled components rather than
# of the raw wind components — confirm this ordering is intended.
for box in ["SW", "NW", "NE", "SE"]:
    u = df[f"UGRD_80.m.above.ground.{box}"]
    v = df[f"VGRD_80.m.above.ground.{box}"]
    df[f"speed_{box}"] = np.sqrt(np.square(u) + np.square(v))
    # arctan2 keeps the quadrant and is defined when v == 0, where arctan(u / v)
    # would divide by zero; for v > 0 both give the same angle.
    df[f"angle_{box}"] = np.arctan2(u, v)

In [45]:
# Order rows by plant and, within each plant, by forecast time.
df = df.sort_values(by=["rt_plant_id", "forecast_dt"])

# Boolean membership of every row in each chronological split
# (train: before TRAIN_END; validation / test: start <= t < end).
in_train = df["forecast_dt"] < TRAIN_END
in_valid = (df["forecast_dt"] >= VALID_START) & (df["forecast_dt"] < VALID_END)
in_test = (df["forecast_dt"] >= TEST_START) & (df["forecast_dt"] < TEST_END)

# Index labels of the rows belonging to each split.
train_indexes = df.loc[in_train].index
valid_indexes = df.loc[in_valid].index
test_indexes = df.loc[in_test].index

In [46]:
# Set the true test-period targets aside, then overwrite them with NaN in df
# (presumably so that no later step can read test-period production — confirm).
# NOTE(review): not idempotent — re-running this cell after it has already run
# yields an all-NaN y_test, because df was blanked by the first run.
y_test = df.loc[test_indexes, "production"].copy()
df.loc[test_indexes, "production"] = np.nan

In [47]:
# Materialise the three splits by label lookup on their index sets.
train_df, valid_df, test_df = (
    df.loc[split_index]
    for split_index in (train_indexes, valid_indexes, test_indexes)
)

In [48]:
# Sanity check: (rows, columns) of the train, validation and test frames.
print(train_df.shape, valid_df.shape, test_df.shape)

(1330290, 19) (198720, 19) (788400, 19)


In [49]:
# Columns that are not model inputs: the timestamp and the target itself.
non_feature_cols = ["forecast_dt", "production"]

# Training rows with any missing value are dropped; validation and test rows
# are kept unchanged.
train_df = train_df.dropna()
y_train = train_df["production"]
X_train = train_df.drop(columns=non_feature_cols)

y_valid = valid_df["production"]
X_valid = valid_df.drop(columns=non_feature_cols)

# y_test was set aside before the test targets were blanked in df, so only the
# feature frame is built here.
X_test = test_df.drop(columns=non_feature_cols)

# Encode categorical inputs with ONE dtype, taken from the training split, so a
# given category code refers to the same plant in every split. Values that do
# not occur in training become NaN in validation / test.
for col in ["rt_plant_id"]:
    X_train[col] = X_train[col].astype("category")
    shared_dtype = X_train[col].dtype
    X_valid[col] = X_valid[col].astype(shared_dtype)
    X_test[col] = X_test[col].astype(shared_dtype)

In [107]:
# Build the LightGBM datasets. The categorical column is declared on the
# training Dataset; the validation Dataset is tied to it through `reference`.
# (The previous params-level "categorical_feature" also listed "hour" and
# "dayofweek", which are not columns of X_train.)
lgb_train = lgb.Dataset(X_train, y_train, categorical_feature=["rt_plant_id"])
lgb_eval = lgb.Dataset(X_valid, y_valid, reference=lgb_train)

params = {
    "boosting_type": "gbdt",
    "objective": "regression",
    # NOTE(review): the objective is L2 regression while early stopping watches
    # cross_entropy, i.e. a different loss from the one being optimised —
    # confirm this is intended (alternatives: metric "l2", or objective
    # "cross_entropy" since the target is scaled into (0, 1)).
    "metric": "cross_entropy",
    "verbose": 1,
}

print('Starting training...')
# Train for at most 500 rounds, stopping early once the validation metric has
# not improved for 50 consecutive rounds.
gbm = lgb.train(
    params,
    lgb_train,
    num_boost_round=500,
    valid_sets=[lgb_eval],
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)


Starting training...
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 1330290.000000
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4171
[LightGBM] [Info] Number of data points in the train set: 1330290, number of used features: 17
[LightGBM] [Info] [cross_entropy:Init]: (metric) labels passed interval [0, 1] check
[LightGBM] [Info] [cross_entropy:Init]: sum-of-weights = 198720.000000
[LightGBM] [Info] Start training from score 0.354652
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[494]	valid_0's cross_entropy: 0.409379


In [108]:
from sklearn.metrics import mean_squared_error

print('Starting predicting...')
# Score the test features using the best iteration recorded during training.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)

# Root mean squared error against the held-out (scaled) test targets.
mse_test = mean_squared_error(y_test, y_pred)
rmse_test = mse_test ** 0.5
print(f'The RMSE of prediction is: {rmse_test}')

Starting predicting...
The RMSE of prediction is: 0.1852403197085925


In [109]:
import numpy as np

def calculate_wmape(preds, actuals):
    """Return the weighted mean absolute percentage error of ``preds``.

    Computed over all elements as ``sum(|preds - actuals|) / sum(|actuals|)``.

    Both inputs are converted to NumPy float arrays first, so elements are
    paired by position. Without the conversion, two pandas objects with
    different indexes are aligned by label, the unmatched pairs become NaN and
    are silently skipped by the pandas sum, giving a wrong score.

    Args:
        preds: Array-like of predicted values.
        actuals: Array-like of true values, in the same order as ``preds``.

    Returns:
        The WMAPE as a float. NaN in either input propagates to the result;
        if ``sum(|actuals|)`` is zero the result is ``nan`` or ``inf``.
    """
    preds = np.asarray(preds, dtype=float)
    actuals = np.asarray(actuals, dtype=float)
    return np.sum(np.abs(preds - actuals)) / np.sum(np.abs(actuals))

def calculate_bias(preds, actuals):
    """Return the relative bias of ``preds``: ``sum(preds - actuals) / sum(actuals)``.

    A positive value means the predictions sum to more than the actuals, a
    negative value means they sum to less.

    Both inputs are converted to NumPy float arrays first, so elements are
    paired by position. Without the conversion, two pandas objects with
    different indexes are aligned by label, the unmatched pairs become NaN and
    are silently skipped by the pandas sum, giving a wrong score.

    Args:
        preds: Array-like of predicted values.
        actuals: Array-like of true values, in the same order as ``preds``.

    Returns:
        The bias as a float. NaN in either input propagates to the result;
        if ``sum(actuals)`` is zero the result is ``nan`` or ``inf``.
    """
    preds = np.asarray(preds, dtype=float)
    actuals = np.asarray(actuals, dtype=float)
    return np.sum(preds - actuals) / np.sum(actuals)

In [110]:
# WMAPE over the whole test set in one call (all rows pooled), on scaled values.
calculate_wmape(y_pred, y_test)

0.34256397723929133

In [111]:
# Relative bias over the whole test set in one call (all rows pooled), on scaled values.
calculate_bias(y_pred, y_test)

-0.040827540264828675

In [112]:
# Prediction table for the test period: timestamp and plant id next to the
# model output and the held-out target. y_pred is attached by position,
# y_test (a Series) by index label.
output_df = test_df[["forecast_dt", "rt_plant_id"]].assign(
    predictions=y_pred,
    actuals=y_test,
)

In [113]:
# Export the prediction table as CSV using a path relative to the notebook, in
# the same style as the input path, instead of a user-specific absolute path.
# NOTE(review): assumes "../data" is the directory previously addressed as
# /home/mert/Desktop/thesis/data — confirm.
output_path = Path("../data/out/lightgbm_output.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)  # create the folder on first run
output_df.to_csv(output_path, index=False)

In [114]:
# Display the prediction table (the rendered output abbreviates the middle rows).
output_df

Unnamed: 0,forecast_dt,rt_plant_id,predictions,actuals
16989,2021-01-01 00:00:00,672,0.901268,0.965714
16990,2021-01-01 01:00:00,672,0.893004,0.404571
16991,2021-01-01 02:00:00,672,0.894403,0.334857
16992,2021-01-01 03:00:00,672,0.873337,0.437714
16993,2021-01-01 04:00:00,672,0.755697,0.426286
...,...,...,...,...
2480665,2021-12-31 19:00:00,2374,0.188697,0.025000
2480666,2021-12-31 20:00:00,2374,0.241482,0.130000
2480667,2021-12-31 21:00:00,2374,0.267678,0.280000
2480668,2021-12-31 22:00:00,2374,0.299181,0.310000
