<center> <h1>EnerjiSA Datathon</h1> </center>
<!-- <img src="https://upload.wikimedia.org/wikipedia/commons/b/b8/Photovoltaik_Dachanlage_Hannover_-_Schwarze_Heide_-_1_MW.jpg" style="margin: 0 auto; width: 100%; max-width: 950px; max-height: 300px" frameborder="0" scrolling="auto"/> -->

#### Previous notebook: [EnerjiSA Time Series Analysis Starter](https://www.kaggle.com/sarperyurttas/enerjisa-time-series-analysis-starter)

<a id="Import Libraries"></a>
<center> <h1>Import Libraries</h1> </center>

In [1]:
# !pip install flaml
# !pip install lightgbm
# !pip install flaml

In [2]:
from sklearn.metrics import mean_squared_error
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from flaml import AutoML
import optuna
import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.float_format', lambda x: '%.8f' % x)

<a id="Data Preprocessing"></a>
<center> <h1>Data Preprocessing</h1> </center>


In [3]:
# read csv files
gen = pd.read_csv('generation.csv', delimiter=';', decimal=',')
temp = pd.read_csv('temperature.csv', delimiter=';', decimal=',')

In [4]:
# slicing frames due to deficiencies in data
df_train = pd.concat([gen.iloc[:25560], temp.iloc[:25560].drop('DateTime', axis=1)], axis=1)

In [5]:
def preprocess(df):
    df['DateTime'] = pd.to_datetime(df['DateTime'])
    df = df.set_index('DateTime')
    df['WWCode'] = df['WWCode'].fillna(0).astype(int)
    # Weather codes filled with "0" because this category means "Cloud development not observed or not observable" reference: https://www.jodc.go.jp/data_format/weather-code.html
    return df

In [6]:
df_test = temp.iloc[25560:26304] # dates between 2021-12-1 and 2021-12-31
df_train = preprocess(df_train)
df_test = preprocess(df_test)

In [7]:
# category 84 samples changed to 83 because category 84 is not found in train set and 83 is the nearest category to 84
df_test.loc[df_test['WWCode'] == 84, 'WWCode'] = 83 

<a id="Preparing Data For Modelling"></a>
<center> <h1>Preparing Data For Modelling</h1> </center>


In [8]:
def create_features(df, label=None):
    """
    Creates time series features from datetime index
    """
    df_copy = df.copy()
    df_copy['date'] = df_copy.index
    df_copy['hour'] = df_copy['date'].dt.hour
    df_copy['dayofweek'] = df_copy['date'].dt.dayofweek
    df_copy['quarter'] = df_copy['date'].dt.quarter
    df_copy['month'] = df_copy['date'].dt.month
    df_copy['year'] = df_copy['date'].dt.year
    df_copy['dayofyear'] = df_copy['date'].dt.dayofyear
    df_copy['dayofmonth'] = df_copy['date'].dt.day
    df_copy['weekofyear'] = df_copy['date'].dt.weekofyear
    
    X = df_copy[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    
    if label:
        y = df_copy[label]
        X = pd.concat([df.drop(label, axis=1), X], axis=1)
        return X, y
    else:
        X = pd.concat([df, X], axis=1)
        return X

In [9]:
def split_train(df, split_date):
    train = df.loc[df.index <= split_date].copy()
    val = df.loc[df.index > split_date].copy()
    return train, val

In [10]:
split_date = '2021-05-01'
df_train, df_val = split_train(df_train, split_date)

In [11]:
x_train, y_train = create_features(df_train, label='Generation')
x_val, y_val = create_features(df_val, label='Generation')
x_test = create_features(df_test)

In [26]:
x_train

Unnamed: 0_level_0,AirTemperature,ComfortTemperature,RelativeHumidity,WindSpeed,WindDirection,WWCode,EffectiveCloudCover,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2019-01-01 00:00:00,-1.70000000,-6.10000000,75.30000000,3.60000000,60.00000000,10,6.30000000,0,1,1,1,2019,1,1,1
2019-01-01 01:00:00,-1.80000000,-5.30000000,75.30000000,2.60000000,70.00000000,10,4.50000000,1,1,1,1,2019,1,1,1
2019-01-01 02:00:00,-2.00000000,-6.00000000,74.70000000,3.10000000,80.00000000,10,5.50000000,2,1,1,1,2019,1,1,1
2019-01-01 03:00:00,-1.90000000,-5.90000000,76.40000000,3.10000000,60.00000000,10,7.60000000,3,1,1,1,2019,1,1,1
2019-01-01 04:00:00,-2.00000000,-7.10000000,76.40000000,4.60000000,60.00000000,10,6.50000000,4,1,1,1,2019,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-04-30 20:00:00,22.60000000,20.70000000,20.00000000,1.00000000,30.00000000,3,3.00000000,20,4,2,4,2021,120,30,17
2021-04-30 21:00:00,21.70000000,20.10000000,23.10000000,1.50000000,20.00000000,4,4.60000000,21,4,2,4,2021,120,30,17
2021-04-30 22:00:00,20.60000000,19.20000000,25.80000000,0.50000000,20.00000000,3,2.50000000,22,4,2,4,2021,120,30,17
2021-04-30 23:00:00,19.40000000,18.20000000,29.70000000,1.50000000,20.00000000,3,2.50000000,23,4,2,4,2021,120,30,17


<a id="Hyperparameter Tuning"></a>
<center> <h1>Hyperparameter Tuning</h1> </center>


In [12]:
datasets = {'x_train': x_train,
            'y_train': y_train,
            'x_val': x_val,
            'y_val': y_val,
            'x_test': x_test
            }

In [13]:
def pred_model(model, **datasets):
    pred_val = model.predict(datasets['x_val'])
    pred_val = pd.DataFrame(pred_val, index=datasets['x_val'].index, columns=['pred'])
    
    pred_test = model.predict(datasets['x_test'])

    return pred_test, pred_val

<a id="FLAML"></a>
<center> <h1>FLAML</h1> </center>


In [14]:
# automl = AutoML()
# settings = {
#     "time_budget": 300,  
#     "metric": 'rmse',
#     "estimator_list": ['lgbm'],
#     "task": 'regression',
# }
# automl.fit(X_train=datasets['x_train'], y_train=datasets['y_train'], **settings)

In [15]:
# model = LGBMRegressor(**automl.best_config)    
# model.fit(datasets['x_train'], datasets['y_train'])

In [16]:
# pred_flaml, pred_val_flaml = pred_model(model, **datasets)

<a id="OPTUNA"></a>
<center> <h1>OPTUNA</h1> </center>


In [17]:
def objective(trial):
    learning_rate = trial.suggest_float("learning_rate", 1e-2, 0.25, log=True)
    reg_lambda = trial.suggest_loguniform("reg_lambda", 1e-8, 100.0)
    reg_alpha = trial.suggest_loguniform("reg_alpha", 1e-8, 100.0)
    subsample = trial.suggest_float("subsample", 0.1, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.1, 1.0)
    max_depth = trial.suggest_int("max_depth", 1, 7)
    early_stopping_rounds = trial.suggest_int("early_stopping_rounds", 100, 500)
    n_estimators = trial.suggest_int("n_estimators", 0, 10000)
    
    model = XGBRegressor(
        random_state=50,
        tree_method="gpu_hist",
        gpu_id=0,
        predictor="gpu_predictor",
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        reg_lambda=reg_lambda,
        reg_alpha=reg_alpha,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        max_depth=max_depth,
    )
    model.fit(
        datasets['x_train'],
        datasets['y_train'],
        early_stopping_rounds=early_stopping_rounds,
        eval_set=[(datasets['x_val'], datasets['y_val'])],
        verbose=1200,
    )
    preds_valid = model.predict(datasets['x_val'])
    rmse = mean_squared_error(datasets['y_val'], preds_valid, squared=False)

    return rmse

In [18]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)

[32m[I 2022-02-28 15:42:54,334][0m A new study created in memory with name: no-name-5eef5e8e-cb30-4be1-a8fa-3be4a3a0533a[0m


[0]	validation_0-rmse:147.46828
[472]	validation_0-rmse:30.89779


[32m[I 2022-02-28 15:43:04,555][0m Trial 0 finished with value: 24.217572818263186 and parameters: {'learning_rate': 0.22587671844591384, 'reg_lambda': 0.017479591267211245, 'reg_alpha': 1.732781477562603e-08, 'subsample': 0.2090063729622258, 'colsample_bytree': 0.9351726599072918, 'max_depth': 7, 'early_stopping_rounds': 450, 'n_estimators': 6536}. Best is trial 0 with value: 24.217572818263186.[0m


[0]	validation_0-rmse:160.01637
[759]	validation_0-rmse:24.31786


[32m[I 2022-02-28 15:43:10,696][0m Trial 1 finished with value: 22.965385793343703 and parameters: {'learning_rate': 0.15193594102027522, 'reg_lambda': 9.121957141067638e-08, 'reg_alpha': 1.6643812772351702e-07, 'subsample': 0.6855411571737606, 'colsample_bytree': 0.8064099912277553, 'max_depth': 5, 'early_stopping_rounds': 489, 'n_estimators': 2770}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:179.78203
[1200]	validation_0-rmse:52.79669
[2400]	validation_0-rmse:51.06072
[2468]	validation_0-rmse:51.02319


[32m[I 2022-02-28 15:43:27,494][0m Trial 2 finished with value: 50.98950092980446 and parameters: {'learning_rate': 0.056969695171508176, 'reg_lambda': 4.6244924852471064e-07, 'reg_alpha': 3.579038715710512e-05, 'subsample': 0.1460472395159377, 'colsample_bytree': 0.11021829450175498, 'max_depth': 6, 'early_stopping_rounds': 156, 'n_estimators': 3269}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:178.20697
[1200]	validation_0-rmse:50.92524
[2400]	validation_0-rmse:49.77357
[3440]	validation_0-rmse:49.45178


[32m[I 2022-02-28 15:43:38,252][0m Trial 3 finished with value: 49.415222977354105 and parameters: {'learning_rate': 0.059287979957009296, 'reg_lambda': 1.9099373574489802e-07, 'reg_alpha': 0.6318158506912479, 'subsample': 0.21543679029887108, 'colsample_bytree': 0.5691750353116296, 'max_depth': 1, 'early_stopping_rounds': 275, 'n_estimators': 3441}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:165.89967
[517]	validation_0-rmse:24.47833


[32m[I 2022-02-28 15:43:44,676][0m Trial 4 finished with value: 23.376315764259562 and parameters: {'learning_rate': 0.10997143199013552, 'reg_lambda': 5.215710067243677e-05, 'reg_alpha': 0.49474855519012206, 'subsample': 0.2770890778152961, 'colsample_bytree': 0.8669245226289175, 'max_depth': 6, 'early_stopping_rounds': 444, 'n_estimators': 4786}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:179.46050
[1200]	validation_0-rmse:27.15634
[2400]	validation_0-rmse:26.37309
[2822]	validation_0-rmse:26.48964


[32m[I 2022-02-28 15:44:16,250][0m Trial 5 finished with value: 26.316834927232605 and parameters: {'learning_rate': 0.03930516640832061, 'reg_lambda': 0.001224214539192457, 'reg_alpha': 0.2223837547302893, 'subsample': 0.17830558782766545, 'colsample_bytree': 0.4141295198297714, 'max_depth': 6, 'early_stopping_rounds': 237, 'n_estimators': 7526}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:180.98619
[1200]	validation_0-rmse:53.94558
[2400]	validation_0-rmse:50.79290
[3600]	validation_0-rmse:49.98020
[4800]	validation_0-rmse:49.63901
[6000]	validation_0-rmse:49.38297
[7200]	validation_0-rmse:49.21806
[7919]	validation_0-rmse:49.21380


[32m[I 2022-02-28 15:44:40,233][0m Trial 6 finished with value: 49.16366708860275 and parameters: {'learning_rate': 0.02114896256898883, 'reg_lambda': 1.1165848010220767e-06, 'reg_alpha': 0.0001338223154663122, 'subsample': 0.1667114638885322, 'colsample_bytree': 0.9140401121139247, 'max_depth': 1, 'early_stopping_rounds': 417, 'n_estimators': 8185}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:176.27589
[1200]	validation_0-rmse:23.92870
[2400]	validation_0-rmse:23.34319
[3216]	validation_0-rmse:23.31172


[32m[I 2022-02-28 15:44:57,119][0m Trial 7 finished with value: 23.284707800171418 and parameters: {'learning_rate': 0.10032916044627102, 'reg_lambda': 48.02521359200534, 'reg_alpha': 4.3444465517418625, 'subsample': 0.744953351267749, 'colsample_bytree': 0.3017010624495227, 'max_depth': 4, 'early_stopping_rounds': 455, 'n_estimators': 7738}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:170.00957
[1200]	validation_0-rmse:27.68277
[2343]	validation_0-rmse:26.26109


[32m[I 2022-02-28 15:45:05,482][0m Trial 8 finished with value: 25.854997240524916 and parameters: {'learning_rate': 0.10711171734208574, 'reg_lambda': 0.05406330148613398, 'reg_alpha': 1.293570958785122e-08, 'subsample': 0.4137831124587127, 'colsample_bytree': 0.684342352026128, 'max_depth': 2, 'early_stopping_rounds': 109, 'n_estimators': 6335}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:181.22414
[797]	validation_0-rmse:45.56702


[32m[I 2022-02-28 15:45:10,932][0m Trial 9 finished with value: 45.56702304808671 and parameters: {'learning_rate': 0.02514398111836948, 'reg_lambda': 2.2663907346155945, 'reg_alpha': 6.462938608167475e-06, 'subsample': 0.21726232752709906, 'colsample_bytree': 0.16597650644527817, 'max_depth': 5, 'early_stopping_rounds': 302, 'n_estimators': 798}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:152.04450
[189]	validation_0-rmse:24.94793


[32m[I 2022-02-28 15:45:11,860][0m Trial 10 finished with value: 24.94091764884884 and parameters: {'learning_rate': 0.23673095993246251, 'reg_lambda': 1.441602399284892e-08, 'reg_alpha': 6.633326155310028e-07, 'subsample': 0.9745865065099406, 'colsample_bytree': 0.7289169985974981, 'max_depth': 3, 'early_stopping_rounds': 366, 'n_estimators': 190}. Best is trial 1 with value: 22.965385793343703.[0m


[0]	validation_0-rmse:176.55116
[1200]	validation_0-rmse:23.36101
[2400]	validation_0-rmse:22.90133
[2948]	validation_0-rmse:22.92269


[32m[I 2022-02-28 15:45:27,484][0m Trial 11 finished with value: 22.871168926064975 and parameters: {'learning_rate': 0.12318487106348765, 'reg_lambda': 45.92186832010407, 'reg_alpha': 62.284836776056494, 'subsample': 0.7410240828409949, 'colsample_bytree': 0.34147849111208917, 'max_depth': 4, 'early_stopping_rounds': 494, 'n_estimators': 9502}. Best is trial 11 with value: 22.871168926064975.[0m


[0]	validation_0-rmse:175.14905
[1200]	validation_0-rmse:23.80333
[2032]	validation_0-rmse:23.94899


[32m[I 2022-02-28 15:45:38,669][0m Trial 12 finished with value: 23.753483940464346 and parameters: {'learning_rate': 0.1497138660878922, 'reg_lambda': 4.474120673071479e-05, 'reg_alpha': 81.92124616217114, 'subsample': 0.6789541549039757, 'colsample_bytree': 0.4254334232936675, 'max_depth': 4, 'early_stopping_rounds': 486, 'n_estimators': 2108}. Best is trial 11 with value: 22.871168926064975.[0m


[0]	validation_0-rmse:181.02789
[1200]	validation_0-rmse:28.49610
[2400]	validation_0-rmse:24.83204
[3600]	validation_0-rmse:23.69330
[4800]	validation_0-rmse:23.19558
[6000]	validation_0-rmse:22.96908
[7200]	validation_0-rmse:22.76306
[8400]	validation_0-rmse:22.70550
[9219]	validation_0-rmse:22.67560


[32m[I 2022-02-28 15:46:18,577][0m Trial 13 finished with value: 22.673224188000535 and parameters: {'learning_rate': 0.011415367681056826, 'reg_lambda': 0.8078672295819468, 'reg_alpha': 0.0026055083430921344, 'subsample': 0.8091275924233325, 'colsample_bytree': 0.7199608834465959, 'max_depth': 3, 'early_stopping_rounds': 371, 'n_estimators': 9904}. Best is trial 13 with value: 22.673224188000535.[0m


[0]	validation_0-rmse:181.35417
[1200]	validation_0-rmse:28.52960
[2400]	validation_0-rmse:24.76418
[3600]	validation_0-rmse:23.54095
[4800]	validation_0-rmse:23.10358
[6000]	validation_0-rmse:22.89749
[7200]	validation_0-rmse:22.76344
[8400]	validation_0-rmse:22.69391
[8921]	validation_0-rmse:22.69447


[32m[I 2022-02-28 15:46:56,949][0m Trial 14 finished with value: 22.680871928478595 and parameters: {'learning_rate': 0.01721582612831278, 'reg_lambda': 35.377477290981474, 'reg_alpha': 0.006946191389481319, 'subsample': 0.8938857585311862, 'colsample_bytree': 0.5635411808333849, 'max_depth': 3, 'early_stopping_rounds': 363, 'n_estimators': 9390}. Best is trial 13 with value: 22.673224188000535.[0m


[0]	validation_0-rmse:181.67635
[1200]	validation_0-rmse:30.76498
[2400]	validation_0-rmse:26.46949
[3600]	validation_0-rmse:24.75798
[4800]	validation_0-rmse:23.96723
[6000]	validation_0-rmse:23.54353
[7200]	validation_0-rmse:23.30375
[8400]	validation_0-rmse:23.08615
[9600]	validation_0-rmse:22.92078
[9974]	validation_0-rmse:22.90553


[32m[I 2022-02-28 15:47:39,532][0m Trial 15 finished with value: 22.904720391854973 and parameters: {'learning_rate': 0.011705149081625494, 'reg_lambda': 0.5778805167678509, 'reg_alpha': 0.0064799602229673545, 'subsample': 0.9913792584232037, 'colsample_bytree': 0.5582960442879776, 'max_depth': 3, 'early_stopping_rounds': 379, 'n_estimators': 9975}. Best is trial 13 with value: 22.673224188000535.[0m


[0]	validation_0-rmse:181.80049
[1200]	validation_0-rmse:44.05912
[2400]	validation_0-rmse:37.47046
[3600]	validation_0-rmse:34.63920
[4800]	validation_0-rmse:32.67310
[6000]	validation_0-rmse:31.23509
[7200]	validation_0-rmse:30.14817
[8400]	validation_0-rmse:29.29256
[8891]	validation_0-rmse:28.99538


[32m[I 2022-02-28 15:48:11,442][0m Trial 16 finished with value: 28.995377803902386 and parameters: {'learning_rate': 0.01072179131775498, 'reg_lambda': 2.5132085399649777, 'reg_alpha': 0.004599155673936436, 'subsample': 0.8666944356207433, 'colsample_bytree': 0.651685563298987, 'max_depth': 2, 'early_stopping_rounds': 319, 'n_estimators': 8892}. Best is trial 13 with value: 22.673224188000535.[0m


[0]	validation_0-rmse:180.09255
[1200]	validation_0-rmse:25.11309
[2400]	validation_0-rmse:23.82154
[3600]	validation_0-rmse:23.35602
[4570]	validation_0-rmse:23.28654


[32m[I 2022-02-28 15:48:31,431][0m Trial 17 finished with value: 23.175024554293515 and parameters: {'learning_rate': 0.01860311812150148, 'reg_lambda': 0.08801344464861816, 'reg_alpha': 0.0223530664103807, 'subsample': 0.5041628757747523, 'colsample_bytree': 0.7740378411787596, 'max_depth': 3, 'early_stopping_rounds': 365, 'n_estimators': 6367}. Best is trial 13 with value: 22.673224188000535.[0m


[0]	validation_0-rmse:181.42940
[1200]	validation_0-rmse:40.09516
[2400]	validation_0-rmse:34.42894
[3600]	validation_0-rmse:31.70663
[4800]	validation_0-rmse:29.94456
[4966]	validation_0-rmse:29.75515


[32m[I 2022-02-28 15:48:48,979][0m Trial 18 finished with value: 29.75514350344508 and parameters: {'learning_rate': 0.016450202765062327, 'reg_lambda': 9.167864732336193, 'reg_alpha': 0.0009694360670785971, 'subsample': 0.8660994021575108, 'colsample_bytree': 0.6402085836740057, 'max_depth': 2, 'early_stopping_rounds': 218, 'n_estimators': 4967}. Best is trial 13 with value: 22.673224188000535.[0m


[0]	validation_0-rmse:180.99124
[1200]	validation_0-rmse:25.77754
[2400]	validation_0-rmse:23.92089
[3600]	validation_0-rmse:23.36092
[4046]	validation_0-rmse:23.39931


[32m[I 2022-02-28 15:49:05,391][0m Trial 19 finished with value: 23.33383953217145 and parameters: {'learning_rate': 0.03282882081481022, 'reg_lambda': 0.0032457965780593096, 'reg_alpha': 0.00034031746285960175, 'subsample': 0.8607609968151619, 'colsample_bytree': 0.4723728931705984, 'max_depth': 3, 'early_stopping_rounds': 339, 'n_estimators': 8596}. Best is trial 13 with value: 22.673224188000535.[0m


In [19]:
optuna_params = study.best_params
optuna_params

{'learning_rate': 0.011415367681056826,
 'reg_lambda': 0.8078672295819468,
 'reg_alpha': 0.0026055083430921344,
 'subsample': 0.8091275924233325,
 'colsample_bytree': 0.7199608834465959,
 'max_depth': 3,
 'early_stopping_rounds': 371,
 'n_estimators': 9904}

In [20]:
model = XGBRegressor(
    random_state=50,
    tree_method="gpu_hist",
    gpu_id=0,
    predictor="gpu_predictor",
    **optuna_params
)
model.fit(
    datasets['x_train'],
    datasets['y_train'],
    early_stopping_rounds=optuna_params['early_stopping_rounds'],
    eval_set=[(datasets['x_val'], datasets['y_val'])],
    verbose=1000,
)

Parameters: { "early_stopping_rounds" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	validation_0-rmse:181.02789
[1000]	validation_0-rmse:29.66521
[2000]	validation_0-rmse:25.59036
[3000]	validation_0-rmse:24.11302
[4000]	validation_0-rmse:23.48376
[5000]	validation_0-rmse:23.16133
[6000]	validation_0-rmse:22.96908
[7000]	validation_0-rmse:22.80327
[8000]	validation_0-rmse:22.72942
[9000]	validation_0-rmse:22.69370
[9219]	validation_0-rmse:22.67560


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7199608834465959,
             early_stopping_rounds=371, enable_categorical=False, gamma=0,
             gpu_id=0, importance_type=None, interaction_constraints='',
             learning_rate=0.011415367681056826, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=9904, n_jobs=8, num_parallel_tree=1,
             predictor='gpu_predictor', random_state=50,
             reg_alpha=0.0026055083430921344, reg_lambda=0.8078672295819468,
             scale_pos_weight=1, subsample=0.8091275924233325,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

In [21]:
pred_optuna, pred_val_optuna = pred_model(model, **datasets)

#### Simple trick: Negative predicted and night values are clipped to 0

In [22]:
def clip_nights(pred, test = False):
    nights = [21, 22 , 23 , 0, 1, 2, 3, 4]
    if not test:
        for time, row in pred.iterrows():
            if time.hour in nights or row['pred'] < 0:
                row['pred'] = 0
        return pred
    else:
        for i, hour in enumerate(pd.to_datetime(pred['DateTime'])):
            if hour.hour in nights or pred.loc[i, 'Generation'] < 0:
                pred.loc[i, 'Generation'] = 0
        return pred

In [23]:
# pred_val_flaml = clip_nights(pred_val_flaml)
pred_val_optuna = clip_nights(pred_val_optuna)

In [24]:
# rmse_flaml = mean_squared_error(datasets['y_val'], pred_val_flaml, squared=False)
rmse_optuna = mean_squared_error(datasets['y_val'], pred_val_optuna, squared=False)
# print('Flaml RMSE: ', rmse_flaml)
print('Optuna RMSE: ', rmse_optuna)

Optuna RMSE:  22.063721892469616


In [27]:
pred_val_optuna

Unnamed: 0_level_0,pred
DateTime,Unnamed: 1_level_1
2021-05-01 01:00:00,0.00000000
2021-05-01 02:00:00,0.00000000
2021-05-01 03:00:00,0.00000000
2021-05-01 04:00:00,0.00000000
2021-05-01 05:00:00,5.74300146
...,...
2021-11-30 19:00:00,7.65563250
2021-11-30 20:00:00,8.55105495
2021-11-30 21:00:00,0.00000000
2021-11-30 22:00:00,0.00000000


#### Comparing predictions

In [25]:
plt.figure(figsize=(50,10))
plt.plot(pred_val_flaml)
plt.plot(pred_val_optuna)
plt.plot(y_val)
plt.legend(['Flaml','Optuna', 'Desired'])
plt.show()

NameError: name 'pred_val_flaml' is not defined

<Figure size 3600x720 with 0 Axes>

In [None]:
submission = pd.read_csv('../input/enerjisa-enerji-veri-maratonu/sample_submission.csv')
submission['Generation'] = pred_flaml

In [None]:
submission2 = pd.read_csv('../input/enerjisa-enerji-veri-maratonu/sample_submission.csv')
submission2['Generation'] = pred_optuna

In [None]:
submission = clip_nights(submission, test=True)
submission2 = clip_nights(submission2, test=True)

In [None]:
plt.figure(figsize=(31,10))
plt.plot(submission['Generation'])
plt.plot(submission2['Generation'])
plt.show()

In [None]:
submission.to_csv('submission.csv', index=False)
submission2.to_csv('submission2.csv', index=False)