In [1]:
import warnings
import sys
sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearnex import patch_sklearn, config_context
# Enable the Intel extension for scikit-learn; this call is what emits the
# "Intel(R) Extension for Scikit-learn* enabled" banner in the cell output.
# It runs before the scikit-learn imports in the next cell.
patch_sklearn()
# NOTE(review): this silences every warning for the rest of the session
# (deprecations, pandas chained-assignment warnings, ...), not just noisy ones.
warnings.filterwarnings('ignore')

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score

# custom functions
import config
from metric import NMAE
from data_processor import WindTransformer, UVTransformer, FeatureTransformer

# model import
import xgboost as xgb
from xgboost import XGBRegressor

# logging
import mlflow
import mlflow.sklearn

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [3]:
def _to_seoul_time(frame):
    """Return a copy of ``frame`` whose ``dt`` column is tz-aware in Asia/Seoul.

    Timestamps without a timezone are localized to Asia/Seoul; timestamps that
    already carry a timezone are converted to it.
    """
    stamps = pd.to_datetime(frame['dt'])
    # Explicit tz check instead of try/except TypeError, so an unrelated
    # TypeError is not silently retried down the localize path.
    if stamps.dt.tz is None:
        stamps = stamps.dt.tz_localize("Asia/Seoul")
    else:
        stamps = stamps.dt.tz_convert("Asia/Seoul")
    # assign() builds a new frame, so a sliced input is never written into.
    return frame.assign(dt=stamps)


def get_data():
    """Load the SCADA power report, the target table and the LDAPS forecasts.

    All three frames get a ``dt`` column normalized to Asia/Seoul time.
    Shapes (and, after filtering, the ``dt`` range) are printed for each frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(power, train_y, ldaps)`` where

        * ``power``   - the 10-minute report with 'Date/Time' -> 'dt' and
          'WTG.Name' -> 'turbine_id', minus its last three rows;
        * ``train_y`` - rows for plant "경주풍력" with
          2020-01-01 <= dt < 2022-01-01 ('end_datetime' renamed to 'dt');
        * ``ldaps``   - rows with 2020-01-01 <= dt < 2022-01-01.
    """
    power = (pd.read_parquet(config.input_path + "dynamic_report_ewp02_2020_10min.parquet")
             .rename({'Date/Time': 'dt', 'WTG.Name': 'turbine_id'}, axis=1))
    train_y = (pd.read_parquet(config.input_path + "train_y.parquet")
               .rename({'end_datetime': 'dt'}, axis=1))
    ldaps = pd.read_parquet(config.input_path + "train_ldaps_gyeongju.parquet")

    print("Power: ", power.shape)
    print("train_y: ", train_y.shape)
    print("LDAPS: ", ldaps.shape)

    # Drop the last three rows of the power report.
    # NOTE(review): the reason is not visible here - presumably trailing rows
    # that fall outside 2020 or are not measurements; confirm against the file.
    power = power[:-3]

    # Normalize every frame's timestamps to the same timezone.
    power = _to_seoul_time(power)
    train_y = _to_seoul_time(train_y)
    ldaps = _to_seoul_time(ldaps)

    # Half-open window [start, end): the end date itself is excluded.
    start, end = '2020-01-01', '2022-01-01'

    is_target_plant = train_y['plant_name'] == "경주풍력"
    in_window = train_y['dt'].between(start, end, inclusive='left')
    train_y = train_y.loc[is_target_plant & in_window]

    ldaps = ldaps.loc[ldaps['dt'].between(start, end, inclusive='left')]

    print("Power: ", power.shape, power['dt'].min(), power['dt'].max())
    print("train_y: ", train_y.shape, train_y['dt'].min(), train_y['dt'].max())
    print("LDAPS: ", ldaps.shape, ldaps['dt'].min(), ldaps['dt'].max())

    return power, train_y, ldaps

In [4]:
# Echo the config values for this run, one per line.
for setting_name in ('mlflow', 'test_size', 'target'):
    print(getattr(config, setting_name))

False
0.2
energy_kwh


In [5]:
if config.mlflow:
    # TODO: the MLflow-tracked run is not implemented yet, so with
    # config.mlflow=True this cell does nothing. Sketch of the intended flow:
    #
    # # create experiment
    # exp = mlflow.set_experiment("windpower_experiment")

    # # start MLflow run
    # with mlflow.start_run(experiment_id=exp.experiment_id):

    #     # Train model
    #     model = xgb.XGBRegressor()

    #     # predict
    #     y_pred = 0

    #     # Log model hyperparameters (log_params takes the dict itself, not **kwargs)
    #     mlflow.log_params(config.xgb_params)

    #     # Log performance metrics
    #     mlflow.log_metrics({
    #         "R2 score": r2_score(y_test, y_pred),
    #         "NMAE": NMAE(y_test, y_pred)
    #     })

    #     # Log model
    #     mlflow.sklearn.log_model(model, "xgb")
    pass
else:
    # get data
    scada, train_y, ldaps = get_data()

    # build data pipeline
    print('-' * 50)
    print('Feature Engineering')
    DataPipeline = Pipeline([
        ('uv_transform', UVTransformer('wind_u_10m', 'wind_v_10m')),
        # NOTE(review): 10 and 100 are presumably the source/target heights in
        # metres, and the roughness mean is taken over the full 2020-2021 frame
        # (train and test years together) - confirm against data_processor.
        ('wind_transform', WindTransformer('wind_speed', 10, 100, ldaps['surf_rough'].mean())),
        ('feature engineering', FeatureTransformer())
    ])

    # data transform
    ldaps = DataPipeline.fit_transform(ldaps)

    # Alternative target kept for reference: per-turbine SCADA energy.
    # tmp = pd.merge(ldaps[ldaps['turbine_id'] == 'WTG01'], scada[['dt', 'EnergyProductionActiveEnergyProduction[KWh]']],
    #            on = ['dt'])
    # tmp.columns = tmp.columns.str.replace("[", "_").str.replace("]", "")

    # Join the WTG01 forecast features with the target on the timestamp.
    # The target column comes from config so the merge and the split below
    # cannot drift apart; the join key is explicit rather than inferred.
    target = config.target
    tmp = pd.merge(ldaps[ldaps['turbine_id'] == 'WTG01'], train_y[['dt', target]], on='dt')

    # Split train valid
    print('-' * 50)
    print("Train Test Split")
    tmp = tmp.drop(['turbine_id'], axis=1)
    # tmp = tmp.drop(['turbine_id', 'wind_speed'], axis=1)

    # Date-based split, one calendar year each; each mask is computed once.
    is_train = tmp['dt'].between('2020-01-01', '2021-01-01', inclusive='left')
    is_test = tmp['dt'].between('2021-01-01', '2022-01-01', inclusive='left')

    features = tmp.drop(['dt', target], axis=1)
    x_train = features.loc[is_train]
    x_test = features.loc[is_test]

    # NOTE(review): the original carried a commented-out `/ 9` on y_train -
    # presumably a per-turbine scaling experiment; confirm before reviving it.
    y_train = tmp.loc[is_train, target]
    y_test = tmp.loc[is_test, target]

    print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

    # Train model
    print('-' * 50)
    print("Train Model")
    # Bound to `model`, not `xgb`, so the `import xgboost as xgb` module alias
    # stays usable after this cell has run.
    model = XGBRegressor(**config.xgb_params)
    model.fit(x_train, y_train)

    # predict
    y_pred = model.predict(x_test)

    # Scoring
    print("NMAE: ", NMAE(y_test, y_pred))

Power:  (52592, 29)
train_y:  (52608, 4)
LDAPS:  (235818, 15)
Power:  (52589, 29) 2020-01-01 00:00:00+09:00 2020-12-31 23:50:00+09:00
train_y:  (17543, 4) 2020-01-01 01:00:00+09:00 2021-12-31 23:00:00+09:00
LDAPS:  (157671, 15) 2020-01-02 00:00:00+09:00 2021-12-31 23:00:00+09:00
--------------------------------------------------
Feature Engineering
--------------------------------------------------
Train Test Split
(8760, 21) (8759, 21) (8760,) (8759,)
--------------------------------------------------
Train Model
NMAE:  13.189758589095375


* dask transform -> 23.5s  
* train/test split: one year each (train 2020, test 2021)
* NMAE: 13.18975