In [22]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
%matplotlib inline
from sklearn import metrics
from sklearn import model_selection
from catboost import CatBoostRegressor
import lightgbm as lgb
# from utils.feature import submission


from utils.load import load_data, load_submission
from utils.feature import (feature_enginning)


In [23]:
# Read the four objects returned by load_data from the data directory.
# Only `train_1` and `test_data` are read by the cells visible in this notebook;
# presumably they are the training rows and the rows to predict — confirm in utils.load.
data_dir = '../data'
train1, predict_df, train_1, test_data = load_data(data_dir)

In [24]:
# Column the models are trained to predict.
target_col = 'energy'

# Columns removed from the feature list in the training cells: three
# non-feature columns plus the target itself.
drop_cols = ['time', 'bs', 'split'] + [target_col]

In [16]:
# 10-fold cross-validation of a CatBoost regressor (MAE loss) on `train_1`.
# Results left for later cells:
#   oof_valid_preds     - out-of-fold prediction for every row of train_1
#   test_preds_list_cat - one test-set prediction vector per fold
# KFold is used without shuffling, so every validation fold is a contiguous
# block of rows in train_1's existing order.
kf = model_selection.KFold(n_splits=10)

oof_valid_preds = np.zeros(train_1.shape[0])
test_preds_list_cat = []

# Hyper-parameters are the same for every fold, so they are built once.
params = {
    'iterations': 10000,
    'learning_rate': 0.001,
    'depth': 6,
    'loss_function': 'MAE',
    'verbose': 100,  # log one line every 100 iterations
}

for i, (train_idx, valid_idx) in enumerate(kf.split(X=train_1)):

    # Features are rebuilt inside each fold from that fold's own split.
    # The fold's training frame gets its own name so the `train1` object
    # returned by load_data is not overwritten, and the test frame is copied
    # in case feature_enginning modifies its inputs.
    train_fold, valid_df, test_df = feature_enginning(
        train1=train_1.iloc[train_idx],
        valid_df=train_1.iloc[valid_idx],
        test_df=test_data.copy(),
    )
    train_cols = [col for col in train_fold.columns if col not in drop_cols]

    X_train, y_train = train_fold[train_cols], train_fold[target_col]
    X_valid, y_valid = valid_df[train_cols], valid_df[target_col]
    X_test = test_df[train_cols]

    model = CatBoostRegressor(**params)

    # NOTE: the validation fold also drives early stopping, so the fold and
    # OOF scores printed below are optimistic estimates.
    model.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=100,
    )

    valid_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)

    val_score = metrics.mean_absolute_error(y_valid, valid_preds)
    oof_valid_preds[valid_idx] = valid_preds
    test_preds_list_cat.append(test_preds)

    print("=*"*50)
    print(f"Fold : {i}")
    print("Valid score : ", val_score)

# MAE over all out-of-fold predictions.
oof_score = metrics.mean_absolute_error(train_1[target_col], oof_valid_preds)
print("_-*"*50)
print("OOF score : ", oof_score)


0:	learn: 10.6527467	test: 10.2190678	best: 10.2190678 (0)	total: 234ms	remaining: 38m 57s
100:	learn: 9.8312414	test: 9.4336272	best: 9.4336272 (100)	total: 1.74s	remaining: 2m 50s
200:	learn: 9.0970525	test: 8.7325532	best: 8.7325532 (200)	total: 3.37s	remaining: 2m 44s
300:	learn: 8.4392028	test: 8.1095269	best: 8.1095269 (300)	total: 4.68s	remaining: 2m 30s
400:	learn: 7.8505266	test: 7.5544044	best: 7.5544044 (400)	total: 6.03s	remaining: 2m 24s
500:	learn: 7.3253172	test: 7.0610635	best: 7.0610635 (500)	total: 7.39s	remaining: 2m 20s
600:	learn: 6.8578640	test: 6.6280576	best: 6.6280576 (600)	total: 9.16s	remaining: 2m 23s
700:	learn: 6.4358786	test: 6.2431557	best: 6.2431557 (700)	total: 10.3s	remaining: 2m 17s
800:	learn: 6.0624240	test: 5.9038347	best: 5.9038347 (800)	total: 12.3s	remaining: 2m 20s
900:	learn: 5.7279450	test: 5.6007705	best: 5.6007705 (900)	total: 13.7s	remaining: 2m 18s
1000:	learn: 5.4306254	test: 5.3341145	best: 5.3341145 (1000)	total: 15.5s	remaining: 2m 1

In [17]:
# Average the per-fold CatBoost test predictions into one value per test row.
test_preds_mean = np.mean(test_preds_list_cat, axis=0)

# Build the ID -> prediction table on its own frame. Writing 'Energy'/'ID'
# into `test_data` would leak those columns into every later
# `test_data.copy()` and make the notebook depend on cell execution order.
pred_df = pd.DataFrame({
    'ID': test_data['time'].astype('str') + '_' + test_data['bs'],
    'Energy': test_preds_mean,
})

# Keep the row order and ID set of the sample submission file.
ss_df = pd.read_csv('../data/SampleSubmission.csv')[['ID']]
ss_df = ss_df.merge(pred_df, on='ID', how='left')

# A left join leaves NaN for any ID without a matching prediction; stop here
# instead of silently writing an incomplete submission.
n_missing = int(ss_df['Energy'].isna().sum())
if n_missing:
    raise ValueError(f"{n_missing} submission IDs have no prediction")

In [18]:

# Save the CatBoost submission frame; the pandas index is not written.
ss_df.to_csv(path_or_buf="catboost.csv", index=False)

In [25]:
# 10-fold cross-validation of a LightGBM regressor on `train_1`.
# Results left for later cells:
#   oof_valid_preds      - out-of-fold prediction for every row of train_1
#   test_preds_list_lgbm - one test-set prediction vector per fold
# KFold is used without shuffling, so every validation fold is a contiguous
# block of rows in train_1's existing order.
kf = model_selection.KFold(n_splits=10)

oof_valid_preds = np.zeros(train_1.shape[0])
test_preds_list_lgbm = []

# Hyper-parameters are the same for every fold, so they are built once.
# learning_rate is not set here (an earlier 0.01 override was disabled), so
# the library default applies.
params = {
    'objective': 'regression',
    'boosting_type': 'gbdt',
    'n_jobs': -1,
    'max_depth': -1,
    'metric': 'mae',
    'num_boost_round': 10000,  # upper bound; early stopping usually ends sooner
}

for i, (train_idx, valid_idx) in enumerate(kf.split(X=train_1)):

    # Features are rebuilt inside each fold from that fold's own split.
    # The fold's training frame gets its own name so the `train1` object
    # returned by load_data is not overwritten, and the test frame is copied
    # in case feature_enginning modifies its inputs.
    train_fold, valid_df, test_df = feature_enginning(
        train1=train_1.iloc[train_idx],
        valid_df=train_1.iloc[valid_idx],
        test_df=test_data.copy(),
    )
    train_cols = [col for col in train_fold.columns if col not in drop_cols]

    X_train, y_train = train_fold[train_cols], train_fold[target_col]
    X_valid, y_valid = valid_df[train_cols], valid_df[target_col]
    X_test = test_df[train_cols]

    model = lgb.LGBMRegressor(**params)

    # Fresh callbacks per fold: stop after 100 rounds without improvement on
    # the validation metric, and log the metric every 100 rounds.
    early_stopping_callback = lgb.early_stopping(100, first_metric_only=True, verbose=False)
    verbose_callback = lgb.log_evaluation(100)

    # NOTE: the validation fold also drives early stopping, so the fold and
    # OOF scores printed below are optimistic estimates.
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[early_stopping_callback, verbose_callback],
    )

    valid_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)

    val_score = metrics.mean_absolute_error(y_valid, valid_preds)
    oof_valid_preds[valid_idx] = valid_preds
    test_preds_list_lgbm.append(test_preds)

    print("=*"*50)
    print(f"Fold : {i}")
    print("Valid score : ", val_score)

# MAE over all out-of-fold predictions.
oof_score = metrics.mean_absolute_error(train_1[target_col], oof_valid_preds)
print("_-*"*50)
print("OOF score : ", oof_score)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2036
[LightGBM] [Info] Number of data points in the train set: 83366, number of used features: 79
[LightGBM] [Info] Start training from score 28.290304
[100]	valid_0's l1: 2.07686
[200]	valid_0's l1: 1.85079
[300]	valid_0's l1: 1.75836
[400]	valid_0's l1: 1.68728
[500]	valid_0's l1: 1.6249
[600]	valid_0's l1: 1.58745
[700]	valid_0's l1: 1.56356
[800]	valid_0's l1: 1.53519
[900]	valid_0's l1: 1.51997
[1000]	valid_0's l1: 1.50509
[1100]	valid_0's l1: 1.49341
[1200]	valid_0's l1: 1.48704
[1300]	valid_0's l1: 1.47939
[1400]	valid_0's l1: 1.47354
[1500]	valid_0's l1: 1.46801
[1600]	valid_0's l1: 1.46181
[1700]	valid_0's l1: 1.45663
[1800]	valid_0's l1: 1.45274
[1900]	valid_0's l1: 1.44826
[2000]	valid_0's l1: 1.44613
[2100]	valid_0's l1: 1.

In [None]:
# Average the per-fold LightGBM test predictions into one value per test row.
test_preds_mean = np.mean(test_preds_list_lgbm, axis=0)

# Build the ID -> prediction table on its own frame. Writing 'Energy'/'ID'
# into `test_data` would mutate a frame that the training cells copy from and
# make the notebook depend on cell execution order.
pred_df = pd.DataFrame({
    'ID': test_data['time'].astype('str') + '_' + test_data['bs'],
    'Energy': test_preds_mean,
})

# Keep the row order and ID set of the sample submission file.
ss_df = pd.read_csv('../data/SampleSubmission.csv')[['ID']]
ss_df = ss_df.merge(pred_df, on='ID', how='left')

# A left join leaves NaN for any ID without a matching prediction; stop here
# instead of silently writing an incomplete submission.
n_missing = int(ss_df['Energy'].isna().sum())
if n_missing:
    raise ValueError(f"{n_missing} submission IDs have no prediction")

In [None]:

# Save the LightGBM submission frame; the pandas index is not written.
ss_df.to_csv(path_or_buf="lightgbm.csv", index=False)
