In [17]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib_venn import venn2
%matplotlib inline
from sklearn import metrics
from sklearn import model_selection
from catboost import CatBoostRegressor

from utils.load import load_data, load_submission
from utils.feature import (feature_enginning)


In [8]:
# Read the four objects returned by the project's loader from ../data.
# NOTE(review): only `train_1` and `test_data` are consumed by the cells
# visible in this notebook; the roles of `train_df` / `predict_df` should be
# confirmed against utils.load.
(train_df, predict_df, train_1, test_data) = load_data('../data')

In [9]:
# Column the model is trained to predict.
target_col = 'energy'

# Columns that must never be fed to the model as features: three
# non-feature columns plus the target itself.
drop_cols = ['time', 'bs', 'split'] + [target_col]

In [5]:
# K-fold cross-validated CatBoost training.
#
# For every fold the cell:
#   1. slices `train_1` into a training part and a validation part,
#   2. runs `feature_enginning` on (train, valid, copy of test),
#   3. fits a CatBoostRegressor with early stopping on the validation part,
#   4. stores the validation predictions in their out-of-fold (OOF) slots and
#      appends the test predictions to `test_preds_list`.
# After the loop the MAE of the assembled OOF vector is printed.

# CatBoost hyper-parameters are the same for every fold, so they are built
# once here rather than on each iteration of the loop.
params = {
    'iterations': 10000,
    'learning_rate': 0.001,
    'depth': 6,
    'loss_function': 'MAE',
    'verbose': 100,  # one log line every 100 iterations
}

# `kf` stays bound to the splitter itself; the one-shot generator returned by
# `.split()` is consumed directly in the `for` statement, so re-running only
# the loop does not silently iterate over an exhausted generator.
# `shuffle` is left at its default, so no shuffling option is requested.
kf = model_selection.KFold(n_splits=10)

# One OOF slot per row of `train_1`, filled fold by fold (positional indices).
oof_valid_preds = np.zeros(train_1.shape[0])
# One array of test predictions per fold.
test_preds_list = []

for i, (train_idx, valid_idx) in enumerate(kf.split(X=train_1)):

    train1 = train_1.iloc[train_idx]
    valid_df = train_1.iloc[valid_idx]
    # Fresh copy per fold so fold-specific feature engineering never
    # accumulates on the shared `test_data` frame.
    test_df = test_data.copy()

    # Feature engineering is applied inside the fold on the three frames.
    # NOTE(review): presumably any fitted statistics come from the training
    # part only - confirm in utils.feature.
    train1, valid_df, test_df = feature_enginning(train1=train1, valid_df=valid_df, test_df=test_df)
    train_cols = [col for col in train1.columns if col not in drop_cols]

    X_train, y_train = train1[train_cols], train1[target_col]
    X_valid, y_valid = valid_df[train_cols], valid_df[target_col]
    X_test = test_df[train_cols]

    model = CatBoostRegressor(**params)

    # Logging frequency comes from params['verbose']; it is not repeated here.
    model.fit(
        X_train, y_train,
        eval_set=(X_valid, y_valid),
        early_stopping_rounds=100,
    )

    valid_preds = model.predict(X_valid)
    test_preds = model.predict(X_test)

    val_score = metrics.mean_absolute_error(y_valid, valid_preds)
    oof_valid_preds[valid_idx] = valid_preds
    test_preds_list.append(test_preds)

    print("=*"*50)
    print(f"Fold : {i}")
    print("Valid score : ", val_score)

# MAE over all rows, each predicted by the model that did not train on it.
oof_score = metrics.mean_absolute_error(train_1[target_col], oof_valid_preds)
print("_-*"*50)
print("OOF score : ", oof_score)


0:	learn: 10.6527467	test: 10.2190678	best: 10.2190678 (0)	total: 254ms	remaining: 42m 19s
100:	learn: 9.8312414	test: 9.4336272	best: 9.4336272 (100)	total: 2.57s	remaining: 4m 11s
200:	learn: 9.0970525	test: 8.7325532	best: 8.7325532 (200)	total: 4.25s	remaining: 3m 27s
300:	learn: 8.4392028	test: 8.1095269	best: 8.1095269 (300)	total: 6.2s	remaining: 3m 19s
400:	learn: 7.8505266	test: 7.5544044	best: 7.5544044 (400)	total: 7.9s	remaining: 3m 9s
500:	learn: 7.3253172	test: 7.0610635	best: 7.0610635 (500)	total: 9.63s	remaining: 3m 2s
600:	learn: 6.8578640	test: 6.6280576	best: 6.6280576 (600)	total: 11.4s	remaining: 2m 59s
700:	learn: 6.4358786	test: 6.2431557	best: 6.2431557 (700)	total: 13.3s	remaining: 2m 56s
800:	learn: 6.0624240	test: 5.9038347	best: 5.9038347 (800)	total: 15.1s	remaining: 2m 53s
900:	learn: 5.7279450	test: 5.6007705	best: 5.6007705 (900)	total: 16.8s	remaining: 2m 49s
1000:	learn: 5.4306254	test: 5.3341145	best: 5.3341145 (1000)	total: 18.6s	remaining: 2m 47s
1

In [18]:
# Blend the folds: element-wise mean over the per-fold test prediction arrays.
test_preds_mean = np.asarray(test_preds_list).mean(axis=0)

# Attach the blended prediction and the submission key to the test frame.
# The key is "<time>_<bs>", with `time` rendered as a string first.
test_data['Energy'] = test_preds_mean
time_as_text = test_data['time'].astype('str')
test_data['ID'] = time_as_text + '_' + test_data['bs']

In [22]:
# Build the submission in the row order of the sample file: keep only its
# `ID` column and left-join the predictions onto it. Because the join is a
# left join, any ID missing from `test_data` ends up with a NaN `Energy`.
ss_df = (
    pd.read_csv('../data/SampleSubmission.csv')[['ID']]
    .merge(test_data[['ID', 'Energy']], on='ID', how='left')
)
ss_df.to_csv("catboost.csv", index=False)