In [56]:
%load_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [57]:
%autoreload

import pandas as pd
import utilities as utils
import numpy as np

from catboost import CatBoostRegressor, Pool, cv
from sklearn.model_selection import train_test_split

In [58]:
data_list = utils.read_data()
Data = utils.DataProcessor(data_list)
df = Data.create_train_data() # NaNs are filled with 0
df.sort_index(inplace=True)
df = df.loc['2019-09-04 08:00:00':]

In [59]:
X = df.drop(['pv_measurement'], axis=1)
y = df['pv_measurement']

In [60]:
# K-fold cross-validation with 5 folds
num_folds = 5
cv_params = {'loss_function': 'MAE',
             'depth': 6,
             'learning_rate': 0.1,
             'iterations': 1000,
             'random_seed': 42,
}

cv_results = cv(Pool(X, y), cv_params, fold_count=num_folds, plot=True)

# print(cv_results)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Training on fold [0/5]
0:	learn: 257.1988296	test: 251.8583623	best: 251.8583623 (0)	total: 66.7ms	remaining: 1m 6s
1:	learn: 240.9562037	test: 235.9223112	best: 235.9223112 (1)	total: 117ms	remaining: 58.5s
2:	learn: 226.5045792	test: 221.8993296	best: 221.8993296 (2)	total: 174ms	remaining: 57.8s
3:	learn: 212.8497658	test: 208.6820070	best: 208.6820070 (3)	total: 221ms	remaining: 55.1s
4:	learn: 206.8484645	test: 202.6812879	best: 202.6812879 (4)	total: 261ms	remaining: 51.9s
5:	learn: 200.5111836	test: 196.3861578	best: 196.3861578 (5)	total: 308ms	remaining: 51s
6:	learn: 186.0864077	test: 182.4989022	best: 182.4989022 (6)	total: 347ms	remaining: 49.3s
7:	learn: 174.1491483	test: 170.9283470	best: 170.9283470 (7)	total: 392ms	remaining: 48.6s
8:	learn: 163.0849657	test: 160.1939178	best: 160.1939178 (8)	total: 445ms	remaining: 49s
9:	learn: 153.8883225	test: 151.1160179	best: 151.1160179 (9)	total: 491ms	remaining: 48.6s
10:	learn: 150.9214924	test: 148.2574713	best: 148.2574713 (

In [61]:
best_iteration = int(cv_results['iterations'].idxmin())
print(best_iteration)

0


In [62]:
# X_train, X_validation, y_train, y_validation = train_test_split(X, y, train_size=0.80, random_state=11568)

# # Save X_train, X_validation, y_train, y_validation to csv
# X_train.to_csv('X_train.csv')
# X_validation.to_csv('X_validation.csv')
# y_train.to_csv('y_train.csv')
# y_validation.to_csv('y_validation.csv')

In [66]:
model = CatBoostRegressor(
            loss_function='MAE',
            depth=6,
            learning_rate=0.1,
            iterations=999,
            random_seed=18,
)

In [67]:
# train_pool = Pool(X_train, y_train)
train_pool = Pool(X, y)
# validation_pool = Pool(X_validation, y_validation)

In [68]:
model.fit(
    train_pool,
    # eval_set=validation_pool,
    verbose=50,
)

0:	learn: 265.7430728	total: 45.2ms	remaining: 45.1s
50:	learn: 98.9777302	total: 1.96s	remaining: 36.4s
100:	learn: 95.0961426	total: 3.56s	remaining: 31.6s
150:	learn: 92.2828831	total: 5.26s	remaining: 29.6s
200:	learn: 91.0663566	total: 6.85s	remaining: 27.2s
250:	learn: 89.9300468	total: 8.34s	remaining: 24.9s
300:	learn: 89.6169138	total: 10.1s	remaining: 23.4s
350:	learn: 88.9477779	total: 11.6s	remaining: 21.4s
400:	learn: 88.6432504	total: 13.1s	remaining: 19.5s
450:	learn: 88.1534745	total: 14.7s	remaining: 17.9s
500:	learn: 87.8935703	total: 16.2s	remaining: 16.1s
550:	learn: 87.6839756	total: 18.1s	remaining: 14.7s
600:	learn: 87.3651024	total: 19.8s	remaining: 13.1s
650:	learn: 87.0421659	total: 21.4s	remaining: 11.4s
700:	learn: 86.8969532	total: 23.1s	remaining: 9.84s
750:	learn: 86.6442811	total: 24.8s	remaining: 8.2s
800:	learn: 86.4207576	total: 26.3s	remaining: 6.5s
850:	learn: 86.2554683	total: 27.9s	remaining: 4.86s
900:	learn: 85.9721580	total: 29.4s	remaining: 3.

<catboost.core.CatBoostRegressor at 0x7f441c074370>

In [69]:
X_test = Data.create_test_data().drop(['building'], axis=1)
X_test = X_test[utils.get_features(3)]
# X_test.fillna(0, inplace=True)

test_pool = Pool(X_test)

# X_test.head()

In [70]:
prediction = model.predict(test_pool)

In [71]:
preds_df = pd.DataFrame(prediction, columns=['predict'])
preds_df[preds_df < 0] = 0

results = pd.concat([X_test.reset_index(), preds_df], axis=1)
results['date_forecast'] = pd.to_datetime(results['date_forecast'])
results.set_index('date_forecast', inplace=True)
results.rename(columns={'predict': 'pv_prediction'}, inplace=True)

utils.save_to_csv(results, 'catboost_1')