# Imports

In [1]:
import gc
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import KFold

# Star import kept in its original position relative to plotly. `lgb`, `lgbm_fit`
# and `lgbm_predict` are used below and are not imported anywhere else in this
# notebook, so they presumably come from here.
from helper_functions import *

import plotly.offline as py
# Set up plotly's offline notebook mode. With connected=True plotly.js is loaded
# from an online CDN instead of being embedded in the notebook, so rendering
# needs internet access.
# NOTE(review): no plotly figure is created in the cells visible here — confirm
# this initialisation is still needed.
py.init_notebook_mode(connected=True)

In [2]:
# Names of the columns passed to the model. The two lists are concatenated
# (features first, categoricals second) when X_train / X_test are built, and
# `category_columns` is also handed to lgbm_fit as its categorical features.
# TODO: both lists are empty here although the recorded training log reports
# 27 used features — fill them in before running the notebook.
feature_columns = list()
category_columns = list()

# Data loading (Train dataset)


In [3]:
# Load the training table from a Feather file. Later cells read its
# 'meter_reading_log1p' column as the target and select the model inputs via
# feature_columns / category_columns.
full_train_df = pd.read_feather("./data/train_full.feather")

# Train model

In [4]:
# Model inputs: the feature columns followed by the categorical columns.
X_train = full_train_df.loc[:, feature_columns + category_columns]
# Target taken as an array (`.values`) so the training loop can index it
# positionally with the fold indices produced by KFold.split.
y_train = full_train_df.loc[:, 'meter_reading_log1p'].values

In [5]:
# Cross-validation setup: shuffled K-fold with a fixed seed, so the fold
# assignment is reproducible. `number_of_folds` is reused later to reload one
# saved model per fold.
number_of_folds = 5
# Unshuffled variant, left disabled:
# kf = KFold(n_splits=5, shuffle=False, random_state=None)
kf_shuffled = KFold(
    n_splits=number_of_folds,
    random_state=666,
    shuffle=True,
)

In [6]:
# Train one LightGBM model per cross-validation fold and collect every fitted
# model in `models`; the next section evaluates and saves them.
models = []
for train_idx, valid_idx in kf_shuffled.split(X_train, y_train):
    # Each dataset is an (inputs, targets) pair for the current fold.
    train_data = X_train.iloc[train_idx, :], y_train[train_idx]
    valid_data = X_train.iloc[valid_idx, :], y_train[valid_idx]

    print('train', len(train_idx), 'valid', len(valid_idx))
    # lgbm_fit comes from helper_functions. NOTE(review): `lr` / `bf` are
    # presumably the learning rate and bagging fraction — confirm in the helper.
    model, y_pred_valid, y_true = lgbm_fit(
        train_data,
        valid_data,
        cat_features=category_columns,
        num_rounds=1000,
        lr=0.04,
        bf=0.8,
    )
    models.append(model)
    # Drop every per-fold object, y_true included, so nothing from this fold is
    # still referenced when gc.collect() runs and the next split is built.
    del model, y_pred_valid, y_true, train_data, valid_data
    gc.collect()

train 15895908 valid 3973978
training LGB:
[LightGBM] [Info] Total Bins 5765
[LightGBM] [Info] Number of data points in the train set: 15895908, number of used features: 27



Overriding the parameters from Reference Dataset.


categorical_column in param dict is overridden.



[LightGBM] [Info] Start training from score 4.217456
Training until validation scores don't improve for 20 rounds
[20]	training's rmse: 1.5651	valid_1's rmse: 1.56528
[40]	training's rmse: 1.36491	valid_1's rmse: 1.36516
[60]	training's rmse: 1.25718	valid_1's rmse: 1.25732
[80]	training's rmse: 1.17734	valid_1's rmse: 1.1774
[100]	training's rmse: 1.12822	valid_1's rmse: 1.12825
[120]	training's rmse: 1.08359	valid_1's rmse: 1.08358
[140]	training's rmse: 1.03867	valid_1's rmse: 1.03861
[160]	training's rmse: 1.00081	valid_1's rmse: 1.00087
[180]	training's rmse: 0.97604	valid_1's rmse: 0.976158
[200]	training's rmse: 0.958312	valid_1's rmse: 0.958544
[220]	training's rmse: 0.94298	valid_1's rmse: 0.943302
[240]	training's rmse: 0.928003	valid_1's rmse: 0.928369
[260]	training's rmse: 0.918326	valid_1's rmse: 0.918757
[280]	training's rmse: 0.907911	valid_1's rmse: 0.908354
[300]	training's rmse: 0.900155	valid_1's rmse: 0.900636
[320]	training's rmse: 0.89221	valid_1's rmse: 0.892712

KeyboardInterrupt: 

# Evaluate the cross-validated models (average validation RMSE across folds) and save each fold's model

In [None]:
# Average the best validation RMSE of the per-fold models. The models are fit on
# 'meter_reading_log1p'; assuming that column is log1p of the raw reading, this
# log-scale RMSE is the RMSLE of the raw meter reading.
if not models:
    # An interrupted training loop leaves `models` empty; fail loudly instead of
    # printing "RMSE: nan" and saving nothing.
    raise RuntimeError(
        "No trained models to evaluate - the training loop did not complete a fold."
    )

rmse_scores = [model.best_score["valid_1"]["rmse"] for model in models]
avg_rmse_score = np.average(rmse_scores)
print('RMSE: {0}'.format(avg_rmse_score))

# Persist one model file per fold; the prediction section reloads them from this
# directory by fold index. Make sure the directory exists before writing.
model_dir = './models/lgbm-base'
os.makedirs(model_dir, exist_ok=True)
for index, model in enumerate(models):
    model.save_model('{0}/lgbm_model_{1}.txt'.format(model_dir, index))

# Data loading (Test dataset)

In [None]:
# Load the test table and the sample-submission template from Feather files.
# NOTE(review): the two files sit in different folders (./data vs ./data/feather);
# confirm both paths are intended.
full_test_df = pd.read_feather("./data/test_full.feather")
sample_submission = pd.read_feather("./data/feather/sample_submission.feather")

# Prediction on test data

In [None]:
# Same column selection and order as used for X_train.
X_test = full_test_df.loc[:, feature_columns + category_columns]

In [None]:
# Rebuild one Booster per fold from the model files on disk, so prediction does
# not depend on the in-memory `models` list from training.
loaded_models = list(
    lgb.Booster(model_file=f"./models/lgbm-base/lgbm_model_{fold_index}.txt")
    for fold_index in range(number_of_folds)
)

In [None]:
# Predict on the test inputs with all reloaded fold models via the
# helper_functions routine.
# NOTE(review): presumably this combines (e.g. averages) the per-model
# predictions — confirm in lgbm_predict. The next cell applies expm1 to the
# result, i.e. treats it as log1p-scale values.
y_test = lgbm_predict(X_test, loaded_models)

In [None]:
# Invert the log1p transform of the training target (expm1) and store the
# predictions, now on the raw meter_reading scale, in the submission column.
sample_submission['meter_reading'] = np.expm1(y_test)

In [None]:
# Display the submission frame with the freshly written predictions.
sample_submission

In [None]:
# Show the rows whose back-transformed prediction is negative.
sample_submission.loc[sample_submission['meter_reading'].lt(0)]

In [None]:
# Clamp negative predictions to zero in place (the rows listed by the previous
# cell). Re-running this cell is harmless: no negative values remain afterwards.
sample_submission.loc[sample_submission['meter_reading'] < 0, 'meter_reading'] = 0

In [None]:
# Display the submission frame again after the negative values were clamped.
sample_submission

# Analyse the result

In [None]:
# Compare the distribution of the test predictions with the training target.
# Both are on the log1p scale: y_test is used as returned by lgbm_predict
# (before expm1) and the training column is 'meter_reading_log1p'.
fig, ax = plt.subplots()
# Draw both histograms on one explicit axes, semi-transparent and labelled, so
# neither hides the other and the legend says which is which.
pd.Series(np.ravel(y_test)).hist(
    ax=ax, bins=20, color="#955586", alpha=0.6, label="Test predictions"
)
full_train_df['meter_reading_log1p'].hist(
    ax=ax, bins=20, color='#2E1F3F', alpha=0.6, label="Train target"
)
ax.set_xlabel("log1p(Meter Reading)")
ax.set_ylabel("Counts")
ax.set_title("Train/Test Meter Reading Distribution")
ax.legend()
plt.show()

In [None]:
# Display the final submission frame.
sample_submission

In [None]:
# Summary statistics (count, mean, spread, quantiles) of the submission columns,
# as a quick sanity check on the range of the predictions.
sample_submission.describe()

In [None]:
# Plot the final predictions on the log1p scale against the submission row
# index, on an explicit axes and with labels so the figure can be read on its own.
fig, ax = plt.subplots()
np.log1p(sample_submission['meter_reading']).plot(ax=ax)
ax.set_xlabel("Submission row index")
ax.set_ylabel("log1p(meter_reading)")
ax.set_title("Predicted meter readings (log1p scale)")
plt.show()

In [None]:
# Rows whose prediction is exactly zero; this includes any values that were
# clamped from negative to zero earlier.
sample_submission.loc[sample_submission['meter_reading'].eq(0.0)]

# Save the submission to a CSV file

In [None]:
# Write the submission as CSV without the index column, formatting floats with
# four decimals. The output folder is created first so the notebook does not
# fail at its very last step when the folder is missing.
submission_path = './data/submissions/submission_base_lgbm.csv'
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
sample_submission.to_csv(submission_path, index=False, float_format='%.4f')