In [7]:
from beepy import beep
import pandas as pd
import numpy as np
import os
import warnings
import gc

import json
import time

import catboost
from catboost import Pool, CatBoostRegressor

from sklearn.metrics import mean_squared_log_error as MSLE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_absolute_error as MAE

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

In [8]:
def create_catboost_pool(X, y, categorical_features):
    """Build a CatBoost ``Pool`` with the categorical columns cast to strings.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Must contain every column listed in
        ``categorical_features``.
    y : array-like
        Labels, handed to ``Pool`` unchanged.
    categorical_features : list of str
        Names of the columns of ``X`` that CatBoost should treat as
        categorical.

    Returns
    -------
    catboost.Pool
        Pool built from a copy of ``X`` that keeps the rows and the column
        order of ``X``; only the dtype of the categorical columns changes.
    """
    # Work on a copy so the caller's frame is never modified.
    X_encoded = X.copy()

    # Cast column by column instead of splitting the frame and merging it
    # back on the index: a merge multiplies rows whenever the index holds
    # duplicate labels (breaking the alignment with ``y``), and selecting the
    # remaining columns through a set made the column order arbitrary.
    for column in categorical_features:
        X_encoded[column] = X_encoded[column].astype('str')

    return Pool(X_encoded, y, cat_features=categorical_features)

In [9]:
def show_feature_importance(model, method='catboost'):
    """Return the feature importances of ``model`` as a table.

    Parameters
    ----------
    model : object
        For ``method='lgbm'`` an object exposing ``feature_name()`` and
        ``feature_importance()``; for ``method='catboost'`` an object exposing
        ``get_feature_importance()``.
    method : {'catboost', 'lgbm'}, default 'catboost'
        Which library's API to use to read the importances.

    Returns
    -------
    pandas.DataFrame or object
        For ``'lgbm'``: a frame with columns ``"Feature Id"`` and
        ``"Importances"``, sorted by decreasing importance. For
        ``'catboost'``: whatever
        ``model.get_feature_importance(prettified=True, verbose=True)``
        returns.

    Raises
    ------
    ValueError
        If ``method`` is neither ``'lgbm'`` nor ``'catboost'``.
    """
    if method == 'lgbm':
        # Query the model once for each list rather than once per feature.
        names = model.feature_name()
        importances = model.feature_importance()
        feat_imp = dict(zip(names, importances))

        # Stable sort: features with equal importance keep the model's order.
        ranked = sorted(feat_imp.items(), key=lambda item: item[1], reverse=True)
        return pd.DataFrame(ranked, columns=["Feature Id", "Importances"])

    if method == 'catboost':
        return model.get_feature_importance(prettified=True, verbose=True)

    # Previously an unknown method fell through to ``return result`` and
    # surfaced as an UnboundLocalError with no hint about the cause.
    raise ValueError(
        "Unsupported method {!r}; expected 'lgbm' or 'catboost'.".format(method)
    )

# CatBoost averages

In [6]:
# Candidate categorical columns for the per-site models.
# site_id has been removed from the list below.
cat_features = [
    # identifiers
    "building_id",
    "meter",
    "primary_use",
    # "<column>_was_missing" flag columns
    "air_temperature_was_missing",
    "cloud_coverage_was_missing",
    "dew_temperature_was_missing",
    "precip_depth_1_hr_was_missing",
    "sea_level_pressure_was_missing",
    "wind_direction_was_missing",
    "wind_speed_was_missing",
    # calendar columns
    "day_of_month",
    "day_of_week",
]

In [7]:
# Directories used by the per-site run: train CSVs, test CSVs,
# feature-importance CSVs and the folder the fitted models are saved to.
site_id_train, site_id_test, site_id_feat_imp, site_id_models = (
    "/data/site_id/train/",
    "/data/site_id/test/",
    "/data/site_id/imp_features/",
    "/data/site_id/final_models/",
)

In [8]:
# Fitted per-site models, stored under the key "site_<i>" by the training loop.
site_models = dict()

In [14]:
# Train one CatBoost regressor per site.
#
# For every site this reads the stored feature importances, keeps the features
# at or above `importance_threshold`, builds train/test pools on the log1p of
# `meter_reading`, fits a model with the site's hyper-parameters, and stores it
# both in `site_models` and on disk.
param_file_path = "/data/site_id/site_params.json"
with open(param_file_path, 'r') as f:
    # Keyed "site_<i>"; each entry is read below for "iterations", "depth",
    # "learning_rate" and "l2_leaf_reg".
    site_params = json.load(f)

# Features whose stored importance is below this value are not used.
importance_threshold = 0.01

# The loop addresses entries as "site_0" ... "site_<n-1>", n being the number
# of entries in the parameter file.
for i in range(len(site_params)):

    print("Attempting training with: site_{}".format(i))

    # 1. Extract the parameters of each site
    params = site_params["site_{}".format(i)]

    # 2. Set the file/directory paths
    # os.path.join avoids the doubled "/" that plain concatenation produced
    # (the directory constants already end with a slash).
    site_train_file = os.path.join(site_id_train, "train_site_id_{}.csv".format(i))
    site_test_file = os.path.join(site_id_test, "test_site_id_{}.csv".format(i))
    feature_imp_file = os.path.join(site_id_feat_imp, "feat_imp_train_site_id_{}.csv".format(i))

    model_save_path = os.path.join(site_id_models, "model_site_{}".format(i))

    # 3. Extract Important Features and Categorical Features
    imp_feats = pd.read_csv(feature_imp_file)
    features_selected = set(imp_feats[imp_feats.Importances >= importance_threshold]["Feature Id"])

    # Add meter_reading (the target) so it is read together with the features
    features_selected.add("meter_reading")

    # A set cannot be used as a column indexer in pandas >= 2.0 and has no
    # defined order; a sorted list gives the same columns in the same order on
    # every run, for both the train and the test frame.
    feature_columns = sorted(features_selected)

    # Identify categorical features (sorted for the same reason)
    cat_features_selected = sorted(features_selected.intersection(cat_features))

    print(" - imp features selected ... reading training files")
    # 4. Read the train file and create a train pool
    train = pd.read_csv(site_train_file, index_col=0)[feature_columns]
    y_train = np.log1p(train.meter_reading)
    X_train = train.drop(["meter_reading"], axis=1)
    del train

    train_pool = create_catboost_pool(X_train, y_train, cat_features_selected)
    del X_train, y_train

    # Read the test file and create the test pool
    test = pd.read_csv(site_test_file, index_col=0)[feature_columns]
    y_test = np.log1p(test.meter_reading)
    X_test = test.drop("meter_reading", axis=1)
    del test

    test_pool = create_catboost_pool(X_test, y_test, cat_features_selected)
    del X_test, y_test

    print(" - pools created ... attempting training")
    # 5. Train the model
    iterations = params["iterations"]
    depth = params["depth"]
    learning_rate = params["learning_rate"]
    l2_leaf_reg = params["l2_leaf_reg"]

    # NOTE(review): the test pool is also the eval set that drives
    # use_best_model and the overfitting detector, so the reported bestTest is
    # not an independent estimate. The target is already log1p-transformed
    # while eval_metric is "MSLE" - confirm that this combination is intended.
    model = CatBoostRegressor(iterations=iterations,
                              depth=depth,
                              learning_rate=learning_rate,
                              l2_leaf_reg=l2_leaf_reg,
                              loss_function="RMSE",
                              boosting_type="Ordered",
                              eval_metric="MSLE",
                              od_type="Iter", od_wait=100,
                              use_best_model=True,
                              verbose=100,
                              random_seed=8848)
    tic = time.time()
    model.fit(train_pool, eval_set=test_pool)
    toc = time.time()
    print(" - model fitted ...")
    print(" - it took {} minutes to train.".format((toc-tic)/60))

    # 6. Save the model (kept in memory under "site_<i>" and written to disk)
    site_models["site_{}".format(i)] = model
    model.save_model(fname=model_save_path)
    print(" - model saved to {} \n".format(model_save_path))

    # Reclaim the memory of the frames deleted above before the next site.
    gc.collect()

Attempting training with: site_0
 - imp features selected ... reading training files
 - pools created ... attempting training
0:	learn: 0.1483485	test: 0.1479913	best: 0.1479913 (0)	total: 330ms	remaining: 2m 27s
100:	learn: 0.1097533	test: 0.1080915	best: 0.1080915 (100)	total: 26.1s	remaining: 1m 30s
200:	learn: 0.0963933	test: 0.0943070	best: 0.0943070 (200)	total: 51.9s	remaining: 1m 4s
300:	learn: 0.0907607	test: 0.0880459	best: 0.0880459 (300)	total: 1m 23s	remaining: 41.5s
400:	learn: 0.0884386	test: 0.0851277	best: 0.0851277 (399)	total: 1m 52s	remaining: 13.8s
449:	learn: 0.0884377	test: 0.0851276	best: 0.0851272 (401)	total: 1m 57s	remaining: 0us

bestTest = 0.08512722689
bestIteration = 401

Shrink model to first 402 iterations.
 - model fitted ...
 - it took 2.012193481127421 minutes to train.
 - model saved to /data/site_id/final_models/model_site_0 

Attempting training with: site_1
 - imp features selected ... reading training files
 - pools created ... attempting traini

  mask |= (ar1 == a)
  interactivity=interactivity, compiler=compiler, result=result)


 - pools created ... attempting training
0:	learn: 0.3543204	test: 0.3527032	best: 0.3527032 (0)	total: 1.38s	remaining: 8m
100:	learn: 0.2243199	test: 0.2214579	best: 0.2214579 (100)	total: 54.8s	remaining: 2m 15s
200:	learn: 0.1808766	test: 0.1777062	best: 0.1777062 (200)	total: 1m 50s	remaining: 1m 21s
300:	learn: 0.1670328	test: 0.1613792	best: 0.1613792 (299)	total: 2m 45s	remaining: 27s
349:	learn: 0.1670275	test: 0.1613849	best: 0.1613777 (302)	total: 2m 59s	remaining: 0us

bestTest = 0.1613776579
bestIteration = 302

Shrink model to first 303 iterations.
 - model fitted ...
 - it took 3.0413495659828187 minutes to train.
 - model saved to /data/site_id/final_models/model_site_2 

Attempting training with: site_3
 - imp features selected ... reading training files
 - pools created ... attempting training
0:	learn: 0.1151580	test: 0.1148931	best: 0.1148931 (0)	total: 1.11s	remaining: 8m 17s
100:	learn: 0.0481933	test: 0.0475571	best: 0.0475571 (100)	total: 1m 9s	remaining: 4m 1s


# CatBoost - Meters

In [10]:
# Candidate categorical columns for the per-meter models.
# meter has been removed from the list below.
cat_features = [
    "site_id",
    "air_temperature_was_missing",
    "building_id",
    "cloud_coverage_was_missing",
    "day_of_month",
    "day_of_week",
    "dew_temperature_was_missing",
    "precip_depth_1_hr_was_missing",
    "primary_use",
    "sea_level_pressure_was_missing",
    # NOTE(review): the next entry looks like "site_id" fused with
    # "cloud_coverage_was_missing" (both are already listed) - confirm that a
    # column with this exact name exists before relying on it.
    "site_idcloud_coverage_was_missing",
    "wind_direction_was_missing",
    "wind_speed_was_missing",
]

In [11]:
# Directories used by the per-meter run: train CSVs, test CSVs,
# feature-importance CSVs and the folder the fitted models are saved to.
meter_train, meter_test, meter_feat_imp, meter_models = (
    "/data/meter_type/train/",
    "/data/meter_type/test/",
    "/data/meter_type/imp_features/",
    "/data/meter_type/final_models/",
)

# Fitted per-meter models, stored under the key "meter_<i>" by the training loop.
meter_models_list = dict()

In [16]:
# Train one CatBoost regressor per meter type.
#
# For every meter this reads the stored feature importances, keeps the
# features at or above `importance_threshold`, builds train/test pools on the
# log1p of `meter_reading`, fits a model with the meter's hyper-parameters, and
# stores it both in `meter_models_list` and on disk.
param_file_path = "/data/meter_type/meter_params.json"
with open(param_file_path, 'r') as f:
    # Keyed "meter_<i>"; each entry is read below for "iterations", "depth",
    # "learning_rate" and "l2_leaf_reg".
    meter_params = json.load(f)

# Features whose stored importance is below this value are not used.
importance_threshold = 0.01

# NOTE(review): only meters 1, 2 and 3 are trained here; meter 0 is skipped
# even if the parameter file has an entry for it - confirm this is intended.
for i in [1, 2, 3]:

    print("Attempting training with: meter_{}".format(i))

    # 1. Extract the parameters of each meter
    params = meter_params["meter_{}".format(i)]

    # 2. Set the file/directory paths
    # os.path.join avoids the doubled "/" that plain concatenation produced
    # (the directory constants already end with a slash).
    meter_train_file = os.path.join(meter_train, "train_meter_{}.csv".format(i))
    meter_test_file = os.path.join(meter_test, "test_meter_{}.csv".format(i))
    feature_imp_file = os.path.join(meter_feat_imp, "feat_imp_train_meter_{}.csv".format(i))

    model_save_path = os.path.join(meter_models, "model_meter_{}".format(i))

    # 3. Extract Important Features and Categorical Features
    imp_feats = pd.read_csv(feature_imp_file)
    features_selected = set(imp_feats[imp_feats.Importances >= importance_threshold]["Feature Id"])

    # Add meter_reading (the target) so it is read together with the features
    features_selected.add("meter_reading")

    # A set cannot be used as a column indexer in pandas >= 2.0 and has no
    # defined order; a sorted list gives the same columns in the same order on
    # every run, for both the train and the test frame.
    feature_columns = sorted(features_selected)

    # Identify categorical features (sorted for the same reason)
    cat_features_selected = sorted(set(cat_features).intersection(features_selected))

    print(" - imp features selected ... reading training files")
    # 4. Read the train file and create a train pool
    train = pd.read_csv(meter_train_file, index_col=0)[feature_columns]
    y_train = np.log1p(train.meter_reading)
    X_train = train.drop(["meter_reading"], axis=1)
    del train
    gc.collect()

    train_pool = create_catboost_pool(X_train, y_train, cat_features_selected)
    del X_train, y_train

    # Read the test file and create the test pool
    test = pd.read_csv(meter_test_file, index_col=0)[feature_columns]
    y_test = np.log1p(test.meter_reading)
    X_test = test.drop("meter_reading", axis=1)
    del test
    gc.collect()

    test_pool = create_catboost_pool(X_test, y_test, cat_features_selected)
    del X_test, y_test

    print(" - pools created ... attempting training")
    # 5. Train the model
    iterations = params["iterations"]
    depth = params["depth"]
    learning_rate = params["learning_rate"]
    l2_leaf_reg = params["l2_leaf_reg"]

    # NOTE(review): the test pool is also the eval set that drives
    # use_best_model and the overfitting detector, so the reported bestTest is
    # not an independent estimate. The target is already log1p-transformed
    # while eval_metric is "MSLE" - confirm that this combination is intended.
    model = CatBoostRegressor(iterations=iterations,
                              depth=depth,
                              learning_rate=learning_rate,
                              l2_leaf_reg=l2_leaf_reg,
                              loss_function="RMSE",
                              boosting_type="Ordered",
                              eval_metric="MSLE",
                              od_type="Iter", od_wait=100,
                              use_best_model=True,
                              verbose=100,
                              random_seed=8848)
    tic = time.time()
    model.fit(train_pool, eval_set=test_pool)
    toc = time.time()
    print(" - model fitted ...")
    print(" - it took {} minutes to train".format((toc-tic)/60))

    # 6. Save the model (kept in memory under "meter_<i>" and written to disk)
    meter_models_list["meter_{}".format(i)] = model
    model.save_model(fname=model_save_path)
    print(" - model saved to {}\n\n".format(model_save_path))

    # Reclaim the memory of the frames deleted above before the next meter.
    gc.collect()

Attempting training with: meter_1
 - imp features selected ... reading training files
 - pools created ... attempting training
0:	learn: 0.5579770	test: 0.5576534	best: 0.5576534 (0)	total: 4.75s	remaining: 39m 30s
100:	learn: 0.3075246	test: 0.3013597	best: 0.3012434 (50)	total: 4m 54s	remaining: 19m 21s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.3012434134
bestIteration = 50

Shrink model to first 51 iterations.
 - model fitted ...
 - it took 5.793006602923075 minutes to train
 - model saved to /data/meter_type/final_models/model_meter_1


Attempting training with: meter_2
 - imp features selected ... reading training files
 - pools created ... attempting training
0:	learn: 0.5013716	test: 0.5010729	best: 0.5010729 (0)	total: 2.74s	remaining: 22m 45s
100:	learn: 0.3164940	test: 0.3109665	best: 0.3109585 (93)	total: 3m 7s	remaining: 12m 21s
200:	learn: 0.3164209	test: 0.3109219	best: 0.3109065 (170)	total: 3m 46s	remaining: 5m 37s
300:	learn: 0.3163974	test: 