In [1]:
import os
import random
import warnings
from pathlib import Path

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import GroupKFold, KFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the pickled input tables. Set the ASHRAE_DATA_DIR
# environment variable to run the notebook on another machine; the original
# location is kept as the fallback so existing runs behave exactly as before.
root_path = Path(os.environ.get(
    'ASHRAE_DATA_DIR',
    '/home/zhouzr/project/competition/Kaggle-ASHRAE/data/',
))

# NOTE: unpickling can execute code stored in the file, so only point
# root_path at pickles you produced yourself.
train = pd.read_pickle(root_path / 'train.pkl')
test = pd.read_pickle(root_path / 'test.pkl')
weather_train = pd.read_pickle(root_path / 'weather_train.pkl')
weather_test = pd.read_pickle(root_path / 'weather_test.pkl')
meta = pd.read_pickle(root_path / 'building_metadata.pkl')
sample_submission = pd.read_pickle(root_path / 'sample_submission.pkl')

## Config

In [3]:
# Notebook-wide switches.
# NOTE(review): `seed` is not passed into `param` below - confirm whether it is
# consumed elsewhere in the notebook.
seed = 42
use_log1p_target = True   # train on log1p(meter_reading) instead of the raw target
n_folds = 2

# LightGBM training parameters handed to cv().
param = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'rmse'},
    'subsample': 0.4,
    'subsample_freq': 1,
    'learning_rate': 0.25,
    'num_leaves': 31,
    'feature_fraction': 0.8,
    'lambda_l1': 1,
    'lambda_l2': 1,
}

# Model inputs, split by how they are treated during training.
categorical_features = ['site_id', 'building_id', 'primary_use', 'hour', 'weekday', 'meter', 'wind_direction']
numerical_features = ['square_feet', 'year_built', 'air_temperature', 'cloud_coverage',
                      'dew_temperature', 'precip_depth_1_hr', 'floor_count']

# Numerical columns first, then categorical ones (the earlier empty-list
# placeholder for `features` was dead code and has been dropped).
features = numerical_features + categorical_features

## Metrics

In [4]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Negative entries of either input are clamped to zero before scoring,
    because ``mean_squared_log_error`` rejects negative values. The clamping
    is done on newly allocated arrays, so the caller's objects are never
    modified (the previous version overwrote them in place).

    Parameters
    ----------
    y_true : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        Square root of ``mean_squared_log_error`` on the clamped inputs.
    """
    # np.maximum returns a new array, which is what protects the inputs.
    clamped_true = np.maximum(np.asarray(y_true), 0)
    clamped_pred = np.maximum(np.asarray(y_pred), 0)
    return np.sqrt(mean_squared_log_error(clamped_true, clamped_pred))

def lgb_rmsle(y_pred, dataset):
    """Custom LightGBM evaluation function that reports RMSLE.

    When the notebook-level ``use_log1p_target`` flag is true, both the
    predictions and the labels are mapped back with ``expm1`` before scoring,
    so the score is computed on the untransformed target.

    Parameters
    ----------
    y_pred : numpy.ndarray
        Predictions handed over by LightGBM.
    dataset : lightgbm.Dataset
        Dataset whose labels the predictions are compared against.

    Returns
    -------
    tuple
        ``('rmsle', score, False)`` - metric name, metric value and the
        "higher is better" flag.
    """
    # Copy both arrays: the prediction buffer and the label array belong to
    # LightGBM and must not be altered by the clamping done while scoring.
    y_true = np.array(dataset.get_label())
    y_pred = np.array(y_pred)
    if use_log1p_target:
        y_pred = np.expm1(y_pred)
        y_true = np.expm1(y_true)
    # rmsle() clamps negatives itself, so no extra clipping is needed here.
    return 'rmsle', rmsle(y_true, y_pred), False

## Processing

In [5]:
# Encode `primary_use` as integer class codes. The fitted encoder is kept in
# `primary_use_encoder` so the codes can be mapped back to labels later.
primary_use_encoder = LabelEncoder().fit(meta['primary_use'])
meta['primary_use'] = primary_use_encoder.transform(meta['primary_use'])

In [6]:
# Put the target on the log1p scale when enabled. The column is overwritten,
# so running this cell a second time would apply the transform again.
if use_log1p_target:
    train['meter_reading'] = train['meter_reading'].pipe(np.log1p)

In [7]:
# Attach building metadata (by building_id) and then the matching weather
# table (by site_id + timestamp) to each split, keeping every reading row.
train = (
    train
    .merge(meta, how='left', on='building_id')
    .merge(weather_train, how='left', on=['site_id', 'timestamp'])
)
test = (
    test
    .merge(meta, how='left', on='building_id')
    .merge(weather_test, how='left', on=['site_id', 'timestamp'])
)

In [8]:
def transform(x):
    """Add int8 ``weekday``, ``hour`` and ``month`` columns taken from ``x.timestamp``.

    The frame is modified in place and the same object is returned.
    """
    stamp = x.timestamp.dt
    # Same three calendar fields, created in the same column order.
    for field in ('weekday', 'hour', 'month'):
        x[field] = np.int8(getattr(stamp, field))
    return x

# Derive the calendar columns on both splits (train first, then test).
train, test = transform(train), transform(test)

## CV

In [55]:
def cv(df, features, categorical_features, n_folds, param, verbose=50,
       num_boost_round=500, early_stopping_rounds=50):
    """Cross-validate a LightGBM model using month-based folds.

    Months 1-12 are cut into ``n_folds`` equal-width bins and every row is
    grouped by the bin of its ``timestamp`` month, so ``GroupKFold`` never
    places rows from the same month bin on both sides of a split.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``timestamp`` (datetime), ``building_id``,
        ``meter_reading`` and every column named in ``features``.
    features : list of str
        Columns fed to the model.
    categorical_features : list of str
        Columns declared as categorical to LightGBM.
    n_folds : int
        Number of folds (and of month bins).
    param : dict
        LightGBM training parameters.
    verbose : int, default 50
        Forwarded to ``lgb.train`` as ``verbose_eval``.
    num_boost_round : int, default 500
        Maximum number of boosting rounds per fold (previously hard-coded).
    early_stopping_rounds : int, default 50
        Forwarded to ``lgb.train`` (previously hard-coded).

    Returns
    -------
    tuple
        ``(train_scores, valid_scores, models)``: RMSLE on the training part
        of each fold, RMSLE on the held-out part of each fold, and the fitted
        boosters, all in fold order.

    Notes
    -----
    Reads the notebook-level ``use_log1p_target`` flag: when it is true,
    predictions and targets are mapped back with ``expm1`` before scoring.
    """
    def _score(model, X, y):
        # Shared by the train and validation scoring below.
        preds = model.predict(X)
        if use_log1p_target:
            preds = np.expm1(preds)
            y = np.expm1(y)
        return rmsle(y, preds)

    kf = GroupKFold(n_splits=n_folds)
    months = np.arange(1, 13)
    group_map = dict(zip(months, pd.cut(months, n_folds, labels=np.arange(n_folds))))
    group = df.timestamp.dt.month.map(group_map)
    target = df['meter_reading']

    models = []
    train_scores = []
    valid_scores = []

    for train_index, val_index in kf.split(df, df['building_id'], groups=group):
        train_X, train_y = df[features].iloc[train_index], target.iloc[train_index]
        val_X, val_y = df[features].iloc[val_index], target.iloc[val_index]

        lgb_train = lgb.Dataset(train_X, train_y, categorical_feature=categorical_features)
        lgb_eval = lgb.Dataset(val_X, val_y, categorical_feature=categorical_features)
        gbm = lgb.train(param,
                        lgb_train,
                        num_boost_round=num_boost_round,
                        valid_sets=(lgb_train, lgb_eval),
                        early_stopping_rounds=early_stopping_rounds,
                        verbose_eval=verbose, feval=lgb_rmsle)

        train_scores.append(_score(gbm, train_X, train_y))
        valid_scores.append(_score(gbm, val_X, val_y))
        models.append(gbm)

    # Three identical banner lines, as before, followed by the mean scores.
    for _ in range(3):
        print('-' * 40 + 'cv finished!' + '-' * 40)
    print(f'train: {np.mean(train_scores): .3f}, valid: {np.mean(valid_scores): .3f}')
    return train_scores, valid_scores, models

In [56]:
# Use the fold count from the config cell rather than a separate literal,
# so changing `n_folds` there actually changes this run.
train_scores, valid_scores, models = cv(train, features, categorical_features, n_folds, param)

Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.919538	training's rmsle: 0.919063	valid_1's rmse: 1.35733	valid_1's rmsle: 1.35524
[100]	training's rmse: 0.87654	training's rmsle: 0.875829	valid_1's rmse: 1.35148	valid_1's rmsle: 1.34877
[150]	training's rmse: 0.850931	training's rmsle: 0.850021	valid_1's rmse: 1.35102	valid_1's rmsle: 1.34781
[200]	training's rmse: 0.835636	training's rmsle: 0.834609	valid_1's rmse: 1.3491	valid_1's rmsle: 1.34545
[250]	training's rmse: 0.820568	training's rmsle: 0.819366	valid_1's rmse: 1.34951	valid_1's rmsle: 1.34518
Early stopping, best iteration is:
[205]	training's rmse: 0.834251	training's rmsle: 0.833211	valid_1's rmse: 1.34837	valid_1's rmsle: 1.34471
Training until validation scores don't improve for 50 rounds
[50]	training's rmse: 0.983034	training's rmsle: 0.982909	valid_1's rmse: 1.52948	valid_1's rmsle: 1.52938
[100]	training's rmse: 0.91699	training's rmsle: 0.916576	valid_1's rmse: 1.51576	valid_1's