In [1]:
# System imports
import copy, json, os, gc
import numpy as np, pandas as pd
import ashrae_scripts as scp
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Models
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

# Utilities
from sklearn.metrics import accuracy_score, mean_squared_log_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.externals import joblib



In [2]:
# Detect where the notebook is running by looking for 'kernel-metadata.json'
# in the working directory. The original captured `!ls -a` and then
# immediately overwrote it with the Windows-style `!dir /b`, so the first
# listing was never used, and it stored the result in a variable that shadowed
# the builtin `dir`. os.listdir gives the same membership check without
# depending on any shell command.
dir_listing = os.listdir('.')
output_compression_type = 'gzip'
if 'kernel-metadata.json' in dir_listing:
    # Local environment
    src = 'Local'
    compression_type = 'gzip'
    data_folder = '../../../data/'
    output_folder = '../../../data/'
else:
    # Kaggle environment
    src = 'Kaggle'
    compression_type = None
    data_folder = '../input/'
    output_folder = ''

# Both environments use the same layout below data_folder.
df_path = data_folder + 'ashrae-train-test-1-0/df_label_enc.csv.gz'
df_test_path = data_folder + 'ashrae-train-test-1-0/df_test_label_enc.csv.gz'

print('Environment set to [{env}]'.format(env=src))
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# Print the full path of every file found below /kaggle/input.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

Environment set to [Kaggle]
/kaggle/input/ashrae-train-test-1-0/__results__.html
/kaggle/input/ashrae-train-test-1-0/custom.css
/kaggle/input/ashrae-train-test-1-0/__notebook__.ipynb
/kaggle/input/ashrae-train-test-1-0/df_test_label_enc.csv.gz
/kaggle/input/ashrae-train-test-1-0/__output__.json
/kaggle/input/ashrae-train-test-1-0/df_label_enc.csv.gz
/kaggle/input/ashrae-energy-prediction/train.csv
/kaggle/input/ashrae-energy-prediction/building_metadata.csv
/kaggle/input/ashrae-energy-prediction/sample_submission.csv
/kaggle/input/ashrae-energy-prediction/weather_test.csv
/kaggle/input/ashrae-energy-prediction/weather_train.csv
/kaggle/input/ashrae-energy-prediction/test.csv


In [3]:
# Run-mode switches for the notebook.
debug = False            # True: read only a small sample and train fewer rounds
predict_train = False    # True: hold out a validation split and score it
predict_test = True      # True: fit on all training rows and predict the test set
generate_putput = True   # (sic) name kept as-is because the final cell reads it

# Row limit passed to read_csv (None = read everything) and boosting rounds.
rows = 1000 if debug else None
num_round = 1000 if debug else 10000

In [4]:
# Column dtypes for the label-encoded training file; narrow types keep the
# frame small in memory.
dtype = {
    'building_id': 'int16',
    'meter': 'int8',
    'site_id': 'int8',
    'primary_use': 'int8',
    'year_built': 'int16',
    'floor_count': 'int8',
    'air_temperature': 'float16',
    'cloud_coverage': 'float16',
    'dew_temperature': 'float16',
    'precip_depth_1_hr': 'float16',
    'sea_level_pressure': 'float16',
    'wind_direction': 'float16',
    'wind_speed': 'float16',
    # The target was previously read as float16, whose largest finite value is
    # 65504; any larger reading would be stored as inf (and the following cell
    # drops inf rows). float32 keeps such readings finite.
    # NOTE(review): assumes the file contains readings above 65504 - confirm.
    'meter_reading': 'float32',
    'square_feet': 'float64',
    'month': 'int8',
    'day': 'int8',
    'hour': 'int8',
    'day_of_week': 'int8',
    'weekend': 'int8',
    'night': 'int8',
}

# `rows` limits how many lines are read (None reads the whole file).
df = pd.read_csv(
    df_path,
    dtype=dtype,
    parse_dates=['timestamp'],
    nrows=rows,
)

In [5]:
# Remove, in place, every row whose target equals positive infinity.
inf_rows = df.index[df['meter_reading'].eq(np.inf)]
df.drop(index=inf_rows, inplace=True)

In [6]:
# df_temp = pd.DataFrame([[1900, 'a'], [2000, 'a'], [2100, 'b'], [2200, 'b'], [np.NaN, 'a'], [np.NaN, 'b']]
#                       , columns=['year', 'site'])
# df_temp
# grouper = 'site'
# col = 'year'
# df_temp = scp.fill_by_group('year', 'site', df_temp)
# df_temp
# df1 = df_temp.groupby(grouper)[col].mean().round(0)
# df1 = df1[df1.notnull()]
# df1 = df1.reset_index()
# df_input = df_input.merge(df1, left_on=grouper, right_on=grouper, how='left')
# col_x = col + '_x'
# col_y = col + '_y'
# print(col_x)
# df_input[col_x].fillna(df_input[col_y])
# df_input = df_input.rename(columns={col_x: col})
# df_input.drop([col_y], 1, inplace=True)
# df_temp

In [7]:
# df_subset = df[['meter_reading_000'
#                 , 'timestamp', 'site_id', 'primary_use', 'year_built']]

# corrmat = df_subset.corr()
# top_corr_features = corrmat.index
# plt.figure(figsize=(5,5))
# g=sns.heatmap(df_subset[top_corr_features].corr(),annot=True,cmap="RdYlGn")

In [8]:
# Name of the column to predict and the model input columns.
target_col = 'meter_reading'
feature_cols = [
    'building_id', 'meter', 'site_id', 'primary_use', 'year_built',
    'floor_count', 'air_temperature', 'cloud_coverage', 'dew_temperature',
    'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed',
    'square_feet', 'month', 'day', 'hour', 'day_of_week', 'weekend', 'night',
]

# Feature matrix and target vector.
X = df[feature_cols]
y = df[target_col]

# Observed target range; later cells clip predictions into [min_val, max_val].
min_val = y.min()
max_val = y.max()

In [9]:
if predict_train:
    # Hold out 20% of the rows for validation. A fixed random_state makes the
    # split (and therefore the validation score) reproducible across runs;
    # without it every execution shuffled the rows differently.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, train_size=0.8, random_state=42)

In [10]:
def rmsle(preds, train_data):
    """Root mean squared logarithmic error in LightGBM `feval` form.

    Parameters
    ----------
    preds : array-like
        Raw model predictions; they are clipped to the module-level
        [min_val, max_val] range before scoring.
    train_data : object exposing ``get_label()``
        Dataset whose labels are compared against the predictions.

    Returns
    -------
    tuple
        ``('RMSLE', score, False)`` - metric name, metric value, and a flag
        saying that higher values are not better.
    """
    clipped = np.clip(preds, min_val, max_val)
    log_gap = np.log1p(clipped) - np.log1p(train_data.get_label())
    score = np.sqrt(np.mean(np.power(log_gap, 2)))
    return 'RMSLE', score, False

In [11]:
# LightGBM training configuration shared by the validation run and the final fit.
params = dict(
    boosting_type='gbdt',
    objective='regression',
    metrics='rmse',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbose=0,
)

In [12]:
if predict_train:
    # Wrap the split produced earlier into LightGBM datasets.
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val = lgb.Dataset(X_val, y_val)
    # The custom `rmsle` feval is currently disabled, so the message now
    # describes what actually runs: the metric from `params` with early stopping.
    print('Starting training with early stopping...')
    # Early stopping is requested through the callback rather than the
    # `early_stopping_rounds` keyword: the keyword was removed from lgb.train
    # in LightGBM 4.0, while the callback works on old and new releases alike.
    gbm = lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=num_round,
        valid_sets=[lgb_val],
        # feval=rmsle,
        callbacks=[lgb.early_stopping(stopping_rounds=100)],
    )

In [13]:
if predict_train:
    # Score the hold-out rows with the best iteration found during training,
    # then force the predictions into the observed target range.
    y_pred = np.clip(
        gbm.predict(X_val, num_iteration=gbm.best_iteration),
        min_val,
        max_val,
    )

In [14]:
if predict_train:
    # Stored under its own name: assigning the score to `rmsle` would replace
    # the `rmsle` eval function defined earlier in the notebook with a float.
    val_rmsle = mean_squared_log_error(y_val, y_pred) ** 0.5
    # The value is the square root of the MSLE, so the label says "root".
    print('Root mean squared log error: {:.2f}'.format(val_rmsle))

In [15]:
# params = {
#     'boosting_type': 'gbdt',
#     'objective': 'regression',
#     'metrics': 'rmsle',
#     'num_leaves': 31,
#     'learning_rate': 0.05,
#     'feature_fraction': 0.9,
#     'bagging_fraction': 0.8,
#     'bagging_freq': 5,
#     'verbose': 0
# }
# def rmsle(y_true, y_pred):
#     y_pred = np.clip(y_pred, min_val, max_val)
#     return 'RMSLE', np.sqrt(np.mean(np.power(np.log1p(y_pred) - np.log1p(y_true), 2))), False

# gbm = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', num_boost_round=10000, bagging_freq=5)
# gbm.fit(X_train, y_train,
#         eval_set=[(X_val, y_val)],
#         eval_metric='rmsle',
#         early_stopping_rounds=100)

# y_pred_fit = gbm.predict(X_val)

In [16]:
# Release the large training objects that are no longer needed.
del df
if predict_train:
    del X_train, y_train, X_val, y_val
if not predict_test:
    # X and y are only kept when the final model still has to be fitted.
    del X, y
gc.collect()

11

In [17]:
# save model
# joblib.dump(gbm, 'lgb.pkl')
# # load model
# gbm_pickle = joblib.load('lgb.pkl')

In [18]:
if predict_test:
    # Column dtypes for the label-encoded test file (no target, plus row_id).
    dtype = dict(
        building_id='int16',
        row_id='int64',
        meter='int8',
        site_id='int8',
        primary_use='int8',
        year_built='int16',
        floor_count='int8',
        air_temperature='float16',
        cloud_coverage='float16',
        dew_temperature='float16',
        precip_depth_1_hr='float16',
        sea_level_pressure='float16',
        wind_direction='float16',
        wind_speed='float16',
        square_feet='float64',
        month='int8',
        day='int8',
        hour='int8',
        day_of_week='int8',
        weekend='int8',
        night='int8',
    )
    # `rows` limits how many lines are read (None reads the whole file).
    df_test = pd.read_csv(
        df_test_path,
        dtype=dtype,
        parse_dates=['timestamp'],
        nrows=rows,
    )

In [19]:
if predict_test:
    # Fit the submission model on every training row for num_round rounds;
    # no validation set is passed here.
    lgb_full = lgb.Dataset(data=X, label=y)
    gbm = lgb.train(params, lgb_full, num_boost_round=num_round)

In [20]:
if predict_test:
    # Distinct site ids present in the test frame, in order of first appearance.
    sites = [site_id for site_id in df_test['site_id'].unique()]

In [21]:
if predict_test:
    # Predict one site at a time and collect the per-site frames in a list;
    # they are concatenated once at the end. The previous
    # `outputs = outputs.append(...)` re-copied all accumulated rows on every
    # iteration, and DataFrame.append no longer exists in pandas 2.0+.
    site_frames = []
    for site in sites:
        print('Site [{site_no}] start...'.format(site_no=site))
        # Build the row mask once and reuse it for features and row ids.
        site_mask = df_test['site_id'] == site
        pred_site = gbm.predict(df_test.loc[site_mask, feature_cols],
                                num_iteration=gbm.best_iteration)
        # Keep predictions inside the target range seen in training.
        pred_site = np.clip(pred_site, min_val, max_val)
        site_frames.append(pd.DataFrame({'row_id': df_test.loc[site_mask, 'row_id'],
                                         'meter_reading': pred_site}))
        print('Site [{site_no}] end!'.format(site_no=site))
    # pd.concat rejects an empty list, so fall back to an empty frame as before.
    outputs = pd.concat(site_frames, ignore_index=True) if site_frames else pd.DataFrame()

Site [0] start...
Site [0] end!
Site [1] start...
Site [1] end!
Site [2] start...
Site [2] end!
Site [3] start...
Site [3] end!
Site [4] start...
Site [4] end!
Site [5] start...
Site [5] end!
Site [6] start...
Site [6] end!
Site [7] start...
Site [7] end!
Site [8] start...
Site [8] end!
Site [9] start...
Site [9] end!
Site [10] start...
Site [10] end!
Site [11] start...
Site [11] end!
Site [12] start...
Site [12] end!
Site [13] start...
Site [13] end!
Site [14] start...
Site [14] end!
Site [15] start...
Site [15] end!


In [22]:
if predict_test:
    # The test frame is no longer needed after prediction; drop the reference
    # and run a garbage-collection pass.
    del df_test
    gc.collect()

In [23]:
# `outputs` is only created by the test-prediction cell, so writing the
# submission also requires predict_test; without this guard the cell raised
# NameError whenever predict_test was switched off.
if generate_putput and predict_test:
    outputs.to_csv('submission.csv', index=False)