# Overview

1448棟のビルの4種類のメーター（電力、冷水、スチーム、温水）の値を予測するコンペである。

背景としては、エネルギー使用量（4種類のメーターの値）が正確に予測できるようになることで<br>
省エネ投資を活発化させて、環境問題に貢献したいという狙いがある。

ビルの所有者は、ビルのエネルギー効率を改善するような投資を行うことで<br>
エネルギー効率が改善された分、コストを抑えることができる。

# Module

In [None]:
import gc
import sys
import optuna
import datetime
import warnings
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

# Datasets

In [None]:
# Load the training table (train.csv) from the competition input directory.
train = pd.read_csv('../input/ashrae-energy-prediction/train.csv')

# Print the dimensions, then preview the first rows as the cell output.
print(train.shape)
train.head()

1時間ごとのメータ値を保持するテーブル（学習データ）<br>
・building_id：ビルのID<br>
・meter：0は電力、1は冷水、2はスチーム、3は温水<br>
・timestamp：日付と時刻<br>
・meter_reading：メーターの使用量（目的変数）<br>

In [None]:
# Load the test table (test.csv) from the competition input directory.
test = pd.read_csv('../input/ashrae-energy-prediction/test.csv')

# Print the dimensions, then preview the first rows as the cell output.
print(test.shape)
test.head()

1時間ごとのメータ値を保持するテーブル（テストデータ）<br>
・row_id：行のID<br>
・building_id：ビルのID<br>
・meter：0は電力、1は冷水、2はスチーム、3は温水<br>
・timestamp：日付と時刻<br>

In [None]:
# Weather columns to load; every other column of the weather CSVs is skipped.
usecols = {'site_id', 'timestamp', 'air_temperature', 'dew_temperature'}

In [None]:
# Load weather_train.csv, restricted to the columns named in `usecols`.
weather_train = pd.read_csv('../input/ashrae-energy-prediction/weather_train.csv', usecols=usecols)

# Print the dimensions, then preview the first rows as the cell output.
print(weather_train.shape)
weather_train.head()

気象情報を保持するテーブル（学習データに対応）<br>
・site_id：ビルの物理的な位置<br>
・timestamp：日付と時刻<br>
・air_temperature：気温<br>
・dew_temperature：露点温度<br>
・precip_depth_1_hr：1時間あたりの降水量（このノートブックでは読み込まない）<br>
・sea_level_pressure：海面気圧（このノートブックでは読み込まない）<br>
・wind_direction：風向（このノートブックでは読み込まない）<br>
・wind_speed：風速（このノートブックでは読み込まない）<br>

In [None]:
# Load weather_test.csv, restricted to the columns named in `usecols`.
weather_test = pd.read_csv('../input/ashrae-energy-prediction/weather_test.csv', usecols=usecols)

# Print the dimensions, then preview the first rows as the cell output.
print(weather_test.shape)
weather_test.head()

気象情報を保持するテーブル（テストデータに対応）<br>
・site_id：ビルの物理的な位置<br>
・timestamp：日付と時刻<br>
・air_temperature：気温<br>
・dew_temperature：露点温度<br>
・precip_depth_1_hr：1時間あたりの降水量（このノートブックでは読み込まない）<br>
・sea_level_pressure：海面気圧（このノートブックでは読み込まない）<br>
・wind_direction：風向（このノートブックでは読み込まない）<br>
・wind_speed：風速（このノートブックでは読み込まない）<br>

In [None]:
# Building-metadata columns to load; `usecols` is rebound here, replacing the
# weather column set defined earlier.
usecols = {'site_id', 'building_id', 'primary_use', 'square_feet', 'year_built'}

In [None]:
# Load building_metadata.csv, restricted to the columns named in `usecols`.
building = pd.read_csv('../input/ashrae-energy-prediction/building_metadata.csv', usecols=usecols)

# Print the dimensions, then preview the first rows as the cell output.
print(building.shape)
building.head()

ビルの情報を保持するテーブル<br>
・site_id：ビルの物理的な位置<br>
・building_id：ビルのID<br>
・primary_use：使用目的<br>
・square_feet：面積<br>
・year_built：竣工年（建築年）<br>
・floor_count：階数（このノートブックでは読み込まない）<br>

# Lags

In [None]:
'''
lag_cols = ['air_temperature', 'dew_temperature']

for i in lag_cols:
    weather_train['DIFF_' + i] = weather_train.groupby(by = ['site_id'])[i].shift(-1)
    weather_train['PREVIOUS_DIFF_' + i] = np.abs(weather_train[i] - weather_train['DIFF_' + i])
    weather_train = weather_train.drop(['DIFF_' + i], axis=1)
    
weather_train.head()
'''

In [None]:
'''
lag_cols = ['air_temperature', 'dew_temperature']

for i in lag_cols:
    weather_test['DIFF_' + i] = weather_test.groupby(by = ['site_id'])[i].shift(-1)
    weather_test['PREVIOUS_DIFF_' + i] = np.abs(weather_test[i] - weather_test['DIFF_' + i])
    weather_test = weather_test.drop(['DIFF_' + i], axis=1)
    
weather_test.head()
'''

# Join

In [None]:
def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` in place and return the same frame.

    Signed and unsigned integer columns are cast to the smallest signed
    integer type that holds their observed minimum and maximum.  Float
    columns are cast to float32, or to float16 when ``use_float16`` is true
    and the values fit.  Object columns become ``category``.  Every other
    dtype (bool, datetime, categorical, pandas extension dtypes, ...) is left
    untouched, and a column is never cast to a type wider than its current
    one.  Memory usage before and after is printed.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        use_float16: Allow float16 for float columns whose range fits.

    Returns:
        The same ``df`` object after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy dtypes are downcast; an empty column has no
        # min/max to measure.
        if not isinstance(col_type, np.dtype) or df[col].empty:
            continue

        if np.issubdtype(col_type, np.integer):
            candidates, limits = int_candidates, np.iinfo
            # Python ints make the bound comparisons exact for every width,
            # including unsigned 64-bit values.
            c_min, c_max = int(df[col].min()), int(df[col].max())
        elif np.issubdtype(col_type, np.floating):
            candidates, limits = float_candidates, np.finfo
            # NaN (all-missing column) fails every comparison below, so such
            # a column keeps its dtype.
            c_min, c_max = df[col].min(), df[col].max()
        else:
            # bool, datetime64, timedelta64, complex, ...: leave as they are.
            continue

        for candidate in candidates:
            bounds = limits(candidate)
            # Inclusive bounds: a value equal to the type's min/max still fits.
            if bounds.min <= c_min and c_max <= bounds.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# The two large frames are deliberately not shrunk here (calls disabled).
# train = reduce_mem_usage(train)
# test = reduce_mem_usage(test)

# Shrink the lookup tables before they are joined onto train/test.
weather_train = reduce_mem_usage(weather_train)
weather_test = reduce_mem_usage(weather_test)

building = reduce_mem_usage(building)

In [None]:
# Attach the building attributes to every meter row; the left join keeps all
# rows of train/test.
train = pd.merge(train, building, how='left', on='building_id')
test = pd.merge(test, building, how='left', on='building_id')

print(train.shape)
train.head()

In [None]:
# The building table has been merged into train/test; release it.
del building
gc.collect()

In [None]:
# Attach the weather observations by site and timestamp; the left join keeps
# all rows of train/test.
weather_keys = ['site_id', 'timestamp']
train = pd.merge(train, weather_train, how='left', on=weather_keys)
test = pd.merge(test, weather_test, how='left', on=weather_keys)
del weather_keys

print(train.shape)
train.head()

In [None]:
# The weather tables have been merged into train/test; release them.
del weather_train, weather_test
gc.collect()

In [None]:
# train = reduce_mem_usage(train)
# test = reduce_mem_usage(test)

In [None]:
# Renumber the rows 0..n-1, discarding the previous index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Print the size of each frame as reported by sys.getsizeof.
print(sys.getsizeof(train))
print(sys.getsizeof(test))

In [None]:
# Per-column memory usage of the training frame.
train.memory_usage()

In [None]:
# Per-column memory usage of the test frame.
test.memory_usage()

In [None]:
# Dimensions and first rows of the joined training frame.
print(train.shape)
train.head()

# Analytics

In [None]:
# Column dtypes and overall summary of the training frame.
train.info()

In [None]:
# Number of distinct values in each column.
train.nunique()

In [None]:
# Number of missing values in each column.
train.isnull().sum()

In [None]:
# Pairwise correlations between the numeric columns only.  At this point the
# frame still holds non-numeric columns (the raw 'timestamp' strings and the
# categorical 'primary_use'), which recent pandas versions refuse to
# correlate, so they are excluded explicitly.
train.select_dtypes(include='number').corr()

# Visualization

In [None]:
# Apply seaborn's default plot styling to the figures drawn below.
sns.set()

In [None]:
# Parse the timestamp columns into datetimes; the time-based resampling and
# the `.dt` accessors used further down need a datetime dtype.
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [None]:
fig, axes = plt.subplots(figsize=(8, 4))
# Mean meter reading over time, resampled per hour and per day on one axes.
readings_by_time = train[['timestamp', 'meter_reading']].set_index('timestamp')
for rule, curve_label in (('H', 'hour'), ('D', 'day')):
    averaged = readings_by_time.resample(rule).mean()['meter_reading']
    averaged.plot(ax=axes, label=curve_label, alpha=0.8).set_ylabel('Meter reading', fontsize=14)
# Remove the temporaries so the notebook namespace is left as before.
del readings_by_time, rule, curve_label, averaged
axes.legend()

In [None]:
fig, axes = plt.subplots(figsize=(8, 4))
# Mean meter reading over time, resampled per week and per month on one axes.
readings_by_time = train[['timestamp', 'meter_reading']].set_index('timestamp')
for rule, curve_label in (('W', 'week'), ('M', 'month')):
    averaged = readings_by_time.resample(rule).mean()['meter_reading']
    averaged.plot(ax=axes, label=curve_label, alpha=1).set_ylabel('Meter reading', fontsize=14)
# Remove the temporaries so the notebook namespace is left as before.
del readings_by_time, rule, curve_label, averaged
axes.legend()

# Outliers

In [None]:
def to_season(month_num):
    """Return the season name for a month number.

    Months 3-5 map to 'spring', 6-8 to 'summer' and 9-11 to 'autumn'; any
    other value (including 12, 1 and 2) maps to 'winter'.
    """
    season_ranges = (
        (3, 5, 'spring'),
        (6, 8, 'summer'),
        (9, 11, 'autumn'),
    )
    for first_month, last_month, name in season_ranges:
        if first_month <= month_num <= last_month:
            return name
    return 'winter'

In [None]:
# Calendar month of each reading, and the season that month falls in.
train['month'] = train['timestamp'].dt.month
train['season'] = train['month'].map(to_season)

In [None]:
# Remove meter-0 rows whose reading is exactly zero.
zero_rows = (train['meter'] == 0) & (train['meter_reading'] == 0)
train = train.drop(index=train.index[zero_rows])
del zero_rows

In [None]:
# Remove meter-1 rows whose reading is exactly zero in spring, summer or
# autumn (zero readings in winter are kept).
zero_rows = (
    (train['meter'] == 1)
    & (train['meter_reading'] == 0)
    & train['season'].isin(['spring', 'summer', 'autumn'])
)
train = train.drop(index=train.index[zero_rows])
del zero_rows

In [None]:
# Remove meter-2 rows whose reading is exactly zero in spring, autumn or
# winter (zero readings in summer are kept).
zero_rows = (
    (train['meter'] == 2)
    & (train['meter_reading'] == 0)
    & train['season'].isin(['spring', 'autumn', 'winter'])
)
train = train.drop(index=train.index[zero_rows])
del zero_rows

In [None]:
# Remove meter-3 rows whose reading is exactly zero in spring, autumn or
# winter (zero readings in summer are kept).
zero_rows = (
    (train['meter'] == 3)
    & (train['meter_reading'] == 0)
    & train['season'].isin(['spring', 'autumn', 'winter'])
)
train = train.drop(index=train.index[zero_rows])
del zero_rows

In [None]:
# 'season' was only needed for the zero-reading filters; drop it and renumber
# the rows after the removals.
train = train.drop('season', axis=1).reset_index(drop=True)
gc.collect()

# Downcast the remaining columns.
train = reduce_mem_usage(train)

In [None]:
# Dimensions and first rows after the zero-reading rows were removed.
print(train.shape)
train.head()

# Feature Engineering

In [None]:
# Hour of day of each reading; the day and weekday features are disabled.
train['hour'] = train['timestamp'].dt.hour
#train['day'] = train['timestamp'].dt.day
#train['weekend'] = train['timestamp'].dt.weekday

In [None]:
# Remove the raw timestamp (its month and hour are already separate columns)
# and site_id from the training features.
train = train.drop(['timestamp', 'site_id'], axis=1)
gc.collect()

train = reduce_mem_usage(train)

In [None]:
# Summary statistics of the target, one value per building_id.
# NOTE(review): the grouping key is building_id only, so readings from all
# meter types of a building are pooled, and the statistics are computed from
# the target column itself - confirm both are intended.
df_group = train.groupby('building_id')['meter_reading']

building_mean = df_group.mean()
building_median = df_group.median()
building_std = df_group.std()

building_min = df_group.min()
building_max = df_group.max()

In [None]:
# Attach each per-building statistic to the training rows, looked up by
# building_id.
for feature_name, per_building in (
    ('building_mean', building_mean),
    ('building_median', building_median),
    ('building_std', building_std),
    ('building_min', building_min),
    ('building_max', building_max),
):
    train[feature_name] = train['building_id'].map(per_building)
# Remove the loop variables so the notebook namespace is left as before.
del feature_name, per_building

In [None]:
# The groupby object is no longer needed; release it and downcast the new
# feature columns.
del df_group
gc.collect()

train = reduce_mem_usage(train)

In [None]:
# Dimensions and first rows after feature engineering.
print(train.shape)
train.head()

# Preprocess

In [None]:
# drop_col = ['precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed', 'floor_count']      

In [None]:
# train = train.drop(drop_col, axis = 1)
# gc.collect()

In [None]:
# Replace square_feet with its natural logarithm in both frames.
# NOTE(review): np.log gives -inf for 0 - presumably square_feet is always
# positive; confirm.
train['square_feet'] = np.log(train['square_feet'])
test['square_feet'] = np.log(test['square_feet'])

In [None]:
le = LabelEncoder()

# Learn the label -> integer mapping once, on the training data, and reuse it
# unchanged for the test data.  Refitting on test would rebuild the mapping
# from test's own labels, so the same primary_use could receive a different
# code there than the one the model is trained on.  `transform` raises
# ValueError on a label that was absent from train, which surfaces the
# mismatch instead of hiding it.
train['primary_use'] = le.fit_transform(train['primary_use'])
test['primary_use'] = le.transform(test['primary_use'])

In [None]:
# Model the target on the log1p scale (predictions are mapped back with
# expm1 when the submission is built), then remove it from the features.
target = np.log1p(train['meter_reading'])
train = train.drop(['meter_reading'], axis=1)

In [None]:
# The encoder is not used after this point; release it.
del le
gc.collect()

In [None]:
# Downcast both frames after preprocessing.
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
# Renumber the rows 0..n-1, discarding the previous index.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Print the size of each frame as reported by sys.getsizeof.
print(sys.getsizeof(train))
print(sys.getsizeof(test))

In [None]:
# Dimensions and first rows of the final training features.
print(train.shape)
train.head()

# Modeling

In [None]:
'''
X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size=0.2, random_state=666)

def create_model(trial):
    num_leaves = trial.suggest_int('num_leaves', 2, 30)
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    learning_rate = trial.suggest_uniform('learning_rate', 0.0001, 0.99)
    max_depth = trial.suggest_int('max_depth', 2, 10)
    min_child_samples = trial.suggest_int('min_child_samples', 100, 1200)
    min_data_in_leaf = trial.suggest_int('min_data_in_leaf', 5, 90)
    bagging_freq = trial.suggest_int('bagging_freq', 1, 7)
    bagging_fraction = trial.suggest_uniform('bagging_fraction', 0.0001, 1.0)
    feature_fraction = trial.suggest_uniform('feature_fraction', 0.0001, 1.0)
    subsample = trial.suggest_uniform('subsample', 0.1, 1.0)
    colsample_bytree = trial.suggest_uniform('colsample_bytree', 0.1, 1.0)
    
    model = lgb.LGBMRegressor(
        num_leaves=num_leaves,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth, 
        min_child_samples=min_child_samples, 
        min_data_in_leaf=min_data_in_leaf,
        bagging_freq=bagging_freq,
        bagging_fraction=bagging_fraction,
        feature_fraction=feature_fraction,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        metric='rsme',
        random_state=666)
    return model

def objective(trial):
    model = create_model(trial)
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_val)
    rsme = np.sqrt(mean_squared_error(y_pred, y_val))
    return rsme

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=40)
params = study.best_params
print(params)
'''

In [None]:
# Fixed hyper-parameters for the LightGBM regressor.
# NOTE(review): LightGBM documents min_child_samples as an alias of
# min_data_in_leaf, subsample as an alias of bagging_fraction, and
# colsample_bytree as an alias of feature_fraction.  Each pair is given two
# different values here, so only one value per pair can take effect -
# confirm which is intended and drop the other.
params = {'num_leaves': 30,
          'n_estimators': 279,
          'learning_rate': 0.41293292317272395,
          'max_depth': 9,
          'min_child_samples': 118,
          'min_data_in_leaf': 74,
          'bagging_freq': 1,
          'bagging_fraction': 0.34909235383110854,
          'feature_fraction': 0.7673641507626504,
          'subsample': 0.7826605676058075,
          'colsample_bytree': 0.31499443536061944,
          'random_state': 666}

In [None]:
# Fit a LightGBM regressor on the full training frame against the
# log1p-transformed target; no validation split is held out here.
cls = lgb.LGBMRegressor(**params)
cls.fit(train, target)

In [None]:
# The training data is no longer needed once the model is fitted; release it.
del train, target, params
gc.collect()

In [None]:
# Plot the fitted model's feature importances.
lgb.plot_importance(cls)
plt.show()

# Submit

In [None]:
# Keep row_id aside (it later indexes the submission) and remove it from the
# feature columns.
row_id = test['row_id']

test = test.drop(['row_id'], axis = 1)
gc.collect()

In [None]:
# Create the calendar features in the same order as on the training frame
# ('month' first, then 'hour').  The test frame is passed to cls.predict as
# is, so its column order has to match the order the model was fitted with;
# adding 'hour' before 'month' would feed the two features swapped.
test['month'] = test['timestamp'].dt.month
test['hour'] = test['timestamp'].dt.hour
# test['day'] = test['timestamp'].dt.day
# test['weekend'] = test['timestamp'].dt.weekday

In [None]:
# Remove the same raw columns that were removed from the training features.
test = test.drop(['timestamp', 'site_id'], axis=1)
gc.collect()

In [None]:
# Attach the per-building statistics computed on the training data to the
# test rows, looked up by building_id.
for feature_name, per_building in (
    ('building_mean', building_mean),
    ('building_median', building_median),
    ('building_std', building_std),
    ('building_min', building_min),
    ('building_max', building_max),
):
    test[feature_name] = test['building_id'].map(per_building)
# Remove the loop variables so the notebook namespace is left as before.
del feature_name, per_building

In [None]:
# All five per-building statistics have been mapped onto the test frame;
# release every one of them, including min and max.
del building_mean, building_median, building_std, building_min, building_max
gc.collect()

In [None]:
# Renumber row_id and the test features identically (0..n-1), then downcast
# the test features.
row_id = row_id.reset_index(drop=True)
test = test.reset_index(drop=True)

test = reduce_mem_usage(test)

In [None]:
# Show the ten largest names in the notebook namespace by sys.getsizeof.
# The names are looked up in globals() directly rather than through eval.
print(
    pd.DataFrame(
        [
            sorted(globals()),
            [sys.getsizeof(globals()[name]) for name in sorted(globals())],
        ],
        index=['name', 'size'],
    )
    .T
    .sort_values('size', ascending=False)
    .reset_index(drop=True)
    .head(10)
)

In [None]:
# Dimensions and first rows of the final test features.
print(test.shape)
test.head()

In [None]:
# The model was fitted on log1p(meter_reading); map predictions back with
# expm1.
target = np.expm1(cls.predict(test))
# A negative raw prediction stays negative after expm1.  Energy usage cannot
# be negative, so floor the submitted readings at zero.
target = np.clip(target, 0, None)

# row_id becomes the index, so it is written out as the first CSV column.
submission = pd.DataFrame(target, index=row_id, columns=['meter_reading'])
submission.head(10)

In [None]:
# row_id now lives on as the submission index; release the separate copy and
# downcast the predictions.
del row_id
gc.collect()

submission = reduce_mem_usage(submission)

In [None]:
# Write the submission file; the row_id index is written along with the
# meter_reading column.
submission.to_csv('ASHRAE_submit.csv')