In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import datetime,random
import warnings
import seaborn as sns
import matplotlib.pyplot as plt

from time import time
from scipy.stats import skew, boxcox, mstats
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.linear_model import BayesianRidge
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import cross_val_score
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

In [None]:
# Default seed used by seed_env().
SEED = 42
# Fold count; in this notebook it is only referenced by a commented-out KFold line.
FOLDS = 3

def seed_env(seed=SEED):
    """Seed the process-wide random sources with `seed`.

    Sets the PYTHONHASHSEED environment variable (as a string) and seeds both
    Python's `random` module and NumPy's global generator. Returns None.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Raise pandas' display limits to 380 columns and 500 rows.
pd.set_option('display.max_columns', 380)
pd.set_option('display.max_rows', 500)
# Seed the random sources with the default SEED.
seed_env()

# Apply seaborn's default plot styling.
sns.set()
%matplotlib inline

## Feature engineering

In [None]:
def air_temp_fe(df):
    """Add air-temperature aggregate features to `df` in place.

    Requires the columns building_id, site_id, month, day and air_temperature.
    Adds three float16 columns, in this order:
      * mean_day_temp     - mean air temperature per (building_id, month, day)
      * site_id_mean_temp - mean air temperature per (site_id, month)
      * degree_day        - rounded midpoint of the group's max and min
                            temperature, minus a nominal 18 degrees
    Returns None.

    NOTE(review): the grouping keys do not include the year, so if `df` spans
    several years the same calendar day is pooled across them - confirm intended.
    """
    # Degree-day background:
    # https://en.wikipedia.org/wiki/Degree_day
    # https://www.sciencedirect.com/topics/engineering/heating-degree-day
    nominal_degree = 18

    daily_temp = df.groupby(['building_id', 'month', 'day'])['air_temperature']
    # The daily extremes are only needed for degree_day, so they stay local
    # instead of being written to `df` and dropped again.
    day_min = daily_temp.transform('min').astype(np.float16)
    day_max = daily_temp.transform('max').astype(np.float16)

    df['mean_day_temp'] = daily_temp.transform('mean').astype(np.float16)
    df['site_id_mean_temp'] = (
        df.groupby(['site_id', 'month'])['air_temperature']
        .transform('mean')
        .astype(np.float16)
    )
    df['degree_day'] = (np.round((day_max + day_min) / 2) - nominal_degree).astype(np.float16)

def sea_level_pressure_fe(df):
    """Add `sea_level_pressure_month_mean` to `df` in place.

    The new float16 column holds the mean sea_level_pressure of each
    (building_id, month) group. Returns None.
    """
    # Monthly max/min variants and a pressure * temperature feature (air
    # temperature and sea level pressure are used in altitude calculation,
    # https://keisan.casio.com/exec/system/1224585971) are currently disabled.
    monthly_pressure = df.groupby(['building_id', 'month'])['sea_level_pressure']
    df['sea_level_pressure_month_mean'] = monthly_pressure.transform('mean').astype(np.float16)
    

def add_lag_feature(df, window=3):
    """Add a per-site rolling mean of air_temperature to `df` in place.

    For every column in the internal `cols` list a new float32 column named
    `<col>_mean_lag<window>` is created. It holds the mean of the current row
    and up to `window - 1` preceding rows of the same site_id, in `df`'s row
    order (min_periods=0, so the first rows of a site use what is available).
    Values are rounded through float16 before being stored as float32.

    Parameters
    ----------
    df : DataFrame with `site_id` and `air_temperature` columns. Its index must
        be unique, because results are aligned back to rows by index label.
    window : rolling window length in rows.

    Returns None.
    """
    cols = ['air_temperature']
    rolled = df.groupby('site_id')[cols].rolling(window=window, min_periods=0)
    # The rolling result is indexed by (site_id, original row label). Drop only
    # the site_id level so the original labels are kept: a bare reset_index()
    # would renumber the rows in group order, and the assignment below would
    # then pair each row of `df` with another row's rolling mean whenever `df`
    # is not already sorted by site_id.
    lag_mean = rolled.mean().reset_index(level=0, drop=True)

    for col in cols:
        df[f'{col}_mean_lag{window}'] = lag_mean[col].astype(np.float16).astype(np.float32)

    del lag_mean, rolled
    gc.collect()
    

In [None]:
def set_localtime(df):
    """Shift `df['timestamp']` back by a fixed number of hours per site, in place.

    Rows whose site_id is not listed in the table below are left unchanged.
    Returns None.
    """
    # site_id -> number of hours subtracted from that site's timestamps
    hours_behind = {
        0: 4, 1: 0, 2: 7, 3: 4, 4: 7, 5: 0, 6: 4, 7: 4,
        8: 4, 9: 5, 10: 7, 11: 4, 12: 0, 13: 5, 14: 4, 15: 4,
    }
    for site, hours in hours_behind.items():
        at_site = df['site_id'] == site
        shifted = df.loc[at_site, 'timestamp'] - pd.offsets.Hour(hours)
        df.loc[at_site, 'timestamp'] = shifted
        
def time_features(df):
    """Add int8 `month`, `day`, `hour` and `weekday` columns derived from
    `df['timestamp']`, in place. Returns None."""
    stamp = df['timestamp'].dt
    for part in ('month', 'day', 'hour', 'weekday'):
        df[part] = getattr(stamp, part).astype(np.int8)

In [None]:
def building_features(df,df_group):
    """Attach per-building statistics of a grouped series to `df`, in place.

    `df_group` is a grouped series whose group keys are matched against
    `df['building_id']`. Its median, mean and standard deviation are mapped
    onto the rows of `df` as the float16 columns `building_median`,
    `building_mean` and `building_std` (added in that order). Building ids
    without a matching group get NaN. Returns None.
    """
    per_building = {
        'building_median': df_group.median(),
        'building_mean': df_group.mean(),
        'building_std': df_group.std(),
    }
    for column, stat in per_building.items():
        df[column] = df['building_id'].map(stat.astype(np.float16)).astype(np.float16)

    del per_building
    gc.collect()

## Modelling

In [None]:
# Column names passed as `categorical_feature` to the LightGBM datasets built in train().
categorical_features = ["primary_use","building_id","month","day","hour","weekday","site_id"]

In [None]:
def train(train_df, target, splits):
    """Fit one LightGBM regression model per (train, validation) split.

    Parameters
    ----------
    train_df : feature frame; rows are selected positionally with `.iloc`.
    target : target series aligned with `train_df`, also selected with `.iloc`.
    splits : iterable of pairs; element 0 holds the training row positions and
        element 1 the validation row positions.

    Returns
    -------
    list of trained boosters, one per split, in the order of `splits`.
    """
    params = dict(
        objective="regression",
        boosting="gbdt",
        num_leaves=41,
        max_depth=6,
        learning_rate=0.05,
        feature_fraction=0.8,
        bagging_fraction=0.33,
        bagging_freq=5,
        metric="rmse",
        reg_lambda=2,
    )

    def as_dataset(rows):
        # Build the LightGBM dataset for the given row positions.
        return lgb.Dataset(train_df.iloc[rows], target.iloc[rows],
                           categorical_feature=categorical_features)

    lgb_models = []
    for split in splits:
        lgb_train = as_dataset(split[0])
        lgb_eval = as_dataset(split[1])
        # Up to 1600 rounds, stopping early after 200 rounds without
        # improvement; metrics are logged every 200 rounds.
        booster = lgb.train(
            params,
            lgb_train,
            valid_sets=(lgb_train, lgb_eval),
            valid_names=["train","valid"],
            num_boost_round=1600,
            early_stopping_rounds=200,
            verbose_eval=200,
        )
        lgb_models.append(booster)

        # Release the per-fold datasets before building the next ones.
        del lgb_train, lgb_eval
        gc.collect()

    return lgb_models

In [None]:
def visualize_feature_importance(model,columns):
    """Show a horizontal bar chart of `model.feature_importance()` per feature.

    `columns` supplies the feature names, paired positionally with the
    importance values; bars are ordered from largest to smallest value.
    """
    ranked = sorted(zip(model.feature_importance(), columns), reverse=True)
    feature_imp = pd.DataFrame(ranked, columns=["Value","Feature"])
    plt.figure(figsize=(10,10))
    sns.barplot(data=feature_imp, x='Value', y='Feature')
    plt.show()

## Training

In [None]:
def df_transformation(df, group):
    """Prepare a readings frame in place.

    Parses `timestamp`, narrows `meter` to int8 and `building_id` to int16,
    then adds the calendar columns (time_features) and the per-building
    statistics taken from `group` (building_features). Returns None.
    """
    df['timestamp'] = pd.to_datetime(df['timestamp'], format="%Y-%m-%d %H:%M:%S")
    for column, dtype in (("meter", np.int8), ("building_id", np.int16)):
        df[column] = df[column].astype(dtype)
    time_features(df)
    building_features(df, group)

def weather_transformation(weather):
    """Parse the weather frame's `timestamp` column and shift it per site
    with set_localtime, in place. Returns None."""
    parsed = pd.to_datetime(weather['timestamp'], format="%Y-%m-%d %H:%M:%S")
    weather['timestamp'] = parsed
    set_localtime(weather)
    # The lag feature is currently disabled:
    #add_lag_feature(weather)
    
def building_transformation(building):
    """Narrow the dtypes of the building metadata columns in place:
    primary_use and floor_count to int8, year_built to int16. Returns None."""
    narrow_dtypes = {
        "primary_use": np.int8,
        "floor_count": np.int8,
        "year_built": np.int16,
    }
    for column, dtype in narrow_dtypes.items():
        building[column] = building[column].astype(dtype)
    
def memory_adj(df):
    """Narrow `site_id` to int8 and `square_feet` to int32, in place. Returns None."""
    for column, dtype in (('site_id', np.int8), ('square_feet', np.int32)):
        df[column] = df[column].astype(dtype)

In [None]:
# Load the training readings and put the target on a log1p scale
# (predictions are mapped back with expm1 when the submission is filled).
train_df = pd.read_feather('../input/ashre-data-clean-leak-for-train/train_no_nan_s.feather')
train_df['meter_reading'] = np.log1p(train_df['meter_reading'])
# Per-building groups of the log-scaled target; building_features() takes its
# mean/median/std from them. The same object is reused later for the test frame.
df_group = train_df.groupby('building_id')['meter_reading']
df_transformation(train_df,df_group)

# Weather and building metadata that are merged onto the readings per meter type.
weather_df = pd.read_feather('../input/ashre-data-clean-leak-for-train/weather_intr_no_nan_s.feather')
building_metadata_df = pd.read_feather('../input/ashre-data-clean-leak-for-train/building_no_nan_s.feather')

weather_transformation(weather_df)
building_transformation(building_metadata_df)

In [None]:
def df_for_meter_type(meter):
    """Build the training inputs for one meter type.

    Filters the global `train_df` to rows with the given `meter`, left-joins
    the building metadata and weather frames, adds the temperature and
    pressure features and drops the raw weather columns.

    Returns
    -------
    (meter_df, target, splits)
        meter_df : feature frame without meter_reading, timestamp and meter.
        target   : the meter_reading column of the merged frame.
        splits   : two (train_index, validation_index) pairs split by year:
                   2016 vs 2017-2018, and 2016-2017 vs 2018.
    """
    frame = (
        train_df[train_df['meter'] == meter]
        .merge(building_metadata_df, on='building_id', how='left')
        .merge(weather_df, on=['site_id', 'timestamp'], how='left')
    )

    air_temp_fe(frame)
    sea_level_pressure_fe(frame)
    frame.drop(['air_temperature','sea_level_pressure','wind_direction','wind_speed'], axis = 1, inplace = True)

    year = frame['timestamp'].dt.year

    def rows_in(*years):
        # Index labels of the rows whose timestamp falls in any of `years`.
        return frame.index[year.isin(years).values]

    splits = [
        (rows_in(2016), rows_in(2017, 2018)),
        (rows_in(2016, 2017), rows_in(2018)),
    ]

    target = frame['meter_reading']
    meter_df = frame.drop(['meter_reading','timestamp','meter'], axis = 1)
    memory_adj(meter_df)

    return meter_df, target, splits

In [None]:
%%time
# Build the feature frame, target and year-based splits for meter type 0.
meter_0_df, target, splits = df_for_meter_type(0)
# Last expression of the cell: show the resulting frame.
meter_0_df

In [None]:
# Train one model per split for meter type 0.
meter_0_models = train(meter_0_df,target,splits)
# Keep the feature names for the plots below; the frame itself is deleted next.
columns = meter_0_df.columns

del meter_0_df,target,splits
gc.collect()

# Feature importance of each of the two fold models.
visualize_feature_importance(meter_0_models[0],columns)
visualize_feature_importance(meter_0_models[1],columns)

del columns
gc.collect()

In [None]:
%%time
# Build the feature frame, target and year-based splits for meter type 1.
meter_1_df, target, splits = df_for_meter_type(1)
# Last expression of the cell: show the resulting frame.
meter_1_df

In [None]:
# Train one model per split for meter type 1.
meter_1_models = train(meter_1_df,target,splits)
# Keep the feature names for the plots below; the frame itself is deleted next.
columns = meter_1_df.columns

del meter_1_df,target,splits
gc.collect()

# Feature importance of each of the two fold models.
visualize_feature_importance(meter_1_models[0],columns)
visualize_feature_importance(meter_1_models[1],columns)

In [None]:
%%time
# Build the feature frame, target and year-based splits for meter type 2.
meter_2_df, target, splits = df_for_meter_type(2)
# Last expression of the cell: show the resulting frame.
meter_2_df

In [None]:
# Train one model per split for meter type 2.
meter_2_models = train(meter_2_df,target,splits)
# Keep the feature names for the plots below; the frame itself is deleted next.
columns = meter_2_df.columns

del meter_2_df,target,splits
gc.collect()

# Feature importance of each of the two fold models.
visualize_feature_importance(meter_2_models[0],columns)
visualize_feature_importance(meter_2_models[1],columns)

In [None]:
%%time
# Build the feature frame, target and year-based splits for meter type 3.
# The meter id must be 3 here: the models trained on this frame are the ones
# applied to the meter-3 test rows (df_for_meter_type_test(3)).
meter_3_df, target, splits = df_for_meter_type(3)
# Last expression of the cell: show the resulting frame.
meter_3_df

In [None]:
# Train one model per split on the frame named meter_3_df.
meter_3_models = train(meter_3_df,target,splits)
# Keep the feature names for the plots below; the frame itself is deleted next.
columns = meter_3_df.columns

del meter_3_df,target,splits
gc.collect()

# Feature importance of each of the two fold models.
visualize_feature_importance(meter_3_models[0],columns)
visualize_feature_importance(meter_3_models[1],columns)

In [None]:
# Drop the training frames before the test data is loaded.
# NOTE(review): df_group was built from train_df and is still used for the test
# frame, so it presumably keeps the training data alive - confirm how much
# memory this actually frees.
del train_df, weather_df 
gc.collect()

In [None]:
# Load the test readings and apply the same transformation as for training.
# df_group is the grouping built from the training target, so the per-building
# statistics added to test_df come from the training data.
test_df = pd.read_feather('../input/ashre-simple-no-missing-for-test/test_no_nan_s.feather')
df_transformation(test_df,df_group)

# Test-period weather; building_metadata_df is rebound to the test-side metadata file.
weather_test_df = pd.read_feather('../input/ashre-simple-no-missing-for-test/weather_test_intr_no_nan_s.feather')
building_metadata_df = pd.read_feather('../input/ashre-simple-no-missing-for-test/building_test_no_nan_s.feather')

weather_transformation(weather_test_df)
building_transformation(building_metadata_df)

In [None]:
def df_for_meter_type_test(meter):
    """Build the test feature frame for one meter type.

    Filters the global `test_df` to rows with the given `meter`, left-joins the
    building metadata and test weather frames, adds the temperature and
    pressure features, and drops the raw weather columns as well as
    timestamp, meter and row_id. Returns the resulting frame.
    """
    frame = (
        test_df[test_df['meter'] == meter]
        .merge(building_metadata_df, on='building_id', how='left')
        .merge(weather_test_df, on=['site_id', 'timestamp'], how='left')
    )

    air_temp_fe(frame)
    sea_level_pressure_fe(frame)
    frame.drop(['air_temperature','sea_level_pressure','wind_direction','wind_speed'], axis = 1, inplace = True)

    meter_df = frame.drop(['timestamp','meter','row_id'], axis = 1)
    memory_adj(meter_df)
    return meter_df

In [None]:
# Test features for meter type 0.
X_test = df_for_meter_type_test(0)

# Average the fold models' predictions (each at its best iteration). Dividing by
# the actual number of models keeps the mean correct if the number of splits changes.
y_test0 = sum([model.predict(X_test, num_iteration=model.best_iteration) for model in meter_0_models])/len(meter_0_models)

del X_test,meter_0_models
gc.collect()

In [None]:
# Test features for meter type 1.
X_test = df_for_meter_type_test(1)

# Average the fold models' predictions (each at its best iteration). Dividing by
# the actual number of models keeps the mean correct if the number of splits changes.
y_test1 = sum([model.predict(X_test, num_iteration=model.best_iteration) for model in meter_1_models])/len(meter_1_models)

del X_test,meter_1_models
gc.collect()

In [None]:
# Test features for meter type 2.
X_test = df_for_meter_type_test(2)

# Average the fold models' predictions (each at its best iteration). Dividing by
# the actual number of models keeps the mean correct if the number of splits changes.
y_test2 = sum([model.predict(X_test, num_iteration=model.best_iteration) for model in meter_2_models])/len(meter_2_models)

del X_test,meter_2_models
gc.collect()

In [None]:
# Test features for meter type 3.
X_test = df_for_meter_type_test(3)

# Average the fold models' predictions (each at its best iteration). Dividing by
# the actual number of models keeps the mean correct if the number of splits changes.
y_test3 = sum([model.predict(X_test, num_iteration=model.best_iteration) for model in meter_3_models])/len(meter_3_models)

del X_test,meter_3_models
gc.collect()

In [None]:
# Load the sample submission; its meter_reading column is overwritten with the predictions.
sample_submission = pd.read_csv('../input/ashrae-energy-prediction/sample_submission.csv')

In [None]:
# For each meter type, write its predictions into the submission rows addressed
# by that meter's row_id values. Predictions are on the log1p scale, so they are
# mapped back with expm1 and clipped at zero from below.
for meter, log_pred in enumerate((y_test0, y_test1, y_test2, y_test3)):
    row_ids = test_df.loc[test_df['meter'] == meter, 'row_id']
    sample_submission.loc[row_ids, 'meter_reading'] = np.clip(np.expm1(log_pred), 0, a_max=None)

In [None]:
# Write the submission file without the index, with floats formatted to four decimals.
sample_submission.to_csv('submission_Dec8_1.csv', index=False, float_format='%.4f')