The reason I used Light GBM:
In this dataset, the categorical column building_id identifies each building. To include a categorical variable in an ML model, we usually need to encode it with a method such as one-hot encoding or label encoding. However, since there are 1449 building ids, one-hot encoding would significantly increase the dimensionality of the data and use up the RAM. If label encoding is applied, the model treats building_id as a continuous column, which is not correct. 

In LightGBM (LGBM), categorical variables can be used without one-hot or label encoding. Instead, Fisher's method is used to find the optimal split of categorical variables. 

Thus, tree-based algorithm LGBM is used instead of any neural network algorithms for its better strategy in dealing with categorical variables.

Another reason for choosing a tree-based algorithm is that deep neural networks work much better with images, sound, language and other “natural” data, while tree-boosting frameworks perform better given good hand-crafted features. In this project, the dataset is structured and tabular, so the feature columns are clear and tree-based algorithms can be used.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import gc
from sklearn import preprocessing
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

from sklearn.metrics import mean_squared_error
import lightgbm as lgbm
from sklearn.model_selection import KFold, StratifiedKFold
from tqdm import tqdm
import seaborn as sns

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Original code from https://www.kaggle.com/gemartin/load-data-reduce-memory-usage by @gemartin
# Modified to support timestamp type, categorical type
# Modified to add option to use float16 or not. feather format does not support float16.
from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned.

    * datetime and categorical columns are left untouched;
    * object/string columns are converted to ``category``;
    * signed and unsigned integer columns get the narrowest integer type of
      the same signedness that holds their min and max (bounds inclusive);
    * float columns become float32 (or float16 when ``use_float16`` is true
      and the values fit); values outside the float32 range, or a column
      without any valid value, are stored as float64;
    * every other dtype (bool, timedelta, extension types, ...) is left as is.

    Args:
        df: DataFrame to shrink.
        use_float16: allow float16 as a target type. Off by default because
            the feather format does not support float16.

    Returns:
        The same ``df`` object, after the conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate types, narrowest first.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if (pd.api.types.is_datetime64_any_dtype(col_type)
                or isinstance(col_type, pd.CategoricalDtype)):
            # skip datetime type or categorical type
            continue

        if col_type == object or pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy signed ints ('i'), unsigned ints ('u') and floats
        # ('f') are downcast. Bool columns in particular must be skipped:
        # treating them as numbers would turn 1-byte flags into 4-byte floats.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            # NaN min/max (column without valid values) fails every
            # comparison, so the float64 fallback is used.
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)
        else:
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame reporting zero bytes cannot divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Raw training tables: meter readings, per-building metadata and hourly weather.
# test.csv and weather_test.csv are read further down, in the test-data section.
train = pd.read_csv("/kaggle/input/ashrae-energy-prediction/train.csv")
building_info = pd.read_csv("/kaggle/input/ashrae-energy-prediction/building_metadata.csv")
weather_info_train = pd.read_csv("/kaggle/input/ashrae-energy-prediction/weather_train.csv")

In [None]:
# Attach the building metadata to every reading, then the weather recorded at
# the building's site for the same timestamp. Left joins keep every reading
# even when no metadata or weather row matches.
train_all = (
    train
    .merge(building_info, on='building_id', how='left')
    .merge(weather_info_train, on=['site_id', 'timestamp'], how='left')
)
gc.collect()

In [None]:
# Both raw tables have been merged into train_all and are not referenced again,
# so release them together; `train` is the large one.
del weather_info_train, train

In [None]:
# Remove the weather measurements that are not used as features.
train_all = train_all.drop(
    columns=['precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
)
gc.collect()
# 12 columns remain after dropping the 4 weather columns. train_all has
# 20216100 rows at this point, before any rows are dropped.

In [None]:
# Drop the meter-0 rows with building_id <= 104 whose timestamp sorts at or
# before "2016-05-21". The timestamp column still holds strings here, so the
# comparison is lexicographic.
early_rows = (
    (train_all['building_id'] <= 104)
    & (train_all['meter'] == 0)
    & (train_all['timestamp'] <= "2016-05-21")
)
train_all = train_all.drop(index=train_all.index[early_rows.values])
del early_rows


In [None]:
# Parse the timestamp and derive the calendar features from it.
train_all["timestamp"] = pd.to_datetime(train_all["timestamp"])
calendar_features = (
    ("hour", "hour"),          # the hour of a day
    ("day", "day"),            # the day of a month; may not change the consumption pattern much
    ("dayofweek", "weekday"),  # the day of a week; dt.weekday is the same as dt.dayofweek
    ("month", "month"),        # the month of a year
)
for feature_name, dt_attribute in calendar_features:
    train_all[feature_name] = getattr(train_all["timestamp"].dt, dt_attribute)
# The raw timestamp itself is not kept.
train_all = train_all.drop(columns="timestamp")
gc.collect()

In [None]:
# By taking the logarithm of the meter_reading, the time series data will be more stable and easier to predict.
# log1p computes log(1 + x), so zero readings stay finite; create_predictions reverses it with np.expm1.
train_all['log_meter_reading'] = np.log1p(train_all['meter_reading'])
gc.collect()

In [None]:
del train_all['meter_reading']

In [None]:
train_all.head()

In [None]:
# Which time-related variables should be treated as categorical needs further study.
# As written, all four time features ('hour', 'day', 'month' and 'dayofweek') are in
# the categorical list, together with the id-like columns and 'primary_use'.
# Open question: what changes when a time feature is used as a numerical column instead?
# NOTE: 'categorical_feacture' (sic) is read by fit_models, so it cannot be renamed here alone.
categorical_feacture=['building_id', 'site_id','primary_use','dayofweek','meter','hour','day','month']
numerical_feature=['square_feet', 'year_built', 'floor_count','air_temperature', 'cloud_coverage', 'dew_temperature']
# Column order of the model input: categorical features first, then numerical ones.
feature_columns=categorical_feacture+numerical_feature
label_column='log_meter_reading'
gc.collect()

As shown above, there are many NaN values in the following columns: 'year_built', 'floor_count', 'cloud_coverage', 'air_temperature', and 'dew_temperature'. The method used to deal with missing values can lead to very different model results, so the missing values in these columns are handled in different ways.

For 'cloud_coverage', 'air_temperature', and 'dew_temperature', a missing value is filled with the mean by averaging the values from the same day of the same month at the same site as the missing value. This is because these variables change over site, month, and the day in a month.

For 'floor_count', the mean value of the floor_count column is used to fill in the missing values.

For 'year_built', the missing values are left as they are, without further processing. LGBM ignores missing values when evaluating a split and then allocates them to whichever side reduces the loss the most.

In [None]:
# Fill the NaNs in air_temperature, cloud_coverage and dew_temperature with the
# mean of the values from the same site, the same day of a month and the same
# month, because these variables change over site, month and day in a month.
# groupby(...).transform('mean') returns one row-aligned mean per reading, so
# the gaps are filled directly with fillna instead of moving the whole frame
# onto a non-unique (site_id, day, month) index and calling DataFrame.update.
# Existing values are never overwritten; a group without any valid value stays NaN.
weather_group_keys = ['site_id', 'day', 'month']
for weather_column in ['air_temperature', 'cloud_coverage', 'dew_temperature']:
    group_mean = train_all.groupby(weather_group_keys)[weather_column].transform('mean')
    train_all[weather_column] = train_all[weather_column].fillna(group_mean)
    del group_mean

# Renumber the rows 0..n-1, as the former set_index/reset_index round trip did.
# (Unlike that round trip, the key columns keep their position in the frame;
# later cells select columns by name.)
train_all = train_all.reset_index(drop=True)

gc.collect()

In [None]:
# fill in NaNs of floor_count with mean
# The result is assigned back to the column: an inplace fillna on the selection
# train_all["floor_count"] works on a temporary object under pandas
# Copy-on-Write and would leave train_all unchanged.
train_all["floor_count"] = train_all["floor_count"].fillna(train_all["floor_count"].mean())

In [None]:
# train_all.isna().sum()

In [None]:
# train_all # 19867540 rows × 15 columns

In [None]:
# reduce the memory
# reduce_mem_usage downcasts numeric columns and converts object columns to 'category'.
train_all=reduce_mem_usage(train_all)
# building_info is 1449 rows × 6 columns
# It is shrunk as well because it is merged into the test data later on.
building_info=reduce_mem_usage(building_info)

In [None]:
train_all.head()

In [None]:
# prepare the train_y, train_x for each meter type
def create_x_y(train_df, target_meter):
    """Return the model inputs and targets for one meter type.

    Args:
        train_df: frame containing 'meter', every column listed in the
            module-level ``feature_columns`` and 'log_meter_reading'.
        target_meter: value of the 'meter' column to select.

    Returns:
        (train_x, train_y): the selected rows restricted to
        ``feature_columns`` as a DataFrame, and their 'log_meter_reading'
        values as a NumPy array.
    """
    meter_rows = train_df['meter'] == target_meter
    train_x = train_df.loc[meter_rows, feature_columns]
    train_y = train_df.loc[meter_rows, 'log_meter_reading'].values
    return train_x, train_y
# After this function, train_x is a DataFrame, while train_y is a NumPy array.
# Both are passed to lgbm.Dataset as they are in fit_models (train_x as the data, train_y as the label).

In [None]:
# Plot the distribution of the log-transformed target for meter type 0.
# train_x0 is only a by-product of create_x_y; both objects are deleted right after plotting.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot replace it) — confirm against the installed version.
train_x0, train_y0 = create_x_y(train_all, target_meter=0)
sns.distplot(train_y0)
del train_x0, train_y0

In [None]:
# Plot the distribution of the log-transformed target for meter type 1.
# train_x1 is only a by-product of create_x_y; both objects are deleted right after plotting.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot replace it) — confirm against the installed version.
train_x1, train_y1 = create_x_y(train_all, target_meter=1)
sns.distplot(train_y1)
del train_x1, train_y1

In [None]:
# Plot the distribution of the log-transformed target for meter type 2.
# train_x2 is only a by-product of create_x_y; both objects are deleted right after plotting.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot replace it) — confirm against the installed version.
train_x2, train_y2 = create_x_y(train_all, target_meter=2)
sns.distplot(train_y2)
del train_x2, train_y2

In [None]:
# Plot the distribution of the log-transformed target for meter type 3.
# train_x3 is only a by-product of create_x_y; both objects are deleted right after plotting.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases (histplot/displot replace it) — confirm against the installed version.
train_x3, train_y3 = create_x_y(train_all, target_meter=3)
sns.distplot(train_y3)
del train_x3, train_y3

In [None]:
def fit_models(train_df,target_meter,folds=2,seed=None,shuffle=False, num_rounds=1500,lr=0.1,bf=0.1,l2=0.2, nl = 30):
    """Train one LightGBM regression model per cross-validation fold for one meter type.

    Args:
        train_df: preprocessed training frame; must contain 'meter', the
            feature columns and 'log_meter_reading' (see create_x_y).
        target_meter: value of the 'meter' column to train on.
        folds: number of KFold splits, i.e. the number of models returned.
        seed: random_state for KFold; only used when ``shuffle`` is true.
        shuffle: whether KFold shuffles the rows before splitting.
        num_rounds: maximum number of boosting rounds per model.
        lr: LightGBM 'learning_rate'.
        bf: LightGBM 'bagging_fraction'.
        l2: LightGBM 'lambda_l2'.
        nl: LightGBM 'num_leaves'.

    Returns:
        List with one trained model per fold, in fold order.
    """
    # Current scikit-learn raises a ValueError when a random_state is given
    # while shuffle is False, so the seed is only forwarded when it is used.
    kfold = KFold(n_splits=folds, shuffle=shuffle, random_state=seed if shuffle else None)
    train_x, train_y = create_x_y(train_df, target_meter)
    gc.collect()

    print('target_meter: ',target_meter)
    print('shape: ',train_x.shape[0])

    # Positions of the categorical columns inside train_x.
    categoricals = [train_x.columns.get_loc(c_col) for c_col in categorical_feacture]
    print('categoricals: ', categoricals)

    # The settings are identical for every fold, so they are built once.
    params = {'boosting_type': 'gbdt',
              'objective': 'regression',
              'metric': {'rmse'}, # maybe l2?
              'bagging_freq': 5, # maybe try 1 or 5?
              'bagging_fraction': bf, # maybe try 0.7?
              'learning_rate': lr, # maybe try 0.3 or 0.05?
              'num_leaves': nl, # maybe try 330 or more?
              'feature_fraction': 0.9, # maybe try other values?
              'lambda_l2': l2 # maybe try other values?
    }
    early_stopping_condition = 30 # try other values
    verbose_evaluation = 20 # try other values

    models = []

    for train_idx, val_idx in kfold.split(train_x,train_y):
        xtrain = train_x.iloc[train_idx,:]
        xval = train_x.iloc[val_idx,:]
        ytrain = train_y[train_idx]
        yval = train_y[val_idx]
        print('')
        print('train shape: ', len(train_idx))
        print('valid shape: ', len(val_idx))

        lgb_train_ds = lgbm.Dataset(xtrain, label = ytrain, categorical_feature = categoricals)
        lgb_val_ds = lgbm.Dataset(xval, label = yval, categorical_feature = categoricals)

        if hasattr(lgbm, 'log_evaluation'):
            # Newer LightGBM: early stopping and logging are callbacks (the
            # keyword arguments were removed from train() in 4.0). Fresh
            # callbacks are created per fold because the early-stopping
            # callback keeps per-run state.
            stop_and_log = {'callbacks': [
                lgbm.early_stopping(stopping_rounds = early_stopping_condition),
                lgbm.log_evaluation(period = verbose_evaluation),
            ]}
        else:
            # Older LightGBM without log_evaluation: keep the keyword arguments.
            stop_and_log = {'early_stopping_rounds': early_stopping_condition,
                            'verbose_eval': verbose_evaluation}

        print('Training GBM: ')

        model = lgbm.train(params,
                           train_set = lgb_train_ds,
                           num_boost_round = num_rounds,
                           valid_sets = [lgb_train_ds, lgb_val_ds],
                           **stop_and_log)

        models.append(model)

    gc.collect()
    return models
        

In [None]:
# Meter type 0: five fold-models, at most 1000 boosting rounds each, with bagging_fraction 0.7 and num_leaves 50.
models_0 = fit_models(train_all,target_meter=0,folds=5,num_rounds = 1000, lr = 0.1,bf = 0.7,l2 = 0.2,nl = 50)

In [None]:
# Meter type 1: five fold-models, at most 1000 boosting rounds each, with bagging_fraction 0.7 and num_leaves 50.
models_1 = fit_models(train_all,target_meter=1,folds=5,num_rounds = 1000, lr = 0.1,bf = 0.7,l2 = 0.2,nl = 50)

In [None]:
# Meter type 2: five fold-models, at most 1000 boosting rounds each, with bagging_fraction 0.7 and num_leaves 50.
models_2 = fit_models(train_all,target_meter=2,folds=5,num_rounds = 1000, lr = 0.1,bf = 0.7,l2 = 0.2, nl = 50)

In [None]:
# Meter type 3: five fold-models, at most 1000 boosting rounds each, with bagging_fraction 0.7 and num_leaves 50.
models_3 = fit_models(train_all,target_meter=3,folds=5,num_rounds = 1000, lr = 0.1,bf = 0.7,l2 = 0.2, nl = 50)

## Start Test data preparation

In [None]:
del train_all

In [None]:
gc.collect()

In [None]:
# building_info is still in memory from the training section, so only the test
# readings and the test-period weather are read here.
weather_info_test = pd.read_csv("/kaggle/input/ashrae-energy-prediction/weather_test.csv")
test = pd.read_csv("/kaggle/input/ashrae-energy-prediction/test.csv")

In [None]:
# The preprocessing was not wrapped in a function, so the training steps are
# repeated here for the test data: attach building metadata, then weather.
test_all = (
    test
    .merge(building_info, on='building_id', how='left')
    .merge(weather_info_test, on=['site_id', 'timestamp'], how='left')
)
# test_all=reduce_mem_usage(test_all)
gc.collect()

In [None]:
# Everything needed now lives in test_all; release the raw inputs, including
# the large `test` frame, which is not referenced again.
del weather_info_test, building_info, test
gc.collect()

In [None]:
# row_id is not a model feature.
test_all = test_all.drop(columns='row_id')
gc.collect()

In [None]:
# Remove the same four weather columns that were removed from the training data.
test_all = test_all.drop(
    columns=['precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']
)
gc.collect()

In [None]:
test_all.dtypes

In [None]:
# Parse the timestamp and derive the same calendar features as for training.
test_all["timestamp"] = pd.to_datetime(test_all["timestamp"])
calendar_features = (
    ("hour", "hour"),          # the hour of a day
    ("day", "day"),            # the day of a month; may not change the consumption pattern much
    ("dayofweek", "weekday"),  # the day of a week; dt.weekday is the same as dt.dayofweek
    ("month", "month"),        # the month of a year
)
for feature_name, dt_attribute in calendar_features:
    test_all[feature_name] = getattr(test_all["timestamp"].dt, dt_attribute)
gc.collect()

In [None]:
# No rows are removed from the test data. The training-time filter
# (building_id <= 104, meter 0, timestamp <= "2016-05-21") must not be repeated
# here: every test row needs a prediction, and the submission cell matches
# predictions to sample_submission rows through test_all's index, so dropping
# rows would misalign or break the submission.
gc.collect()

In [None]:
# The calendar features have been extracted; the raw timestamp is not kept.
test_all = test_all.drop(columns='timestamp')

In [None]:
# Downcast the dtypes before the imputation that follows.
# NOTE(review): the training pipeline calls reduce_mem_usage after imputation, so the
# two pipelines compute the fill values at different float precision — confirm this is intended.
test_all=reduce_mem_usage(test_all)

In [None]:
# Fill the NaNs in air_temperature, cloud_coverage and dew_temperature with the
# mean of the values from the same site, the same day of a month and the same
# month (computed on the test data itself).
# groupby(...).transform('mean') returns one row-aligned mean per reading, so
# the gaps are filled directly with fillna instead of moving the whole frame
# onto a non-unique (site_id, day, month) index and calling DataFrame.update.
# Existing values are never overwritten; a group without any valid value stays NaN.
weather_group_keys = ['site_id', 'day', 'month']
for weather_column in ['air_temperature', 'cloud_coverage', 'dew_temperature']:
    group_mean = test_all.groupby(weather_group_keys)[weather_column].transform('mean')
    test_all[weather_column] = test_all[weather_column].fillna(group_mean)
    del group_mean

# Renumber the rows 0..n-1, as the former set_index/reset_index round trip did;
# the row order is unchanged. (The key columns keep their position in the
# frame; later cells select columns by name.)
test_all = test_all.reset_index(drop=True)

gc.collect()

In [None]:
# Fill the NaNs of floor_count with the column mean. The result is assigned
# back to the column: an inplace fillna on the selection test_all["floor_count"]
# works on a temporary object under pandas Copy-on-Write and would leave
# test_all unchanged.
test_all["floor_count"] = test_all["floor_count"].fillna(test_all["floor_count"].mean())

In [None]:
test_all

In [None]:
def create_x(test_df,target_meter):
    """Return the feature columns of the rows that belong to one meter type.

    Args:
        test_df: frame containing 'meter' and every column listed in the
            module-level ``feature_columns``.
        target_meter: value of the 'meter' column to select.

    Returns:
        DataFrame with the selected rows, restricted to ``feature_columns``.
    """
    meter_rows = test_df['meter'] == target_meter
    return test_df.loc[meter_rows, feature_columns]

def create_predictions(test_set, models, batch_size):
    """Predict meter readings batch by batch, averaging over all models.

    Each batch is predicted by every model; the predictions (made on the
    log1p scale the models were trained on) are averaged and mapped back
    with np.expm1.

    Args:
        test_set: DataFrame of features; rows are consumed in order.
        models: non-empty sequence of objects with a ``predict`` method.
        batch_size: positive number of rows per batch.

    Returns:
        List with one array of predicted readings per batch, in row order
        (empty when ``test_set`` has no rows).

    Raises:
        ValueError: if ``batch_size`` is smaller than 1 or ``models`` is empty.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if not models:
        raise ValueError('models must contain at least one trained model')

    ret = []
    # Step through the rows by batch start offset.
    for start in tqdm(range(0, test_set.shape[0], batch_size)):
        batch = test_set.iloc[start:start + batch_size]
        mean_log_prediction = sum(model.predict(batch) for model in models) / len(models)
        ret.append(np.expm1(mean_log_prediction))
    return ret

def generate_results(test_df, target_meter, models,batch_size = 1):
    """Predict the readings of one meter type.

    Args:
        test_df: preprocessed test frame (see create_x).
        target_meter: value of the 'meter' column to predict for.
        models: trained models for that meter type.
        batch_size: number of rows handed to create_predictions per batch.

    Returns:
        The list of per-batch prediction arrays produced by create_predictions.
    """
    meter_features = create_x(test_df, target_meter)
    gc.collect()

    predictions = create_predictions(meter_features, models, batch_size)

    # consider sns plots

    # Release the feature frame before handing the predictions back.
    del meter_features
    gc.collect()
    return predictions

In [None]:
%%time
# Predict meter type 0 in batches of 100000 rows; the result is a list with one array of readings per batch.
test_y_0 = generate_results(test_all,target_meter = 0, models = models_0, batch_size = 100000)

In [None]:
%%time
# Predict meter type 1 in batches of 100000 rows; the result is a list with one array of readings per batch.
test_y_1 = generate_results(test_all,target_meter = 1, models = models_1, batch_size = 100000)

In [None]:
%%time
# Predict meter type 2 in batches of 100000 rows; the result is a list with one array of readings per batch.
test_y_2 = generate_results(test_all,target_meter = 2, models = models_2, batch_size = 100000)


In [None]:
%%time
# Predict meter type 3 in batches of 100000 rows; the result is a list with one array of readings per batch.
test_y_3 = generate_results(test_all,target_meter = 3, models = models_3, batch_size = 100000)


In [None]:
# Submission template; its 'meter_reading' column is overwritten with the predictions further down.
sample_submission=pd.read_csv("/kaggle/input/ashrae-energy-prediction/sample_submission.csv")
sample_submission.head()


In [None]:
# Join the per-batch arrays into one vector of predicted readings for meter type 0.
# The predictions are on the original scale, so log1p is applied for the plot only,
# which makes it comparable with the histogram of the training target.
test_y_0 = np.concatenate(test_y_0)
sns.distplot(np.log1p(test_y_0))
gc.collect()

In [None]:
# Join the per-batch arrays into one vector of predicted readings for meter type 1.
# The predictions are on the original scale, so log1p is applied for the plot only,
# which makes it comparable with the histogram of the training target.
test_y_1 = np.concatenate(test_y_1)
sns.distplot(np.log1p(test_y_1))
gc.collect()

In [None]:
# Join the per-batch arrays into one vector of predicted readings for meter type 2.
# The predictions are on the original scale, so log1p is applied for the plot only,
# which makes it comparable with the histogram of the training target.
test_y_2 = np.concatenate(test_y_2)
sns.distplot(np.log1p(test_y_2))
gc.collect()

In [None]:
# Join the per-batch arrays into one vector of predicted readings for meter type 3.
# The predictions are on the original scale, so log1p is applied for the plot only,
# which makes it comparable with the histogram of the training target.
test_y_3 = np.concatenate(test_y_3)
sns.distplot(np.log1p(test_y_3))
gc.collect()

In [None]:
# The template's meter_reading column may have been parsed as integers; make it
# float first so the fractional predictions can be stored without an
# incompatible-dtype warning or error from pandas.
sample_submission['meter_reading'] = sample_submission['meter_reading'].astype('float64')

# The boolean mask is aligned on the index, so this assumes that row i of
# test_all and row i of sample_submission describe the same test row.
for meter_type, meter_predictions in enumerate((test_y_0, test_y_1, test_y_2, test_y_3)):
    sample_submission.loc[test_all['meter'] == meter_type, 'meter_reading'] = meter_predictions
gc.collect()

In [None]:
sample_submission.head()

In [None]:
# Write the submission file without the index column; floats are written with 4 decimals.
sample_submission.to_csv('submission.csv', index = False, float_format='%.4f')


For individual tests

In [None]:
# # test_y_0_export = pd.DataFrame(test_y_0)
# test_y_0_export.to_csv('test_y_0.csv', index = False)

In [None]:
# test_y_1_export = pd.DataFrame(test_y_1)
# test_y_1_export.to_csv('test_y_1.csv', index = False)

In [None]:
# test_y_2_export = pd.DataFrame(test_y_2)
# test_y_2_export.to_csv('test_y_2.csv', index = False)

In [None]:
# test_y_3_export = pd.DataFrame(test_y_3)
# test_y_3_export.to_csv('test_y_3.csv', index = False)

In [None]:
# train111=pd.read_csv("/kaggle/input/ashrae-energy-prediction/train.csv")


In [None]:
# train111.head()

In [None]:
# train111.groupby('meter')['meter_reading'].describe()

In [None]:
# np.mean(test_y_2)

In [None]:
# sample_submission=pd.read_csv("/kaggle/input/ashrae-energy-prediction/sample_submission.csv")
# sample_submission.head()


In [None]:
# path = '../input/test-results/'
# test_y_0_df = pd.read_csv(path +'test_y_0.csv')
# test_y_1_df = pd.read_csv(path +'test_y_1.csv')
# test_y_2_df = pd.read_csv(path +'test_y_2.csv')
# test_y_3_df = pd.read_csv(path +'test_y_3.csv')

In [None]:
# test_y_0 = np.array(test_y_0_df).flatten()
# test_y_1 = np.array(test_y_1_df).flatten()
# test_y_2 = np.array(test_y_2_df).flatten()
# test_y_3 = np.array(test_y_3_df).flatten()


In [None]:
# sample_submission.loc[test_all['meter'] == 0, 'meter_reading'] = test_y_0
# sample_submission.loc[test_all['meter'] == 1, 'meter_reading'] = test_y_1
# sample_submission.loc[test_all['meter'] == 2, 'meter_reading'] = test_y_2
# sample_submission.loc[test_all['meter'] == 3, 'meter_reading'] = test_y_3
# gc.collect()
# sample_submission.head()

In [None]:
# sample_submission.to_csv('submission.csv', index = False, float_format='%.4f')
