# ASHRAE ENERGY PREDICTION III:

# From Previous Notebook ....

## - Importing Necessary Libraries

In [None]:
# Importing needed libraries to be used throughout the project

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn import utils
from sklearn import preprocessing
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import SGDRegressor,Ridge,ElasticNet
from sklearn.model_selection import train_test_split,KFold,GroupKFold
import lightgbm as lgb
import gc
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn. linear_model import Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.layers as layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout, Activation
from tensorflow.keras.optimizers import *

from prettytable import PrettyTable

import warnings
warnings.filterwarnings('ignore')

import os
# Print the full path of every file available under the Kaggle input directory
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

## - Loading and Reducing Memory Usage of Data

In [None]:
# Directory holding the competition files; the three CSV paths below are built from it
data_path = "/kaggle/input/ashrae-energy-prediction/"

train_path, building_path, weather_train_path = (
    data_path + file_name
    for file_name in ("train.csv", "building_metadata.csv", "weather_train.csv")
)

In [None]:
train_data = pd.read_csv(train_path)

building_data = pd.read_csv(building_path)

weather_train_data = pd.read_csv(weather_train_path)

In [None]:
# Converting data into feather format since some dataframes are too large and take a long time to load
# This method is inspired from the kaggle notebook titled: ASHRAE: feather format for fast loading
# Which is found at: https://www.kaggle.com/corochann/ashrae-feather-format-for-fast-loading

train_data.to_feather('train_data.feather')

building_data.to_feather('building_data.feather')

weather_train_data.to_feather('weather_train_data.feather')

In [None]:
train_data = pd.read_feather('train_data.feather')

building_data = pd.read_feather('building_data.feather')

weather_train_data = pd.read_feather('weather_train_data.feather')

In [None]:
# This function helps in optimizing the memory used by the dataframes by modifying/altering the datatype of each column.
# This method is inspired from the kaggle notebook titled: load data (reduce memory usage)
# Which is found at: https://www.kaggle.com/gemartin/load-data-reduce-memory-usage

def reduce_mem_usage(df, df_name, allow_float16=True):
    """Downcast the columns of ``df`` to the smallest dtype that holds their values.

    The frame is modified in place and also returned.

    * object columns are converted to ``category``;
    * signed / unsigned integer columns are downcast to the narrowest signed /
      unsigned integer type whose range contains the column's min and max;
    * float columns are downcast to float16 / float32 when their range allows,
      otherwise they are stored as float64;
    * every other dtype (bool, datetime, category, pandas extension dtypes) is
      left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink (mutated in place).
    df_name : str
        Label used in the printed summary line.
    allow_float16 : bool, default True
        When False, float columns are never stored below float32. float16 keeps
        only about three significant decimal digits, so the default (kept for
        backward compatibility) can round values noticeably.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """

    start_mem = df.memory_usage().sum() / 1024**2

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:

        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy integer/float columns are downcast. Checking the dtype
        # kind (instead of the dtype's name prefix) keeps unsigned integers and
        # booleans out of the float branch, and skips dtypes such as category or
        # datetime for which min()/max() or the range comparison could raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in ('i', 'u'):
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                # An empty column yields NaN here, fails every comparison and is
                # therefore left unchanged.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                # Strict bounds: NaN/inf extremes fail the test and fall back to float64
                if limits.min < c_min and c_max < limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2

    # Guard against a zero-sized frame so the summary never divides by zero
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0

    print('Memory usage of {} is reduced by {:.2f} %. Usage dropped from {:.2f} MB to {:.2f} MB.'.format(df_name, reduction, start_mem, end_mem))

    return df


In [None]:
# Reducing Memory Usage of Data

train_data = reduce_mem_usage(train_data, 'Train Data')

building_data = reduce_mem_usage(building_data, 'Building Data')

weather_train_data = reduce_mem_usage(weather_train_data, 'Weather Train Data')

In [None]:
# Merging Data

train = train_data.merge(building_data, on='building_id', how='left')
train = train.merge(weather_train_data, on=['site_id', 'timestamp'], how='left')

In [None]:
# Breaking Timestamp into Hour, Day, Month, Year

# This function firstly converts timestamp to date and then breaks down date into 6 new columns: hour, day, dayOfWeek, dayOfYear, month and year

def breakdown_timestamp(dataframe):
    """Parse the 'timestamp' column and add calendar columns derived from it.

    The 'timestamp' column is converted with ``pd.to_datetime`` and six new
    columns are added, in this order: hour, day, dayofweek, dayofyear, month,
    year. The frame is modified in place and returned.
    """

    dataframe['timestamp'] = pd.to_datetime(dataframe['timestamp'])

    # (datetime attribute == new column name, unsigned integer type used to store it)
    calendar_parts = (
        ('hour', np.uint8),
        ('day', np.uint16),
        ('dayofweek', np.uint8),
        ('dayofyear', np.uint16),
        ('month', np.uint8),
        ('year', np.uint16),
    )

    for part, caster in calendar_parts:
        dataframe[part] = caster(getattr(dataframe['timestamp'].dt, part))

    return dataframe

In [None]:
train = breakdown_timestamp(train)

## - Applying Log Transformation to 'Meter Reading' and 'Square Feet'

In [None]:
train['meter_reading'] = np.log1p(train['meter_reading'])

In [None]:
train['square_feet'] = np.log1p(train['square_feet'])

## - Data Preparation and Feature Engineering

In [None]:
# Removing Zero Meter Readings

zero_meter_readings = list(train[train['meter_reading'] == 0].index)
train.drop(zero_meter_readings, axis = 0, inplace = True)

In [None]:
# Dropping Columns with More than 50 % Missing Values

threshold = len(train) * 0.5
train.dropna(axis=1, thresh = threshold, inplace = True)

In [None]:
# Filling Missing Values

# Replace the missing values of each weather column with that column's median.
# The result is assigned back to the frame: calling fillna(..., inplace=True) on a
# column selection is chained assignment and is not guaranteed to update `train`
# (it does not under pandas Copy-on-Write).
weather_columns = ['cloud_coverage', 'sea_level_pressure', 'precip_depth_1_hr',
                   'wind_direction', 'wind_speed', 'dew_temperature', 'air_temperature']

for column in weather_columns:
    train[column] = train[column].fillna(train[column].median())

In [None]:
# adding new feature from existing ones to get better results

# Meteorological season for every calendar month
season_by_month = {12: 'Winter', 1: 'Winter', 2: 'Winter',
                   3: 'Spring', 4: 'Spring', 5: 'Spring',
                   6: 'Summer', 7: 'Summer', 8: 'Summer',
                   9: 'Autumn', 10: 'Autumn', 11: 'Autumn'}

# Vectorised lookups on the datetime column instead of a Python lambda per row.
# NOTE(review): assumes 'timestamp' contains no missing values — a NaT would give
# NaN here, whereas the row-wise version labelled it 'Winter'.
train['season'] = train['timestamp'].dt.month.map(season_by_month)

# 1 for hours 06..18 (both ends included), 0 otherwise
train['isDayTime'] = train['timestamp'].dt.hour.between(6, 18).astype('int64')

In [None]:
# Encoding categorical data

categorical_features = ['primary_use', 'season']

# One fitted encoder per column, so every column's label <-> integer mapping stays
# available afterwards (e.g. for inverse_transform or for encoding new data).
# Re-fitting a single shared encoder would keep only the mapping of the last column.
label_encoders = {}

for feature in categorical_features:

    encoder = preprocessing.LabelEncoder()
    train[feature] = encoder.fit_transform(train[feature])
    label_encoders[feature] = encoder
    

## - Dropping Timestamp and Reducing Memory Usage Again

In [None]:
train = train.drop(['timestamp'],axis=1)

In [None]:
reduced_train_data = reduce_mem_usage(train, 'Train Data')

## - Removing Least Important Features Generated by our Feature Selection Method

In [None]:
new_data = reduced_train_data[['building_id','square_feet','primary_use','meter','site_id','air_temperature','dayofyear','hour','isDayTime','dew_temperature','dayofweek', 'meter_reading']]

In [None]:
# Reassign instead of dropping in place: `new_data` was obtained by selecting columns
# from another frame, and in-place edits on such a selection trigger pandas'
# SettingWithCopy warning.
new_data = new_data.drop(columns=['site_id', 'dew_temperature'])

# 8. Models and Fine Tuning

In [None]:
X_train = new_data.drop(['meter_reading'],axis = 1)

Y_train = new_data['meter_reading'].values

## 8.1 Baseline Model

In [None]:
def baselineModel(y_actual,y_pred):
    """Print the root of the mean squared difference between y_actual and y_pred.

    Nothing is returned; the score is only printed.
    """

    residuals = y_actual - y_pred
    mean_squared_residual = np.mean(residuals * residuals)
    rmsle_score = np.sqrt(mean_squared_residual)

    print("The RMSLE Score of the Baseline Model is :",rmsle_score)

baselineModel(Y_train, np.median(Y_train))

The baseline score is computed by predicting the median value of the labels for every sample. The baseline score for 50% of the data is 1.772, so it can be expected that the baseline score on the full data could be higher.

**Splitting the data for training and testing**

In [None]:
train_x, test_x, train_y, test_y = train_test_split(X_train,Y_train, test_size=0.2, random_state=42)

**Calculating RMSLE**

RMSLE is the suggested evaluation metric for the model's performance. Since we already applied the log1p transformation to the target value 'Meter Reading', the Root Mean Squared Error (RMSE) can be used directly. It can be computed as follows:

In [None]:
def RMSLE(y_actual, y_pred):
    """Return the square root of the mean squared error between y_actual and y_pred."""

    mse = mean_squared_error(y_actual, y_pred)

    return np.sqrt(mse)

## 8.2 Linear Regression

In [None]:
linear_Regression = LinearRegression()
linear_Regression.fit(train_x, train_y)

**Calculating Prediction Score**

In [None]:
print('Linear Regression Traininig RMSLE = ', RMSLE((train_y) , (linear_Regression.predict(train_x))))
print('Linear Regression Testing RMSLE = ',RMSLE((test_y) ,(linear_Regression.predict(test_x))))

**Linear Regression Cross Validation**

In [None]:
lin_scores = cross_val_score(linear_Regression, train_x, train_y,scoring="neg_mean_squared_error", cv=10)
lin_rmse_scores = np.sqrt(-lin_scores)

print("Linear Regression CV Scores:", lin_rmse_scores)
print("==========================================================================================")
print("Mean CV Score:", lin_rmse_scores.mean())

## 8.3 ElasticNet

**Hyperparameter Tuning**

In [None]:
parameters = {'alpha':[0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'fit_intercept' : [False],
              'l1_ratio':[0.5]}

elasticnet = GridSearchCV(estimator = ElasticNet(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

elasticnet.fit(train_x, train_y)

In [None]:
print("Best ElasticNet Estimator is : ", elasticnet.best_estimator_)
print("Best ElasticNet Paramteres are : ", elasticnet.best_params_)

**Building Best ElasticNet Estimator**

In [None]:
elastic_model = ElasticNet(alpha = 0.001, fit_intercept = False, l1_ratio = 0.5)
elastic_model.fit(train_x, train_y)

**Calculating Prediction Score**

In [None]:
print('ElasticNet Training RMSLE = ',RMSLE((train_y) , (elastic_model.predict(train_x))))
print('ElasticNet Testing RMSLE = ',RMSLE((test_y) , (elastic_model.predict(test_x))))

## 8.4 Ridge

**Hyperparameters Tuning**

In [None]:
parameters = {'alpha':[0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
              'fit_intercept' : [True],
              'solver' : ['lsqr']}

ridge = GridSearchCV(estimator = Ridge(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

ridge.fit(train_x, train_y)

In [None]:
print("Best Ridge Estimator is :",ridge.best_estimator_)
print("Best Ridge Paramteres are : ", ridge.best_params_)

**Best Ridge Estimator**

In [None]:
Ridge = Ridge(alpha = 0.0001, fit_intercept = True, solver = "lsqr")
Ridge.fit(train_x, train_y)

**Calculating Prediction Score**

In [None]:
print('Ridge Training RMSLE = ',RMSLE((train_y) , (Ridge.predict(train_x))))
print('Ridge Testing RMSLE = ',RMSLE(test_y , Ridge.predict(test_x)))

## 8.5 Lasso

**Hyperparameter Tuning**

In [None]:
parameters = {'alpha':[0.001, 0.01, 0.1, 1, 10]}

lasso = GridSearchCV(estimator = Lasso(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

lasso.fit(train_x, train_y)

In [None]:
print("Best Lasso Estimator ",lasso.best_estimator_)
print("Best Lasso Paramteres are : ", lasso.best_params_)

**Best Lasso Estimator**

In [None]:
Lasso = Lasso(alpha = 0.001)
Lasso.fit(train_x, train_y)

**Calculating Prediction Score**

In [None]:
print('Lasso Traininig RMSLE = ',RMSLE((train_y) , (Lasso.predict(train_x))))
print('Lasso Testing RMSLE = ',RMSLE(test_y , Lasso.predict(test_x)))

## 8.6 Decision Tree

**Hyperparameters Tuning**

In [None]:
parameters = {'max_depth': [3,5,7,9,11,15]}


decission_tree = GridSearchCV(estimator = DecisionTreeRegressor(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)

decission_tree.fit(train_x, train_y)

In [None]:
print("Best Decision Tree Estimator ",decission_tree.best_estimator_)
print("Best Decision Tree Paramteres are : ", decission_tree.best_params_)

**Best Decision Tree Estimator**

In [None]:
DecissionTree = DecisionTreeRegressor(max_depth=15)
DecissionTree.fit(train_x, train_y)

**Calculating Prediction Score**

In [None]:
print('Decision Tree Training RMSLE = ',RMSLE((train_y) , (DecissionTree.predict(train_x))))
print('Decision Tree Testing RMSLE = ',RMSLE(test_y , DecissionTree.predict(test_x)))

## 8.7 RandomForestRegressor

**NOTE: Running this model takes a very long time and may crash the session in the process.**

**Hyperparameters Tuning**

In [None]:
parameters = { 'n_estimators': [60,80,100],
              'max_depth':[5,7,9]}

forest_reg = GridSearchCV(estimator = RandomForestRegressor(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 1,
                        return_train_score = True,
                        n_jobs = -1)
forest_reg.fit(train_x, train_y)

In [None]:
print("Best Random Forest Estimator ",forest_reg.best_estimator_)
print("Best Random Forest Paramteres are : ", forest_reg.best_params_)

**Best RandomForest Estimator**

In [None]:
# RandomForest = RandomForestRegressor(n_estimators = n_estimators, max_depth = max_depth)
# RandomForest.fit(train_x, train_y)

**Calculating Prediction Score**

In [None]:
# print('Train RMSLE = ',RMSLE((train_y) , (RandomForest.predict(train_x))))
# print('Test RMSLE = ',RMSLE(test_y , RandomForest.predict(test_x)))

## 8.8 SGD Regressor

In [None]:
# Scaling the Data Prior to Passing it to the Model

# Learn the mean / standard deviation on the training split only and apply the same
# transformation to both splits. Standardising the test split with its own statistics
# would put the two splits on different scales and leak test information into the
# evaluation.
scaler = preprocessing.StandardScaler().fit(train_x)

x_train_scaled = scaler.transform(train_x)
x_test_scaled = scaler.transform(test_x)

**Hyperparameters Tuning**

In [None]:
parameters = {'alpha':[0.0001, 0.001],
             'eta0': [0.001],
             'penalty': ['l2'],
             'learning_rate': ['adaptive', 'invscaling'],
             'early_stopping': [True]}

sgd = GridSearchCV(estimator = SGDRegressor(),
                        param_grid = parameters,
                        cv = 3, 
                        scoring = 'neg_mean_squared_error',
                        verbose = 10,
                        return_train_score = True,
                        n_jobs = -1)

sgd.fit(x_train_scaled, train_y)

In [None]:
print("Best SGDRegressor Estimator ",sgd.best_estimator_)
print("Best SGDRegressor Paramteres are : ", sgd.best_params_)

**Best SGDRegressor**

In [None]:
sgd_regressor = SGDRegressor(early_stopping=True, eta0=0.001, learning_rate='adaptive')
sgd_regressor.fit(x_train_scaled, train_y)

**Calculating Prediction Score**

In [None]:
print('SGDRegressor Training RMSLE = ',RMSLE((train_y) , (sgd_regressor.predict(x_train_scaled))))
print('SGDRegressor Testing RMSLE = ',RMSLE(test_y , sgd_regressor.predict(x_test_scaled)))

## 8.9 SVR

**NOTE: Running this model takes a very long time and may crash the session in the process.**

**Building SVR Model**

In [None]:
svm_reg = SVR(kernel="linear")
svm_reg.fit(x_train_scaled, train_y)

**Calculating Prediction Score**

In [None]:
print('SVR Traininig RMSLE = ', RMSLE((train_y) , (svm_reg.predict(x_train_scaled))))
print('SVR Testing RMSLE = ',RMSLE((test_y) ,(svm_reg.predict(x_test_scaled))))

## 8.10 3 Layer Neural Network

In [None]:
from keras import backend as K

def NN_RMSLE(y_actual, y_pred):
    """Loss function: square root of the mean squared difference of y_pred and y_actual.

    Written with TensorFlow ops (``tf`` is imported at the top of the notebook) so the
    loss uses the same package as the ``tensorflow.keras`` model it is compiled into,
    rather than the backend of the separately installed ``keras`` package.
    """

    squared_error = tf.square(y_pred - y_actual)

    return tf.sqrt(tf.reduce_mean(squared_error))

In [None]:
# Splitting Training Data into Trainig and Validation Sets

train_xx, val_xx, train_yy, val_yy = train_test_split(train_x,train_y, test_size=0.2, random_state=42)

In [None]:
# Import the callback from tensorflow.keras, the same package that provides the
# Sequential model and the layers used below.
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential()

# Stop training once the validation loss has not improved for 3 consecutive epochs
earlyStop= EarlyStopping(monitor='val_loss', mode='min', patience=3)

# Three hidden ReLU layers (512 -> 256 -> 64), each followed by dropout,
# and a single linear output unit for the regression target
model.add(layers.Dense(512, activation='relu',input_shape=(train_xx.shape[1],)))
model.add(Dropout(0.5))
model.add(layers.Dense(256, activation='relu'))
model.add(Dropout(0.2))
model.add(layers.Dense(64, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(1  ,activation='linear'))

model.compile(optimizer='adam', loss = NN_RMSLE)

# NOTE(review): the features are passed to the network unscaled — confirm this is intended.
# `callbacks` is documented as a list of callbacks, so the single callback is wrapped in one.
model.fit(train_xx, train_yy, epochs = 15, batch_size = 2048, validation_data=(val_xx,val_yy),callbacks = [earlyStop])

In [None]:
print('Neural Network Training RMSLE = ', model.evaluate(train_x, train_y, verbose=0))
print('Neural Network Testing RMSLE = ', model.evaluate(test_x, test_y, verbose=0))

## 8.11 Scores Comparison

In [None]:
scores_table = PrettyTable(["Model Name","Training RMSLE","Testing RMSLE"])

scores_table.add_row(["LinearRegression", "1.3811028", "1.3812215"])
scores_table.add_row(["ElasticNet", "1.4735792", "1.4732091"])
scores_table.add_row(["Ridge", "1.3842031", "1.3842150"])
scores_table.add_row(["Lasso", "1.3811055", "1.3812206"])
scores_table.add_row(["DecisionTree", "0.6077021", "0.6098925"])
scores_table.add_row(["RandomForestRegressor", "--------", "--------"])
scores_table.add_row(["SGDRegressor", "1.3811040", "1.3812219"])
scores_table.add_row(["SVR", "--------", "--------"])
scores_table.add_row(["Neural Network", "2.2980730", "2.2978658"])

print(scores_table)

**observations:**
1. Out of all tested models, Decision Tree Regressor performed the best with an approximate RMSLE score of 0.6.
2. Linear Regression, Ridge, Lasso and SGDRegressor had competitively close scores, with Linear Regression taking the lead as the second best prediction method.
3. Our constructed Neural Network performed worst out of all tested models, indicating its unsuitability for such a problem.

# 9.  Analyzing the Best Model

##  9.1 Viewing the Scores of the Hyperparameter Combinations Tested During the Grid Search

In [None]:
pd.DataFrame(decission_tree.cv_results_)

## 9.2 Viewing the Relative Importance of Each Attribute for Making Accurate Predictions

In [None]:
# Take the feature names from the frame the tree was fitted on, so every name is paired
# with the importance of the right column even if the selected features change.
features_list = list(train_x.columns)

features_importance = decission_tree.best_estimator_.feature_importances_

# Most important feature first
sorted(zip(features_importance, features_list), reverse=True)

**Observations**
1. Square Feet attribute contributes the most towards predicting the amount of energy consumption.
2. Building Id is the second highly influencing factor, with an importance of 0.25.
3. It seems that attributes relating to time and date do not affect energy consumption much.

**Ensemble Methods**
- We believe that the Random Forest Regressor might offer better performance, as it is an ensemble of Decision Trees and ensembling can further enhance performance. Unfortunately, due to extremely slow running times and sessions crashing, we were unable to test this theory.