In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Data preparation

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

## ignore useless warnings
import warnings
warnings.filterwarnings(action='ignore')
pd.options.display.max_seq_items=8000
pd.options.display.max_rows = 8000

## Import the dataset

In [None]:
#Loading the train set
train = pd.read_csv('/content/drive/MyDrive/energy_dataset/train.csv')
weather_train = pd.read_csv('/content/drive/MyDrive/energy_dataset/weather_train.csv')

In [None]:
building = pd.read_csv('/content/drive/MyDrive/energy_dataset/building_metadata.csv')

In [None]:
# Loading test set
test = pd.read_csv('/content/drive/MyDrive/energy_dataset/test.csv')
weather_test = pd.read_csv('/content/drive/MyDrive/energy_dataset/weather_test.csv')

In [None]:
# Source: https://www.kaggle.com/kernels/scriptcontent/3684066/download
# We use this function to reduce the memory usage of the loaded dataframes.

from pandas.api.types import is_datetime64_any_dtype as is_datetime
from pandas.api.types import is_categorical_dtype

def reduce_mem_usage(df, use_float16=False):
    """Downcast the columns of ``df`` to smaller dtypes and return ``df``.

    The frame is modified in place (columns are reassigned) and also returned
    so the call can be used as ``df = reduce_mem_usage(df)``.

    Rules applied per column:
      * object columns are converted to ``category``;
      * signed / unsigned integer columns are cast to the smallest integer
        type of the same signedness whose range contains the column's
        min and max;
      * float columns are cast to ``float32`` when their range fits, and to
        ``float16`` only when ``use_float16`` is True (float16 keeps roughly
        three significant decimal digits, so it is opt-in);
      * every other dtype (bool, datetime, timedelta, categorical and other
        pandas extension dtypes) is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.
    use_float16 : bool, default False
        Allow float columns to be stored as ``float16``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target dtypes, smallest first.
    signed_ints = (np.int8, np.int16, np.int32)
    unsigned_ints = (np.uint8, np.uint16, np.uint32)
    floats = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Categorical, nullable and tz-aware columns use pandas extension
        # dtypes; they are skipped rather than forced through numpy casts.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        if col_type.kind == 'i':
            candidates, limits = signed_ints, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_ints, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = floats, np.finfo
        else:
            # bool, datetime64, timedelta64, complex, ...: nothing to do.
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            # Never "downcast" to a type that is not actually smaller.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such columns keep their current dtype.
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    if start_mem > 0:
        print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Memory reduce
train = reduce_mem_usage(train)
weather_train=reduce_mem_usage(weather_train)

Memory usage of dataframe is 616.95 MB
Memory usage after optimization is: 173.84 MB
Decreased by 71.8%
Memory usage of dataframe is 9.60 MB
Memory usage after optimization is: 2.59 MB
Decreased by 73.1%


In [None]:
building = reduce_mem_usage(building)

Memory usage of dataframe is 0.07 MB
Memory usage after optimization is: 0.02 MB
Decreased by 73.9%


In [None]:
test = reduce_mem_usage(test)
weather_test = reduce_mem_usage(weather_test)

Memory usage of dataframe is 1272.51 MB
Memory usage after optimization is: 358.53 MB
Decreased by 71.8%
Memory usage of dataframe is 19.04 MB
Memory usage after optimization is: 5.13 MB
Decreased by 73.0%


## Combining the datasets

In [None]:
# Combine everything known about the train set into a single dataframe:
# meter readings + building metadata (by building_id), then the weather
# observations for the same site and timestamp. Left joins keep every reading.
train_df = (
    train
    .merge(building, on='building_id', how='left')
    .merge(weather_train, on=['site_id', 'timestamp'], how='left')
)

In [None]:
# Same combination for the test set: rows + building metadata + weather,
# using left joins so that every test row is kept.
test_df = (
    test
    .merge(building, on='building_id', how='left')
    .merge(weather_test, on=['site_id', 'timestamp'], how='left')
)

## Break the datetime into hour, day, month and year

In [None]:
# We will break the timestamp into hour of the day, day of week, month, day of year, day of month and year.
# The raw timestamp column should probably be dropped before the training stage.

def break_datetime(df):
  """Parse ``df['timestamp']`` and add calendar-part columns to ``df``.

  The ``timestamp`` column is replaced by its ``pd.to_datetime`` conversion and
  six columns are added, in this order: ``hour``, ``dayofweek``, ``month``
  (stored as uint8) and ``dayofyear``, ``day`` (day of month), ``year``
  (stored as uint16). ``df`` is modified in place and also returned.
  """
  df['timestamp'] = pd.to_datetime(df['timestamp'])
  stamps = df['timestamp'].dt

  # (new column / .dt attribute, numpy type used to store it)
  parts = (
      ('hour', np.uint8),
      ('dayofweek', np.uint8),
      ('month', np.uint8),
      ('dayofyear', np.uint16),
      ('day', np.uint16),
      ('year', np.uint16),
  )
  for part, caster in parts:
    df[part] = caster(getattr(stamps, part))
  return df

In [None]:
train_df = break_datetime(train_df)

In [None]:
test_df = break_datetime(test_df)

In [None]:
train_df.head(5)

Unnamed: 0,building_id,meter,timestamp,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,...,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,hour,dayofweek,month,dayofyear,day,year
0,0,0,2016-01-01,0.0,0,Education,7432,2008.0,,25.0,...,,1019.5,0.0,0.0,0,4,1,1,1,2016
1,1,0,2016-01-01,0.0,0,Education,2720,2004.0,,25.0,...,,1019.5,0.0,0.0,0,4,1,1,1,2016
2,2,0,2016-01-01,0.0,0,Education,5376,1991.0,,25.0,...,,1019.5,0.0,0.0,0,4,1,1,1,2016
3,3,0,2016-01-01,0.0,0,Education,23685,2002.0,,25.0,...,,1019.5,0.0,0.0,0,4,1,1,1,2016
4,4,0,2016-01-01,0.0,0,Education,116607,1975.0,,25.0,...,,1019.5,0.0,0.0,0,4,1,1,1,2016


# Preprocessing

## Outlier treatment

In [None]:
from numpy.random import seed
from numpy.random import randn
from numpy import mean
from numpy import std
import scipy.stats as st

In [None]:
num_col=['building_id','meter','meter_reading','square_feet','cloud_coverage','wind_direction','precip_depth_1_hr','sea_level_pressure','wind_speed', 'dew_temperature', 'air_temperature']

In [None]:
# For every numeric column, report how many values lie more than three
# standard deviations away from the column mean.
out_per=[]
for i in num_col:
    column = train_df[i]
    data_mean, data_std = mean(column), std(column)
    # outlier band: mean +/- 3 standard deviations
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    print(i,': \n')
    # Count with vectorised comparisons instead of materialising Python lists
    # of every row. Missing values satisfy neither comparison, so they are
    # included in neither count.
    num_out = int(((column < lower) | (column > upper)).sum())
    print('Identified outliers: %d' % num_out)
    num_nout = int(((column >= lower) & (column <= upper)).sum())
    print('Non-outlier observations: %d' % num_nout)
    print('\n')

## Check building outlier

In [None]:
# Remove, in place, every row that belongs to building 1099.
is_building_1099 = train_df['building_id'] == 1099
drop_indices = train_df.index[is_building_1099].tolist()
train_df.drop(index=drop_indices, inplace=True)

## Preprocessing for Neural Networks

In [None]:
# Keep only the first half of the rows (by position) for the neural-network section.
train_df1 = train_df[:int(train_df.shape[0] / 2)]

In [None]:
# Aggregate the training rows per (meter, building, primary use, month, day):
# meter readings are summed, the weather features and square_feet are averaged.
#
# observed=True is essential here: primary_use is a categorical column after
# reduce_mem_usage, and with the default observed=False pandas emits one row
# for every combination of key levels, including combinations that never occur
# in the data (meter_reading 0.0 with all-NaN features).
# The original cell was flagged as needing up to 25 GB RAM (Colab PRO);
# restricting to observed groups should lower that -- TODO confirm.
# NOTE(review): the target here is a per-day sum, while the test rows further
# down are predicted one row_id at a time -- confirm this is intended.
train_df1 = train_df1.groupby(
    ['meter', 'building_id', 'primary_use', 'month', 'day'],
    observed=True,
).agg({
    'meter_reading': 'sum',
    'air_temperature': 'mean',
    'wind_speed': 'mean',
    'precip_depth_1_hr': 'mean',
    'cloud_coverage': 'mean',
    'square_feet': 'mean',
})

In [None]:
train_df1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,meter_reading,air_temperature,wind_speed,precip_depth_1_hr,cloud_coverage,square_feet
meter,building_id,primary_use,month,day,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,Education,1,1,0.0,23.343750,1.854492,-0.173950,4.285156,7432.0
0,0,Education,1,2,0.0,19.531250,3.925781,-0.083313,5.667969,7432.0
0,0,Education,1,3,0.0,14.828125,5.000000,0.500000,8.000000,7432.0
0,0,Education,1,4,0.0,12.664062,4.285156,0.666504,2.427734,7432.0
0,0,Education,1,5,0.0,14.062500,6.257812,0.000000,1.333008,7432.0
...,...,...,...,...,...,...,...,...,...,...
3,1448,Warehouse/storage,7,27,0.0,,,,,
3,1448,Warehouse/storage,7,28,0.0,,,,,
3,1448,Warehouse/storage,7,29,0.0,,,,,
3,1448,Warehouse/storage,7,30,0.0,,,,,


In [None]:
import gc
gc.collect()

88

In [None]:
# Reduce the test rows to the same feature columns as the training frame,
# keyed by row_id so that every submission row keeps its own features.
# observed=True keeps only key combinations that actually occur; with the
# default (observed=False) the categorical primary_use key makes pandas build
# every combination of key levels.
test_df = test_df.groupby(
    ['row_id', 'meter', 'building_id', 'primary_use', 'month', 'day'],
    observed=True,
).agg({
    'air_temperature': 'mean',
    'wind_speed': 'mean',
    'precip_depth_1_hr': 'mean',
    'cloud_coverage': 'mean',
    'square_feet': 'mean',
})

In [None]:
#!pip install pickle5
#import pickle5 as pickle
#with open("/content/drive/MyDrive/energy_dataset/test_df_groupby.pkl", "rb") as fh:
  #test_df = pickle.load(fh)



In [None]:
test_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,air_temperature,wind_speed,precip_depth_1_hr,cloud_coverage,square_feet
row_id,meter,building_id,primary_use,month,day,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,0,0,Education,1,1,17.796875,3.599609,,4.0,7432.0
1,0,1,Education,1,1,17.796875,3.599609,,4.0,2720.0
2,0,2,Education,1,1,17.796875,3.599609,,4.0,5376.0
3,0,3,Education,1,1,17.796875,3.599609,,4.0,23685.0
4,0,4,Education,1,1,17.796875,3.599609,,4.0,116607.0
...,...,...,...,...,...,...,...,...,...,...
41697595,0,1444,Entertainment/public assembly,5,9,,,,,19619.0
41697596,0,1445,Education,5,9,,,,,4298.0
41697597,0,1446,Entertainment/public assembly,5,9,,,,,11265.0
41697598,0,1447,Lodging/residential,5,9,,,,,29775.0


In [None]:
train_df1 = train_df1.reset_index()

In [None]:
test_df = test_df.reset_index()

In [None]:
# Store the aggregated weather features of the training frame as float32.
for feature in ('wind_speed', 'air_temperature', 'precip_depth_1_hr'):
    train_df1[feature] = train_df1[feature].astype('float32')

In [None]:
# Store the aggregated weather features of the test frame as float32.
for feature in ('wind_speed', 'air_temperature', 'precip_depth_1_hr'):
    test_df[feature] = test_df[feature].astype('float32')

In [None]:
# NaN treatment: weather measurements are filled with the column mean,
# cloud_coverage and square_feet with the column median.
# The filled column is assigned back instead of calling
# df[col].fillna(..., inplace=True): that form operates on a column selection
# and does not update the frame when pandas Copy-on-Write is active.
train_df1['wind_speed'] = train_df1['wind_speed'].fillna(train_df1['wind_speed'].mean())
train_df1['precip_depth_1_hr'] = train_df1['precip_depth_1_hr'].fillna(train_df1['precip_depth_1_hr'].mean())
train_df1['air_temperature'] = train_df1['air_temperature'].fillna(train_df1['air_temperature'].mean())
train_df1['cloud_coverage'] = train_df1['cloud_coverage'].fillna(train_df1['cloud_coverage'].median())
train_df1['square_feet'] = train_df1['square_feet'].fillna(train_df1['square_feet'].median())

In [None]:
# NaN treatment for the test frame: mean for the weather measurements, median
# for cloud_coverage and square_feet. Columns are assigned back rather than
# filled with inplace=True on a column selection, which does not update the
# frame when pandas Copy-on-Write is active.
# NOTE(review): the fill values are computed from the test frame itself;
# consider reusing the training-frame values so both sets are imputed alike.
test_df['precip_depth_1_hr'] = test_df['precip_depth_1_hr'].fillna(test_df['precip_depth_1_hr'].mean())
test_df['wind_speed'] = test_df['wind_speed'].fillna(test_df['wind_speed'].mean())
test_df['air_temperature'] = test_df['air_temperature'].fillna(test_df['air_temperature'].mean())
test_df['cloud_coverage'] = test_df['cloud_coverage'].fillna(test_df['cloud_coverage'].median())
test_df['square_feet'] = test_df['square_feet'].fillna(test_df['square_feet'].median())

In [None]:
train_df1.isnull().sum()

meter                0
building_id          0
primary_use          0
month                0
day                  0
meter_reading        0
air_temperature      0
wind_speed           0
precip_depth_1_hr    0
cloud_coverage       0
square_feet          0
dtype: int64

In [None]:
test_df.isnull().sum()

row_id               0
meter                0
building_id          0
primary_use          0
month                0
day                  0
air_temperature      0
wind_speed           0
precip_depth_1_hr    0
cloud_coverage       0
square_feet          0
dtype: int64

In [None]:
train1_encoded = train_df1[:]

In [None]:
test_encoded = test_df[:]

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
train1_encoded["primary_use"] = LabelEncoder().fit_transform(train1_encoded["primary_use"])

In [None]:
# Encode the test labels with an encoder fitted on the TRAINING labels, so a
# given primary_use string maps to the same integer the model was trained on.
# A separately fitted encoder only agrees when both frames contain exactly the
# same set of labels. transform() raises ValueError on a label unseen in training.
# Assumes train_df1["primary_use"] still holds the original (un-encoded)
# labels at this point -- TODO confirm.
test_encoded["primary_use"] = LabelEncoder().fit(train_df1["primary_use"]).transform(test_encoded["primary_use"])

# Gated Recurrent Unit (GRU) and Prediction for Kaggle Score

In [None]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import backend as K
from keras.losses import MSE
from sklearn.model_selection  import train_test_split
from tensorflow.keras.utils import Sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, LSTM, Embedding
from tensorflow.keras.optimizers import RMSprop,Adam
import keras.backend as K
import tensorflow as tf

In [None]:
from sklearn import preprocessing
from keras import backend as K
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.layers import Dense, LSTM, GRU, Dropout, BatchNormalization
from keras.models import Sequential
from keras.layers import SimpleRNN
from keras import regularizers
from tensorflow import keras

In [None]:
X1 = train1_encoded[['meter', 'building_id', 'primary_use', 'month', 'day','air_temperature', 'wind_speed', 'precip_depth_1_hr', 'cloud_coverage',
       'square_feet']]
y1 = train1_encoded['meter_reading']

In [None]:
x_train1, x_val1, y_train1, y_val1 = train_test_split(X1,y1, test_size = 0.2, random_state= 45)

In [None]:
from sklearn.metrics import mean_squared_log_error

def root_mean_squared_error(y_true, y_pred):
  """Root-mean-squared error between ``y_pred`` and ``y_true``, built from Keras backend ops."""
  squared_error = K.square(y_pred - y_true)
  return K.sqrt(K.mean(squared_error))

In [None]:
# The following function reshapes the data and applies the log transformation to the target feature 'meter_reading',
# since during T1 we were able to deduce that it was heavily skewed.

def transform(x_train,y_train,epochs=50,batch_size=500,verbose=1,validation_data=(x_val1,y_val1),callbacks =None):
  """Reshape features to (samples, 1, features) and log-transform the targets.

  Parameters
  ----------
  x_train : DataFrame-like with a ``.values`` array of shape (samples, features).
  y_train : training targets; ``np.log1p`` is applied to them.
  validation_data : ``(x_val, y_val)`` pair treated the same way, or ``None``.
  epochs, batch_size, verbose, callbacks : accepted for call compatibility
      only; this function does not use them.

  Returns
  -------
  tuple
      ``(x_train, y_train, x_val, y_val)``. ``x_val`` and ``y_val`` are ``None``
      when ``validation_data`` is ``None`` (previously this case failed at the
      return statement because the two names were never assigned).
  """
  train_features = x_train.values
  # Add a sequence axis of length 1: (samples, features) -> (samples, 1, features).
  x_train = train_features.reshape((train_features.shape[0], 1, train_features.shape[-1]))
  y_train = np.log1p(y_train)

  x_val, y_val = None, None
  if validation_data is not None:
    val_features = validation_data[0].values
    x_val = val_features.reshape((val_features.shape[0], 1, val_features.shape[-1]))
    y_val = np.log1p(validation_data[-1])
  return x_train,y_train,x_val,y_val

In [None]:
Early_stop = EarlyStopping(monitor='val_root_mean_squared_error', min_delta=0.0001, patience=5, verbose=True, mode='auto')

In [None]:
input_dim=x_train1.shape[-1]
input_dim

10

In [None]:
# Two stacked GRU layers followed by a single-unit regression head.
model = Sequential()
# Feature count is taken from the training frame instead of a hard-coded 10,
# so the network stays in sync with the selected feature columns.
model.add(GRU(128,return_sequences=True, input_shape=(None,x_train1.shape[-1])))
model.add(Dropout(0.2))
model.add(BatchNormalization())
model.add(GRU(128,return_sequences=False))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(1))
# 'accuracy' is dropped from the metrics: the target is continuous, so only
# the RMSE (on the log1p-transformed target) is tracked.
model.compile(optimizer="rmsprop", loss="mse", metrics=[root_mean_squared_error])
# transform() reshapes the features to (samples, 1, features) and applies log1p to the targets.
x_train,y_train,x_val,y_val = transform(x_train1,y_train1,validation_data=(x_val1,y_val1))
# Callbacks are passed as a list, the documented form.
model.fit(x_train,y_train,epochs=30,batch_size=500,verbose=1,validation_data=(x_val,y_val),callbacks=[Early_stop])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 6: early stopping


<keras.callbacks.History at 0x7f8190e4c050>

In [None]:
model.save('GRU')



INFO:tensorflow:Assets written to: GRU/assets


INFO:tensorflow:Assets written to: GRU/assets


In [None]:
# Reload the model from the location it was saved to above (model.save('GRU')).
# The previous path, /content/GRU.h5, is not written anywhere in this notebook.
# The custom metric has to be supplied again so Keras can rebuild the compiled model.
model = keras.models.load_model("GRU", custom_objects={"root_mean_squared_error": root_mean_squared_error })

## Test Set Prediction (kaggle)

In [None]:
submit = pd.read_csv('/content/drive/MyDrive/energy_dataset/sample_submission.csv')
test_X = test_encoded[['meter', 'building_id', 'primary_use', 'month', 'day','air_temperature', 'wind_speed', 'precip_depth_1_hr', 'cloud_coverage','square_feet']]

In [None]:
test_X = test_X.values[:]

In [None]:
test_X = test_X.reshape((test_X.shape[0],1,test_X.shape[-1]))

In [None]:
prediction = model.predict(test_X)

In [None]:
# Undo the log1p transform applied to the training target, flatten the (n, 1)
# model output to one value per row, and clip at zero because a meter reading
# cannot be negative.
prediction = np.clip(np.expm1(prediction), 0, None).ravel()
submit['meter_reading'] = prediction
submit.to_csv('submission.csv', index=False,float_format='%.4f')