# Energy Usage Prediction - Model Building

In [2]:
import pandas as pd
import numpy as np

## Load dataset

In [4]:
# Paths to the train and test CSV files (relative to the working directory).
train_path, test_path = 'data/train.csv', 'data/test.csv'

# Read both files into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [5]:
# A year_built of 0 is treated as missing: swap it for NaN in both frames so
# the numeric pipeline's median imputer can fill it in later.
train_df['year_built'] = train_df['year_built'].replace(to_replace=0, value=np.nan)
test_df['year_built'] = test_df['year_built'].replace(to_replace=0, value=np.nan)

In [6]:
# Number of fully duplicated rows in train and test, one count per line.
print(train_df.duplicated().sum(), test_df.duplicated().sum(), sep='\n')

0
0


## Create X and y

In [7]:
# Target is site_eui; the features are every column except the target and the row id.
y = train_df['site_eui']
X = train_df.drop(columns=['site_eui', 'id'])

In [8]:
from sklearn import model_selection

# Hold out 10% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = model_selection.train_test_split(
    X,
    y,
    random_state=25,
    test_size=.1,
)

In [9]:
# Test features: same frame minus the row id (the test set has no target column to drop).
X_test = test_df.drop(columns=['id'])

## Data preprocessing

In [10]:
from sklearn import pipeline
from sklearn import impute
from sklearn import preprocessing
from sklearn import compose

In [11]:
# Numeric branch: fill missing values with the column median, then standardise.
num_pipe = pipeline.Pipeline(steps=[
    ('median_imputer', impute.SimpleImputer(strategy="median")),
    ('standard_scaler', preprocessing.StandardScaler()),
])

In [12]:
# Categorical branch: one-hot encode. Categories below the 5% frequency
# threshold are grouped into a single "infrequent" column, and categories
# not seen during fit are ignored at transform time.
cat_pipe = pipeline.Pipeline(steps=[
    ('ohe', preprocessing.OneHotEncoder(handle_unknown='ignore', min_frequency=.05)),
])

In [13]:
# Route columns by dtype: non-object columns go through num_pipe,
# object columns go through cat_pipe.
preproc = compose.ColumnTransformer(transformers=[
    ('num_pipe', num_pipe, X_train.select_dtypes(exclude='object').columns),
    ('cat_pipe', cat_pipe, X_train.select_dtypes(include='object').columns),
])

In [14]:
# Display the assembled (not yet fitted) ColumnTransformer.
preproc

In [15]:
# from sklearn import linear_model
# from sklearn import metrics

# model = linear_model.SGDRegressor(
#     loss = rmse,
#     early_stopping = True,
#     validation_fraction = .1,
#     n_iter_no_change = 5 
# )

In [16]:
# full_pipe = pipeline.make_pipeline(
#     preproc,
#     model
# )

In [17]:
# Fit the preprocessor on the training split only, then apply the fitted
# transform to the validation split. Columns are labelled with the
# transformer's output feature names.
X_train_preproc = pd.DataFrame(
    data=preproc.fit_transform(X_train),
    columns=preproc.get_feature_names_out(),
)
X_val_preproc = pd.DataFrame(
    data=preproc.transform(X_val),
    columns=preproc.get_feature_names_out(),
)

In [18]:
# Preview the first ten preprocessed training rows.
X_train_preproc.iloc[:10]

Unnamed: 0,num_pipe__Year_Factor,num_pipe__floor_area,num_pipe__year_built,num_pipe__energy_star_rating,num_pipe__ELEVATION,num_pipe__january_min_temp,num_pipe__january_avg_temp,num_pipe__january_max_temp,num_pipe__february_min_temp,num_pipe__february_avg_temp,...,cat_pipe__State_Factor_State_2,cat_pipe__State_Factor_State_4,cat_pipe__State_Factor_State_6,cat_pipe__State_Factor_infrequent_sklearn,cat_pipe__building_class_Commercial,cat_pipe__building_class_Residential,cat_pipe__facility_type_Education_Other_classroom,cat_pipe__facility_type_Multifamily_Uncategorized,cat_pipe__facility_type_Office_Uncategorized,cat_pipe__facility_type_infrequent_sklearn
0,0.430191,-0.442469,1.722111,-1.344469,-0.347695,1.765087,1.586583,0.734321,1.61214,1.441798,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,-0.927341,-0.32723,-0.82116,1.416306,-0.49801,1.125801,1.496712,1.10667,1.055693,1.393477,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.108957,3.865022,1.101801,1.243758,-0.213717,1.765087,1.305447,-0.568899,1.771125,1.368623,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,1.108957,-0.409567,-1.007253,-1.862114,-0.238225,-0.046224,0.019598,-0.010376,-1.011113,0.24504,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
4,-1.606107,-0.368781,-1.410454,-1.128783,-0.60748,1.658539,2.091244,1.851368,1.691633,1.817668,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
5,-0.248575,-0.366497,-0.417958,0.639838,-0.591141,-0.792057,-0.816895,-0.196551,-0.216188,-0.440677,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6,1.108957,-0.556302,-0.697098,0.16533,0.099985,2.724016,2.625863,0.548147,2.407066,2.41445,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
7,-0.248575,-0.390656,-0.728113,-1.430743,-0.591141,-0.792057,-0.816895,-0.196551,-0.216188,-0.440677,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
8,-1.606107,-0.380358,-0.014757,0.16533,0.050969,0.166872,0.429779,0.548147,0.65823,0.60272,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
9,0.430191,-0.31736,-0.417958,-0.266041,-0.228422,-0.365867,-0.637153,-0.568899,-0.772636,-1.306413,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0


In [19]:
# Apply the already-fitted preprocessor to the test features.
X_test_preproc = pd.DataFrame(
    data=preproc.transform(X_test),
    columns=preproc.get_feature_names_out(),
)

## Model building

In [20]:
import tensorflow as tf

2023-06-22 14:50:31.970835: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-22 14:50:34.724870: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-06-22 14:50:34.724945: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-06-22 14:50:35.120346: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-06-22 14:50:47.525684: W tensorflow/stream_executor/platform/de

In [21]:
# Width of the network's input layer = number of preprocessed feature columns.
input_dim = len(X_train_preproc.columns)

In [24]:
# rmse loss function
def rmse_loss(y_true, y_pred):
    '''Root-mean-squared error over the whole batch, for use as a Keras loss.

    The previous version averaged over ``axis=1``. For a single-output
    network that axis has length 1, so each sample's value was
    ``sqrt(err ** 2) == |err|`` and the optimised quantity was the mean
    absolute error rather than RMSE. Here the squared errors are averaged
    over every element of the batch before the square root is taken.

    Args:
        y_true: ground-truth targets; must hold as many elements as y_pred.
        y_pred: model predictions.

    Returns:
        A scalar tensor: sqrt(mean((y_pred - y_true) ** 2)) for the batch.

    Note:
        Keras reports the epoch loss as an average of the per-batch values,
        which is not exactly the RMSE over the full epoch; the 'mse' metric
        gives the exact squared error.
    '''
    y_pred = tf.convert_to_tensor(y_pred)
    # Align dtype and shape explicitly so a (batch,) target can never
    # silently broadcast against a (batch, 1) prediction.
    y_true = tf.reshape(tf.cast(y_true, y_pred.dtype), tf.shape(y_pred))
    return tf.math.sqrt(tf.math.reduce_mean(tf.math.square(y_pred - y_true)))

In [25]:
# NOTE: no compile call here. `model` is not created until the training cells
# below call model_main(), so a bare model.compile(...) at this point raises
# NameError on a fresh kernel (Restart & Run All). Compilation with rmse_loss,
# the 'adam' optimizer and the 'mse' metric is done by compile_model().

In [38]:
def instantiate_model(input_dim, hidden_units=(256, 128, 64, 32), dropout_rate=.1):
    '''Build (but do not compile) a fully connected regression network.

    The network is a stack of ReLU Dense layers with a Dropout layer after
    every hidden layer except the last, followed by one linear output unit.
    With the default arguments the architecture is 256-128-64-32-1 with a
    dropout rate of 0.1.

    Args:
        input_dim: number of input features.
        hidden_units: widths of the hidden Dense layers, in order.
        dropout_rate: rate used by each Dropout layer.

    Returns:
        The uncompiled tf.keras.Model. Its summary is printed as a side effect.
    '''
    inputs = tf.keras.Input(shape=(input_dim,))

    x = inputs
    last_hidden = len(hidden_units) - 1
    for position, units in enumerate(hidden_units):
        x = tf.keras.layers.Dense(units, activation='relu')(x)
        # No dropout directly in front of the output layer.
        if position < last_hidden:
            x = tf.keras.layers.Dropout(rate=dropout_rate)(x)

    outputs = tf.keras.layers.Dense(1, activation='linear')(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    # summary() prints the table itself and returns None; wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    return model

In [39]:
def compile_model(model):
    '''Configure `model` for training in place and hand it back.

    The training objective is rmse_loss, the optimizer is 'adam', and
    'mse' is tracked as an additional metric.
    '''
    compile_kwargs = dict(optimizer='adam', loss=rmse_loss, metrics=['mse'])
    model.compile(**compile_kwargs)
    return model

In [40]:
def model_main(input_dim):
    '''Build a network for `input_dim` input features and return it compiled.'''
    return compile_model(instantiate_model(input_dim))

## Model training (with tensorboard)

In [29]:
# Load the TensorBoard notebook extension so the `%tensorboard` magic used
# further down is available.
%load_ext tensorboard

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [34]:
from datetime import datetime
from packaging import version

# Report the installed TensorFlow version and refuse to continue on 1.x.
print("TensorFlow version: ", tf.__version__)
assert version.parse(tf.__version__).major >= 2, "This notebook requires TensorFlow 2.0 or above."

TensorFlow version:  2.10.0


In [39]:
# # Clear any logs from previous runs
# !rm -rf ./logs/

In [40]:
# One timestamped sub-directory of logs/ per run, handed to the TensorBoard callback.
logdir = ''.join(['logs/', datetime.now().strftime("%Y%m%d-%H%M%S")])
tb_cb = tf.keras.callbacks.TensorBoard(log_dir=logdir)

In [41]:
# Fresh, compiled network for this run.
model = model_main(input_dim)

# Stop once val_loss has gone 10 epochs without improving and roll the
# weights back to the best epoch seen.
es = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='val_loss',
)

# Timestamped log directory for this run, plus a default summary writer
# pointing at its "metrics" sub-directory.
logdir = ''.join(['logs/', datetime.now().strftime("%Y%m%d-%H%M%S")])
tb_cb = tf.keras.callbacks.TensorBoard(log_dir=logdir)
file_writer = tf.summary.create_file_writer(logdir + "/metrics")
file_writer.set_as_default()

# Train with early stopping and TensorBoard logging; the epoch count is only
# an upper bound because early stopping ends the run.
history = model.fit(
    X_train_preproc, y_train,
    batch_size=32,
    epochs=1000,
    validation_data=(X_val_preproc, y_val),
    callbacks=[es, tb_cb],
    verbose=1,
)

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_3 (InputLayer)        [(None, 71)]              0         
                                                                 
 dense_10 (Dense)            (None, 256)               18432     
                                                                 
 dense_11 (Dense)            (None, 128)               32896     
                                                                 
 dense_12 (Dense)            (None, 64)                8256      
                                                                 
 dense_13 (Dense)            (None, 32)                2080      
                                                                 
 dense_14 (Dense)            (None, 1)                 33        
                                                                 
Total params: 61,697
Trainable params: 61,697
Non-trainable

In [37]:
# Launch TensorBoard from the notebook, pointed at the `logs` directory.
%tensorboard --logdir logs

Reusing TensorBoard on port 6006 (pid 11389), started 0:00:43 ago. (Use '!kill 11389' to kill it.)

## Model training (with mlflow)

In [29]:
import mlflow

In [41]:
# Switch on MLflow autologging before the model is built and trained.
mlflow.autolog()

# Fresh, compiled network for this run.
model = model_main(input_dim)

# Stop once val_loss has gone 10 epochs without improving and roll the
# weights back to the best epoch seen.
es = tf.keras.callbacks.EarlyStopping(
    restore_best_weights=True,
    patience=10,
    monitor='val_loss',
)

# Train with early stopping; the epoch count is only an upper bound.
history = model.fit(
    X_train_preproc, y_train,
    batch_size=32,
    epochs=1000,
    validation_data=(X_val_preproc, y_val),
    callbacks=[es],
    verbose=1,
)

2023/06/22 15:35:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for tensorflow.


2023/06/22 15:35:09 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.


Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 71)]              0         
                                                                 
 dense_20 (Dense)            (None, 256)               18432     
                                                                 
 dropout_3 (Dropout)         (None, 256)               0         
                                                                 
 dense_21 (Dense)            (None, 128)               32896     
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_22 (Dense)            (None, 64)                8256      
                                                                 
 dropout_5 (Dropout)         (None, 64)                0   

2023/06/22 15:35:10 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID 'e308ce714f4a4a3e8776d62e92db6df6', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current tensorflow workflow


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000




INFO:tensorflow:Assets written to: /tmp/tmpe_3f1yd1/model/data/model/assets




## Prediction

In [32]:
# Score the preprocessed test features with the most recently trained model.
pred = model.predict(x=X_test_preproc)



In [36]:
from datetime import datetime
from pathlib import Path

# Label embedded in the output file name.
# NOTE(review): presumably the MLflow run name of the model used above — confirm.
exp_name = 'amazing-finch-125'

# Put each test id next to its prediction; the single prediction column
# (named 0 by default) is renamed to site_eui.
pred_df = pd.concat((test_df[['id']], pd.DataFrame(pred)), axis=1).rename(columns={0: 'site_eui'})

# to_csv does not create missing directories, so make sure pred/ exists
# rather than failing after training and prediction have already run.
pred_dir = Path('pred')
pred_dir.mkdir(parents=True, exist_ok=True)
pred_df.to_csv(pred_dir / f'{datetime.now().strftime("%Y%m%d-%H%M%S")}_{exp_name}.csv', index=False)