### Pipeline for Pre-processing 

In [1]:
# Importing the required libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,  r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
#import tensorflow.keras.layers as layers
from scipy.sparse import isspmatrix
import matplotlib.pyplot as plt
#from sklearn.model_selection import GridSearchCV
#from scikeras.wrappers import KerasRegressor

In [2]:
# Load datasets

# Read only the header row of the training file to learn the column names;
# the first column is left out when the actual data is loaded.
temp_df = pd.read_csv("../../data/cleaned/train.csv", nrows=0)
total_columns = len(temp_df.columns)
columns_to_use = temp_df.columns[1:total_columns]

# Train and test data are read with the same column selection. The two name
# columns are not relevant to this analysis, so they are dropped right away.
train_data = pd.read_csv("../../data/cleaned/train.csv", usecols=columns_to_use).drop(
    columns=['building_name', 'site_name']
)
test_data = pd.read_csv("../../data/cleaned/test.csv", usecols=columns_to_use).drop(
    columns=['building_name', 'site_name']
)

# Building index on building_id for further assessment
#train_data.set_index('building_id', inplace=True)
#test_data.set_index('building_id', inplace=True)

In [3]:
# Keep only the electricity meter readings; once filtered, the 'meter' column
# carries a single value and is dropped in the same step.
train_data = train_data.loc[train_data['meter'] == 'electricity'].drop(columns=['meter'])
test_data = test_data.loc[test_data['meter'] == 'electricity'].drop(columns=['meter'])

In [4]:
# Inspecting the data frames: two random rows from each split, separated by a rule
print(
    train_data.sample(2),
    '-------------------------------------------------------------',
    test_data.sample(2),
    sep='\n',
)

              date  meter_reading sub_primaryspaceusage      sqm      sqft  \
34688   2016-10-11          0.000              Academic   6052.1   65144.0   
258138  2016-04-18      12691.183    College Laboratory  14537.7  156483.0   

           timezone  airTemperature  cloudCoverage  dewTemperature  \
34688   US/Mountain       14.455044       1.334908        7.430044   
258138   US/Central       15.188158       1.747599        4.782018   

        precipDepth1HR  precipDepth6HR  seaLvlPressure  windDirection  \
34688         0.366489       11.150400     1022.003142     145.886540   
258138        0.993188       12.707306     1020.930271     169.464298   

        windSpeed  season  building_id  site_id  
34688    2.772494    Fall           76        2  
258138   3.517544  Spring          309        8  
-------------------------------------------------------------
              date  meter_reading sub_primaryspaceusage      sqm      sqft  \
232419  2017-10-07      2873.2800    College

In [5]:
# Separating into X and Y dataframes: 'meter_reading' is the target variable,
# every remaining column is kept as a feature.
X_train, y_train = train_data.drop(columns=['meter_reading']), train_data['meter_reading']
X_test, y_test = test_data.drop(columns=['meter_reading']), test_data['meter_reading']

#### Setting up a pipeline to process the data 

In [6]:
# Apply the same dtype fixes to both feature frames:
#   - 'site_id' is numeric in the file but is treated as a categorical label
#   - 'date' is parsed into a proper datetime column
for features_frame in (X_train, X_test):
    features_frame['site_id'] = features_frame['site_id'].astype('category')
    features_frame['date'] = pd.to_datetime(features_frame['date'])

In [7]:
# Show the dtype of every training feature, followed by the column index
print(X_train.dtypes, X_train.columns, sep='\n')

date                     datetime64[ns]
sub_primaryspaceusage            object
sqm                             float64
sqft                            float64
timezone                         object
airTemperature                  float64
cloudCoverage                   float64
dewTemperature                  float64
precipDepth1HR                  float64
precipDepth6HR                  float64
seaLvlPressure                  float64
windDirection                   float64
windSpeed                       float64
season                           object
building_id                       int64
site_id                        category
dtype: object
Index(['date', 'sub_primaryspaceusage', 'sqm', 'sqft', 'timezone',
       'airTemperature', 'cloudCoverage', 'dewTemperature', 'precipDepth1HR',
       'precipDepth6HR', 'seaLvlPressure', 'windDirection', 'windSpeed',
       'season', 'building_id', 'site_id'],
      dtype='object')


In [8]:
# Define features and types based on your dataset

# Columns that are scaled as continuous values
numerical_features = [
    'sqm',
    'sqft',
    'airTemperature',
    'cloudCoverage',
    'dewTemperature',
    'precipDepth1HR',
    'precipDepth6HR',
    'seaLvlPressure',
    'windDirection',
    'windSpeed',
]

# Columns that are one-hot encoded
categorical_features = [
    'timezone',
    'season',
    'sub_primaryspaceusage',
    'site_id',
]

# Columns kept aside for ordering the rows; they are not model inputs
date_feature = 'date'
id_feature = 'building_id'

In [9]:
# Keep the 'building_id' and 'date' values as plain arrays so they can be
# re-attached after preprocessing (they are dropped from the frames next).
building_ids_train, dates_train = X_train[id_feature].values, X_train[date_feature].values
building_ids_test, dates_test = X_test[id_feature].values, X_test[date_feature].values

In [10]:
# The 'building_id' and 'date' columns are not model inputs; remove them before preprocessing
X_train = X_train.drop(columns=[id_feature, date_feature])
X_test = X_test.drop(columns=[id_feature, date_feature])

# Preprocessing pipeline: min-max scale the numerical columns and one-hot encode
# the categorical ones. Categories not seen during fitting are ignored at
# transform time (handle_unknown='ignore').
scaling_step = ('num', MinMaxScaler(), numerical_features)
encoding_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[scaling_step, encoding_step])

# Fit on the training data only, then apply the fitted transformer to both splits
X_train_processed = preprocessor.fit(X_train).transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [11]:
# Turn the (sparse) preprocessor output into dense DataFrames, labelled with
# the feature names generated by the preprocessor.
processed_columns = preprocessor.get_feature_names_out()
X_train_processed_df = pd.DataFrame(X_train_processed.toarray(), columns=processed_columns)
X_test_processed_df = pd.DataFrame(X_test_processed.toarray(), columns=processed_columns)

In [12]:
# For each split: put the saved 'building_id' and 'date' values back onto the
# processed frame, then sort in place by building and date so the rows of every
# building appear in chronological order.
for frame, ids, dates in (
    (X_train_processed_df, building_ids_train, dates_train),
    (X_test_processed_df, building_ids_test, dates_test),
):
    frame[id_feature] = ids
    frame[date_feature] = dates
    frame.sort_values(by=[id_feature, date_feature], inplace=True)

In [13]:
# Display the column index of the processed training frame
X_train_processed_df.columns

Index(['num__sqm', 'num__sqft', 'num__airTemperature', 'num__cloudCoverage',
       'num__dewTemperature', 'num__precipDepth1HR', 'num__precipDepth6HR',
       'num__seaLvlPressure', 'num__windDirection', 'num__windSpeed',
       'cat__timezone_Europe/Dublin', 'cat__timezone_Europe/London',
       'cat__timezone_US/Central', 'cat__timezone_US/Eastern',
       'cat__timezone_US/Mountain', 'cat__timezone_US/Pacific',
       'cat__season_Fall', 'cat__season_Spring', 'cat__season_Summer',
       'cat__season_Winter', 'cat__sub_primaryspaceusage_Academic',
       'cat__sub_primaryspaceusage_Auditorium',
       'cat__sub_primaryspaceusage_Classroom',
       'cat__sub_primaryspaceusage_College Classroom',
       'cat__sub_primaryspaceusage_College Laboratory',
       'cat__sub_primaryspaceusage_Education',
       'cat__sub_primaryspaceusage_K-12 School',
       'cat__sub_primaryspaceusage_Other - Education',
       'cat__sub_primaryspaceusage_Primary/Secondary Classroom',
       'cat__sub_pri

In [14]:
# y_train and y_test are expected to be pandas Series holding 'meter_reading'.
# MinMaxScaler works on 2-D input, so each Series is reshaped to a single column.
scaler = MinMaxScaler()
train_target_column = y_train.values.reshape(-1, 1)
test_target_column = y_test.values.reshape(-1, 1)

# The scaler is fitted on the training targets only ...
y_train_scaled = scaler.fit_transform(train_target_column)

# ... and merely applied to the test targets, so no test information leaks into the scaling
y_test_scaled = scaler.transform(test_target_column)

In [15]:
# Making it suitable for timeseries data
# Number of consecutive rows that form one input window in create_sequences
sequence_length = 5

def create_sequences(data, target, sequence_length):
    """Build fixed-length sliding windows over the rows of ``data``.

    Window ``k`` contains rows ``k`` .. ``k + sequence_length - 1`` and is paired
    with the target value at row ``k + sequence_length`` (the row that follows
    the window).

    Parameters
    ----------
    data : scipy sparse matrix, pandas DataFrame or numpy array
        Two-dimensional feature table with one row per time step.
    target : array-like
        One target per row of ``data``. It is always addressed by position, so
        the index labels of a pandas Series are ignored.
    sequence_length : int
        Number of rows per window; must be at least 1.

    Returns
    -------
    X : numpy.ndarray
        Shape ``(n_rows - sequence_length, sequence_length, n_features)``.
        When ``data`` has no more than ``sequence_length`` rows the first
        dimension is 0 but the remaining dimensions are kept.
    y : numpy.ndarray
        The targets that belong to the windows in ``X``, in the same order.

    Raises
    ------
    ValueError
        If ``sequence_length`` is smaller than 1 or ``target`` has fewer
        entries than ``data`` has rows.
    """
    if sequence_length < 1:
        raise ValueError("sequence_length must be a positive integer")

    # Convert to a dense array once, up front, instead of on every window.
    if isspmatrix(data):
        dense = data.toarray()
    elif hasattr(data, 'iloc'):
        dense = data.values
    else:
        dense = np.asarray(data)

    # np.asarray makes the lookup positional; `target[i]` on a Series would be
    # label-based and fails (or misaligns) once the index is no longer 0..n-1.
    targets = np.asarray(target)

    n_rows = dense.shape[0]
    if targets.shape[0] < n_rows:
        raise ValueError(
            "target has %d entries but data has %d rows" % (targets.shape[0], n_rows)
        )

    n_windows = n_rows - sequence_length
    if n_windows <= 0:
        # Not enough rows for a single window: return correctly shaped empties
        # so callers can still read X.shape[1] and X.shape[2].
        empty_X = np.empty((0, sequence_length) + dense.shape[1:], dtype=dense.dtype)
        return empty_X, targets[:0].copy()

    X = np.stack([dense[start:start + sequence_length] for start in range(n_windows)])
    y = targets[sequence_length:n_rows].copy()
    return X, y

# The processed frames were created with a default 0..n-1 index and then sorted
# by (building_id, date), so their index now lists the original row positions in
# sorted order. Use it to bring the unsorted feature matrices and targets into
# that same order; otherwise the windows are cut from rows in file order and the
# sort has no effect on the sequences.
train_order = X_train_processed_df.index.to_numpy()
test_order = X_test_processed_df.index.to_numpy()

# Recompute the scaled targets from the untouched y_train / y_test (instead of
# reordering y_*_scaled in place) so this cell gives the same result when it is
# re-run. The results are flat numpy arrays in the sorted row order.
y_train_scaled = scaler.transform(y_train.to_numpy().reshape(-1, 1)).ravel()[train_order]
y_test_scaled = scaler.transform(y_test.to_numpy().reshape(-1, 1)).ravel()[test_order]

# Create the windows from the sorted rows.
# NOTE(review): the windows are still cut from one continuous stream, so a window
# can span the boundary between two buildings.
X_train_sequences, y_train_scaled_sequences = create_sequences(
    X_train_processed[train_order], y_train_scaled, sequence_length
)
X_test_sequences, y_test_scaled_sequences = create_sequences(
    X_test_processed[test_order], y_test_scaled, sequence_length
)


#### Fitting the RNN 

In [None]:
# Define RNN model: one LSTM layer reading windows of shape
# (sequence_length, n_features), followed by a single linear output neuron
# because this is a regression on 'meter reading'.
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train_sequences.shape[1], X_train_sequences.shape[2])))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Fit the RNN model on the windowed training data. The targets must be
# y_train_scaled_sequences (one value per window); y_train_scaled still holds
# the first `sequence_length` rows that have no window, so it is longer than
# X_train_sequences and cannot be paired with it.
history = model.fit(
    X_train_sequences,
    y_train_scaled_sequences,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)



2024-03-05 10:53:01.139891: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-03-05 10:53:01.139952: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-03-05 10:53:01.139963: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-03-05 10:53:01.140443: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-05 10:53:01.140992: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/10


In [None]:
# Evaluate the model on the test data
y_pred_scaled = model.predict(X_test_sequences)

# NaN predictions are replaced with 0 so the metrics can be computed, but they
# are reported first because they indicate that training diverged.
nan_predictions = int(np.isnan(y_pred_scaled).sum())
if nan_predictions:
    print(f'Warning: {nan_predictions} NaN predictions were replaced with 0 before scoring')
y_pred_scaled = np.nan_to_num(y_pred_scaled, nan=0)

# Inverse the scaling to get predictions and targets back on the original
# 'meter reading' scale. The true values come from y_test_scaled_sequences, which
# was built together with X_test_sequences; y_test_scaled itself is left
# untouched so that re-running this cell does not trim it a second time.
y_pred = scaler.inverse_transform(y_pred_scaled)
y_true = scaler.inverse_transform(np.asarray(y_test_scaled_sequences).reshape(-1, 1))

# Calculate MSE and R^2
mse = mean_squared_error(y_true, y_pred)
r_squared = r2_score(y_true, y_pred)

print(f'Mean Squared Error (MSE) on Test Data: {mse}')
print(f'R-squared Score on Test Data: {r_squared}')

In [None]:
# Scatter of predicted against true values, with the diagonal as reference
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_true, y_pred, color='blue', alpha=0.5)

# Dashed diagonal spanning the range of the true values
diagonal_ends = [y_true.min(), y_true.max()]
ax.plot(diagonal_ends, diagonal_ends, 'k--', lw=2)

ax.set_xlabel('True Values')
ax.set_ylabel('Predicted Values')
ax.set_title('True vs Predicted Values')
ax.grid(True)
plt.show()

##### Hyperparameter tuning 

In [None]:
 def create_rnn_model(lstm_units=50, activation='relu', optimizer='adam', input_shape=(None, None)):
     """Build and compile a single-layer LSTM regression model.

     Parameters
     ----------
     lstm_units : int
         Number of units in the LSTM layer.
     activation : str
         Activation passed to the LSTM layer.
     optimizer : str
         Optimizer passed to ``compile``.
     input_shape : tuple
         Input shape passed to the LSTM layer.

     Returns
     -------
     The compiled ``Sequential`` model (LSTM followed by ``Dense(1)``), using
     mean squared error as the loss.
     """
     recurrent_layer = LSTM(units=lstm_units, activation=activation, input_shape=input_shape)
     output_layer = Dense(1)

     network = Sequential([recurrent_layer, output_layer])
     network.compile(optimizer=optimizer, loss='mse')
     return network

# Note: Adjust the input_shape dynamically based on your actual data
#model = KerasRegressor(model=create_rnn_model, lstm_units=50, optimizer='adam', input_shape=(X_train_sequences.shape[1], X_train_sequences.shape[2]))

In [None]:
# Hyperparameter grid for the search.
# NOTE(review): the 'model__' prefix presumably routes a value to
# create_rnn_model through the KerasRegressor wrapper - confirm against the
# scikeras documentation.
param_grid = dict(
    model__lstm_units=[20, 50, 100],
    # Different activation functions to try
    model__activation=['relu', 'tanh', 'sigmoid'],
    batch_size=[16, 32, 64],
    epochs=[10, 20],
    optimizer=['adam', 'rmsprop'],
)

In [None]:
# GridSearchCV and KerasRegressor are only present as commented-out lines in the
# notebook's import cell, so this cell would stop with a NameError. They are
# imported here to keep the optional scikeras dependency confined to the
# hyperparameter search.
from sklearn.model_selection import GridSearchCV
from scikeras.wrappers import KerasRegressor

# Wrap the model factory so scikit-learn can drive it; the input shape is taken
# from the windowed training data. Note that this rebinds `model`, which until
# now referred to the Keras model fitted earlier in the notebook.
model = KerasRegressor(model=create_rnn_model, input_shape=(X_train_sequences.shape[1], X_train_sequences.shape[2]))

# Setup GridSearchCV with the param_grid including 'model__activation'
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3, scoring='neg_mean_squared_error')

# Perform the search on the windowed targets: y_train_scaled_sequences has one
# value per window, whereas y_train_scaled is `sequence_length` rows longer than
# X_train_sequences.
grid_result = grid.fit(X_train_sequences, y_train_scaled_sequences)

# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))