In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,  r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Mouting the drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Loading the dataset
# Train Data
temp_df = pd.read_csv("/content/drive/My Drive/Energy-Prediction-Using-BDG2-Data/data/cleaned/train.csv", nrows=0)  # Read only the header
total_columns = len(temp_df.columns)
columns_to_use = temp_df.columns[1:total_columns]
train_data = pd.read_csv("/content/drive/My Drive/Energy-Prediction-Using-BDG2-Data/data/cleaned/train.csv",  usecols=columns_to_use)
print(train_data.head(5))

# Test Data
test_data = pd.read_csv("/content/drive/My Drive/Energy-Prediction-Using-BDG2-Data/data/cleaned/test.csv", usecols=columns_to_use)
print(test_data.head())

# Dropping the columns that are not relevant to our analysis
train_data = train_data.drop(columns=['building_name', 'site_name'])
test_data = test_data.drop(columns=['building_name', 'site_name'])

train_data = train_data[(train_data['meter'] == 'electricity') | (train_data['meter'] == 'chilledwater') | (train_data['meter'] == 'steam')
| (train_data['meter'] == 'hotwater') | (train_data['meter'] == 'gas')]

test_data = test_data[(test_data['meter'] == 'electricity') | (test_data['meter'] == 'chilledwater') | (test_data['meter'] == 'steam')
| (test_data['meter'] == 'hotwater') | (test_data['meter'] == 'gas')]

# Building index on building_id for furhter assessment
train_data.set_index('building_id', inplace=True)
test_data.set_index('building_id', inplace=True)

            building_name        meter        date  meter_reading site_name  \
0  Bear_education_Alfredo  electricity  2016-01-01         2.9050      Bear   
1  Bear_education_Alfredo  electricity  2016-01-02         2.7700      Bear   
2  Bear_education_Alfredo  electricity  2016-01-03         2.6725      Bear   
3  Bear_education_Alfredo  electricity  2016-01-04         4.5650      Bear   
4  Bear_education_Alfredo  electricity  2016-01-05         4.7825      Bear   

  sub_primaryspaceusage    sqm    sqft    timezone  airTemperature  \
0             Education  609.8  6564.0  US/Pacific        5.246861   
1             Education  609.8  6564.0  US/Pacific        5.993973   
2             Education  609.8  6564.0  US/Pacific        5.660314   
3             Education  609.8  6564.0  US/Pacific        5.048507   
4             Education  609.8  6564.0  US/Pacific        4.745567   

   cloudCoverage  dewTemperature  precipDepth1HR  precipDepth6HR  \
0       1.927009        0.254484    

In [4]:
# Filtering for electrity meter_reading
train_data = train_data[train_data['meter'] == 'steam']
test_data = test_data[test_data['meter'] == 'steam']

train_data = train_data.drop(columns=['meter'])
test_data = test_data.drop(columns=['meter'])

In [5]:
# Inspecting the data frames
print(train_data.sample(2))
print('-------------------------------------------------------------')
print(test_data.sample(2))

                   date  meter_reading sub_primaryspaceusage      sqm  \
building_id                                                             
186          2016-07-28      8877.6947     College Classroom  16780.6   
158          2016-04-06      8292.9035    College Laboratory   3079.5   

                 sqft    timezone  airTemperature  cloudCoverage  \
building_id                                                        
186          180625.0  US/Eastern       23.670989       2.214316   
158           33148.0  US/Eastern       10.521538       1.675878   

             dewTemperature  precipDepth1HR  precipDepth6HR  seaLvlPressure  \
building_id                                                                   
186               16.152527        0.667022       17.467215     1012.905035   
158                1.119341        1.032437       12.616553     1016.230858   

             windDirection  windSpeed  season  site_id  
building_id                                             
186

In [6]:
# Separating into X and Y dataframes
X_train = train_data.drop(columns=['meter_reading'])  # Exclude target variable
y_train = train_data['meter_reading']

X_test = test_data.drop(columns=['meter_reading'])  # Exclude target variable
y_test = test_data['meter_reading']

In [7]:
# Convert 'site_id' from numeric to categorical
X_train['site_id'] = X_train['site_id'].astype('category')
X_test['site_id'] = X_test['site_id'].astype('category')

In [8]:
print(X_train.dtypes)
print(X_train.columns)

date                       object
sub_primaryspaceusage      object
sqm                       float64
sqft                      float64
timezone                   object
airTemperature            float64
cloudCoverage             float64
dewTemperature            float64
precipDepth1HR            float64
precipDepth6HR            float64
seaLvlPressure            float64
windDirection             float64
windSpeed                 float64
season                     object
site_id                  category
dtype: object
Index(['date', 'sub_primaryspaceusage', 'sqm', 'sqft', 'timezone',
       'airTemperature', 'cloudCoverage', 'dewTemperature', 'precipDepth1HR',
       'precipDepth6HR', 'seaLvlPressure', 'windDirection', 'windSpeed',
       'season', 'site_id'],
      dtype='object')


In [9]:
# Define features and types based on your dataset
numerical_features = ['sqm', 'sqft', 'airTemperature', 'cloudCoverage', 'dewTemperature',
                      'precipDepth1HR', 'precipDepth6HR', 'seaLvlPressure', 'windDirection', 'windSpeed']
categorical_features = ['timezone', 'season', 'sub_primaryspaceusage', 'site_id']

In [10]:
# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

In [11]:
# Create a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Fit the preprocessor on the training data and transform both training and test data
preprocessor.fit(X_train)
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [12]:
# Convert the processed data back to dense DataFrames
X_train_processed_df = pd.DataFrame(X_train_processed, columns=preprocessor.get_feature_names_out())
X_test_processed_df = pd.DataFrame(X_test_processed, columns=preprocessor.get_feature_names_out())

In [13]:
# Checking the columns
X_train_processed_df.columns

Index(['num__sqm', 'num__sqft', 'num__airTemperature', 'num__cloudCoverage',
       'num__dewTemperature', 'num__precipDepth1HR', 'num__precipDepth6HR',
       'num__seaLvlPressure', 'num__windDirection', 'num__windSpeed',
       'cat__timezone_US/Central', 'cat__timezone_US/Eastern',
       'cat__season_Fall', 'cat__season_Spring', 'cat__season_Summer',
       'cat__season_Winter', 'cat__sub_primaryspaceusage_Auditorium',
       'cat__sub_primaryspaceusage_Classroom',
       'cat__sub_primaryspaceusage_College Classroom',
       'cat__sub_primaryspaceusage_College Laboratory',
       'cat__sub_primaryspaceusage_Research',
       'cat__sub_primaryspaceusage_Student Union', 'cat__site_id_3',
       'cat__site_id_4', 'cat__site_id_6', 'cat__site_id_8', 'cat__site_id_10',
       'cat__site_id_12'],
      dtype='object')

In [14]:
y_train_scaled = np.log1p(y_train.values.reshape(-1, 1))
y_test_scaled = np.log1p(y_test.values.reshape(-1, 1))

### Neural Networks

In [15]:
# Define the neural network model
model = Sequential()
model.add(Dense(64, input_dim=X_train_processed_df.shape[1], activation='relu')) # Adjust input_dim based on your features
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))  # Output layer, adjust units and activation based on your output

# Compile the model
model.compile(loss='mean_squared_error', optimizer='adam')

In [16]:
# Fit the model to the training data
model.fit(X_train_processed, y_train_scaled, epochs=10, batch_size=10, verbose=1)

# Predict the values for X_train
y_pred = model.predict(X_train_processed_df)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# Assuming y_train_scaled and y_pred are both numpy arrays
#mae = mean_absolute_error(y_train_scaled, y_pred)
mse = mean_squared_error(y_train_scaled, y_pred)
r2 = r2_score(y_train_scaled, y_pred)

#print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('R-squared Score:', r2)


Mean Squared Error: 5.619180881178215
R-squared Score: 0.5146740526085418


In [22]:
# Predict the values for X_test
y_pred = model.predict(X_test_processed_df)
y_pred = np.nan_to_num(y_pred, nan=0)
mse = mean_squared_error(y_test_scaled, y_pred)
r2 = r2_score(y_test_scaled, y_pred)
print('Mean Squared Error:', mse)
print('R-squared Score:', r2)

Mean Squared Error: 7.309620919672645
R-squared Score: 0.2696935771070813


In [27]:
import kerastuner as kt
from tensorflow import keras
from tensorflow.keras import layers

def build_model(hp):
    model = keras.Sequential()
    model.add(layers.Dense(
        units=hp.Int('input_units', min_value=32, max_value=256, step=64),  # Reduced max value and increased step
        activation='relu',
        input_shape=(X_train_processed.shape[1],)
    ))

    # Reduced the maximum number of layers
    for i in range(hp.Int('n_layers', 1, 3)):
        model.add(layers.Dense(
            units=hp.Int(f'layer_{i}_units', min_value=32, max_value=256, step=64),  # Simplify layers
            activation='relu'
        ))

    model.add(layers.Dense(1))  # Assuming a single output; adjust based on your problem

    model.compile(
        optimizer=keras.optimizers.Adam(hp.Choice('learning_rate', values=[1e-3, 1e-4])),  # Simplified choices
        loss='mse',  # Assuming regression; change if necessary
        metrics=['mae', 'mse']  # Assuming regression; change if necessary
    )

    return model

tuner = kt.RandomSearch(
    build_model,
    objective='val_mse',  # Assuming regression; change if necessary
    max_trials=10,  # Reduced number of trials for faster execution
    executions_per_trial=2,  # Reduced for quicker assessments
    directory='my_dir',
    project_name='quick_tune'
)

# Note: Adjust validation_split based on your dataset size
tuner.search(
    X_train_processed_df, y_train_scaled,
    epochs=5,  # Reduced number of epochs for tuning
    validation_split=0.1,  # Adjust based on your dataset
    batch_size=256  # Increased batch size for faster processing
)

# Get the optimal hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the model with the best hyperparameters and train it on the data
model = tuner.hypermodel.build(best_hps)
history = model.fit(
    X_train_processed_df, y_train_scaled,
    epochs=10,  # You can increase this for the final model training
    validation_split=0.1,  # Keep consistent with tuning phase or adjust as necessary
    batch_size=256  # Set a fixed batch size
)

Reloading Tuner from my_dir/quick_tune/tuner0.json
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Check and replace NaN values in the test features (if X_test_processed_df is a DataFrame)
if X_test_processed_df.isna().any().any():  # This checks the whole DataFrame
    X_test_processed_df = X_test_processed_df.fillna(0)  # Replace NaN with 0 or use another appropriate method

# Check and replace NaN values in the test labels (if y_test_scaled is a NumPy array)
if np.isnan(y_test_scaled).any():  # This checks the NumPy array
    y_test_scaled = np.nan_to_num(y_test_scaled)  # Replace NaN with 0 (default)

In [32]:
# Assuming your test set is named X_test_processed_df and y_test_scaled
test_loss, test_mae, test_mse = model.evaluate(X_test_processed_df, y_test_scaled, batch_size=256)
print(f'Test Loss: {test_loss}, Test MAE: {test_mae}, Test MSE: {test_mse}')

Test Loss: 7.636962890625, Test MAE: 2.045722484588623, Test MSE: 7.636962890625


In [34]:
y_pred = model.predict(X_test_processed_df, batch_size=256)

# Calculate R-squared score
r_squared = r2_score(y_test_scaled, y_pred)
print(f'R-squared Score: {r_squared}')

R-squared Score: 0.23698891832818003
