In [16]:
# Importing the required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,  r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [2]:
# Load the cleaned train/test CSVs, keep only the supported meter types,
# and index both frames by building_id.

TRAIN_CSV = "../../data/cleaned/train.csv"
TEST_CSV = "../../data/cleaned/test.csv"
SUPPORTED_METERS = ['electricity', 'chilledwater', 'steam', 'hotwater', 'gas']

# Read just the header row so the first column can be left out of the real read.
temp_df = pd.read_csv(TRAIN_CSV, nrows=0)
total_columns = temp_df.shape[1]
columns_to_use = temp_df.columns[1:total_columns]

# Both files are read with the column selection derived from the train header.
train_data = pd.read_csv(TRAIN_CSV, usecols=columns_to_use)
test_data = pd.read_csv(TEST_CSV, usecols=columns_to_use)

# Keep rows whose meter is one of the supported types, then index by building_id
# for further assessment.
train_data = train_data[train_data['meter'].isin(SUPPORTED_METERS)].set_index('building_id')
test_data = test_data[test_data['meter'].isin(SUPPORTED_METERS)].set_index('building_id')

In [3]:
# Print two randomly sampled rows from the train frame, a divider line,
# then two randomly sampled rows from the test frame.
for position, frame in enumerate((train_data, test_data)):
    if position:
        print('-------------------------------------------------------------')
    print(frame.sample(2))

                     building_name        meter        date  meter_reading  \
building_id                                                                  
313            Hog_education_Sonia        steam  2016-11-16     55207.3826   
53           Bear_education_Pattie  electricity  2016-03-21      3084.5000   

            site_name sub_primaryspaceusage      sqm      sqft    timezone  \
building_id                                                                  
313               Hog    College Laboratory  17103.3  184098.0  US/Central   
53               Bear             Education   8032.9   86465.0  US/Pacific   

             airTemperature  cloudCoverage  dewTemperature  precipDepth1HR  \
building_id                                                                  
313               12.008991       1.487829         5.98461        0.448363   
53                 8.442544       2.023483        -0.94057        0.445467   

             precipDepth6HR  seaLvlPressure  windDirection  w

In [4]:
# Split each frame into the 'meter_reading' target (y) and everything else (X)
y_train = train_data['meter_reading']
X_train = train_data.drop('meter_reading', axis=1)

y_test = test_data['meter_reading']
X_test = test_data.drop('meter_reading', axis=1)

In [5]:
# Cast 'site_id' to pandas' category dtype in both feature frames
for features in (X_train, X_test):
    features['site_id'] = features['site_id'].astype('category')

In [6]:
# Show each training feature's dtype, followed by the column index itself
for schema_view in (X_train.dtypes, X_train.columns):
    print(schema_view)

building_name              object
meter                      object
date                       object
site_name                  object
sub_primaryspaceusage      object
sqm                       float64
sqft                      float64
timezone                   object
airTemperature            float64
cloudCoverage             float64
dewTemperature            float64
precipDepth1HR            float64
precipDepth6HR            float64
seaLvlPressure            float64
windDirection             float64
windSpeed                 float64
season                     object
site_id                  category
dtype: object
Index(['building_name', 'meter', 'date', 'site_name', 'sub_primaryspaceusage',
       'sqm', 'sqft', 'timezone', 'airTemperature', 'cloudCoverage',
       'dewTemperature', 'precipDepth1HR', 'precipDepth6HR', 'seaLvlPressure',
       'windDirection', 'windSpeed', 'season', 'site_id'],
      dtype='object')


In [7]:
# Columns routed to the numeric branch of the preprocessing step
numerical_features = [
    'sqm',
    'sqft',
    'airTemperature',
    'cloudCoverage',
    'dewTemperature',
    'precipDepth1HR',
    'precipDepth6HR',
    'seaLvlPressure',
    'windDirection',
    'windSpeed',
]

# Columns routed to the categorical (one-hot) branch of the preprocessing step
categorical_features = [
    'timezone',
    'season',
    'sub_primaryspaceusage',
    'site_id',
]

In [8]:
# Column-wise preprocessing: numeric columns are min-max scaled, categorical
# columns are one-hot encoded with unknown categories ignored at transform time.
# NOTE(review): the following cell constructs an identical ColumnTransformer
# again before fitting it.
numeric_step = ('num', MinMaxScaler(), numerical_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [9]:
# Build the column-wise preprocessing step: min-max scaling for the numeric
# columns, one-hot encoding (unknown categories ignored) for the categorical ones.
feature_steps = [
    ('num', MinMaxScaler(), numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=feature_steps)

# Fit on the training features only, then apply the fitted transform to the
# test features so both share the same scaling ranges and category columns.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [10]:
# Densify the transformer outputs via .toarray() and wrap them in DataFrames
# whose columns carry the transformer's output feature names.
processed_feature_names = preprocessor.get_feature_names_out()
X_train_processed_df = pd.DataFrame(
    X_train_processed.toarray(),
    columns=processed_feature_names,
)
X_test_processed_df = pd.DataFrame(
    X_test_processed.toarray(),
    columns=processed_feature_names,
)

In [11]:
# Checking the columns of the processed training frame: the recorded output
# lists the scaled numeric features ('num__*') followed by the one-hot
# encoded categorical features ('cat__*').
X_train_processed_df.columns

Index(['num__sqm', 'num__sqft', 'num__airTemperature', 'num__cloudCoverage',
       'num__dewTemperature', 'num__precipDepth1HR', 'num__precipDepth6HR',
       'num__seaLvlPressure', 'num__windDirection', 'num__windSpeed',
       'cat__timezone_Europe/Dublin', 'cat__timezone_Europe/London',
       'cat__timezone_US/Central', 'cat__timezone_US/Eastern',
       'cat__timezone_US/Mountain', 'cat__timezone_US/Pacific',
       'cat__season_Fall', 'cat__season_Spring', 'cat__season_Summer',
       'cat__season_Winter', 'cat__sub_primaryspaceusage_Academic',
       'cat__sub_primaryspaceusage_Auditorium',
       'cat__sub_primaryspaceusage_Classroom',
       'cat__sub_primaryspaceusage_College Classroom',
       'cat__sub_primaryspaceusage_College Laboratory',
       'cat__sub_primaryspaceusage_Education',
       'cat__sub_primaryspaceusage_K-12 School',
       'cat__sub_primaryspaceusage_Other - Education',
       'cat__sub_primaryspaceusage_Primary/Secondary Classroom',
       'cat__sub_pri

In [12]:
# log1p-transform both targets and shape them as single-column 2-D arrays
y_train_scaled = np.log1p(y_train.values).reshape(-1, 1)
y_test_scaled = np.log1p(y_test.values).reshape(-1, 1)

### Neural Networks

In [18]:
# Define the neural network model: two ReLU hidden layers (64 and 32 units)
# feeding a single linear output unit for regression.
#
# The input width must equal the number of columns in the preprocessed feature
# matrix (scaled numeric + one-hot encoded columns). The raw X_train frame has a
# different column count and still contains string columns that a Dense layer
# cannot consume, so its shape must not be used here.
n_input_features = X_train_processed.shape[1]

model = Sequential()
model.add(Dense(64, input_dim=n_input_features, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))  # Single continuous output

# Compile the model with mean squared error loss and the Adam optimizer
model.compile(loss='mean_squared_error', optimizer='adam')

2024-03-06 19:03:43.273043: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M2
2024-03-06 19:03:43.273093: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2024-03-06 19:03:43.273104: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2024-03-06 19:03:43.273485: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2024-03-06 19:03:43.273911: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [None]:
# Fit the model to the training data.
# The raw X_train frame still holds string columns (building_name, meter, date,
# site_name, ...) that Keras cannot convert to tensors, so the dense,
# preprocessed feature frame is used instead. Arrays are passed explicitly so
# the pandas indexes play no role.
# NOTE(review): the log1p target (y_train_scaled) computed earlier is not used
# here; the network is trained on the raw meter readings as in the original
# cell -- confirm which target was intended.
X_train_nn = X_train_processed_df.to_numpy()
model.fit(X_train_nn, y_train.to_numpy(), epochs=100, batch_size=10, verbose=1)

# Predict the values for the (preprocessed) training features
y_pred = model.predict(X_train_nn)

## LightGBM

### LightGBM for all meter values with hyperparameter tuning

In [None]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid searched independently for every meter type
param_grid = {
    'num_leaves': [20, 31, 40, 50],
    'learning_rate': [0.01, 0.05, 0.1],
    'feature_fraction': [0.8, 0.9],
}


unique_meters = train_data['meter'].unique()
results = []  # one dict per meter type: meter value, best params, hold-out R^2

for meter_value in unique_meters:
    print(f"Processing meter value: {meter_value}")

    # Select this meter's rows once, then split into features and target
    meter_rows = train_data[train_data['meter'] == meter_value]
    X = meter_rows.drop(columns=['meter_reading', 'meter'])
    y = meter_rows['meter_reading']

    # Convert 'site_id' to categorical so it is one-hot encoded below
    X['site_id'] = X['site_id'].astype('category')

    # Hold out 20% of this meter's training rows for evaluation (fixed seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Preprocessing is fitted on the training split only and reused on the hold-out
    preprocessor = ColumnTransformer(transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # Scale the target to the training split's min/max range
    y_scaler = MinMaxScaler()
    y_train_scaled = y_scaler.fit_transform(np.array(y_train).reshape(-1, 1))
    y_test_scaled = y_scaler.transform(np.array(y_test).reshape(-1, 1))

    # Base estimator for the search
    model = LGBMRegressor(boosting_type='gbdt', force_col_wise=True)

    # 3-fold grid search scored by negative mean squared error
    grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3)

    # Fit the grid search to the data
    grid.fit(X_train_processed, y_train_scaled.flatten())

    # Get the best parameters
    best_params = grid.best_params_

    # Evaluate the model that was actually tuned. GridSearchCV (refit=True by
    # default) has already refitted the gbdt estimator with best_params on the
    # full training split. Rebuilding it with boosting_type='rf' would score a
    # different algorithm than the one the parameters were selected for.
    final_model = grid.best_estimator_

    # Model prediction and evaluation on the hold-out split (scaled target space)
    y_pred_scaled = final_model.predict(X_test_processed)
    rsquared = r2_score(y_test_scaled, y_pred_scaled)


    results.append({
        'meter_value': meter_value,
        'best_params': best_params,
        'r-squared': rsquared
    })

# Convert results to a DataFrame and print
results_df = pd.DataFrame(results)
print(results_df)

In [None]:
import ast

# best_params entries that are strings are parsed back into dicts; dict entries
# are left untouched (presumably for when results_df was reloaded from disk).
results_df['best_params'] = results_df['best_params'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Convert the DataFrame to a dictionary where keys are meter values and values are the best parameter sets
best_params_dict = pd.Series(results_df.best_params.values, index=results_df.meter_value).to_dict()

results = []  # one dict per meter type: meter value and test-set predictions
for meter_value in unique_meters:
    print(f"Fitting model for meter type: {meter_value}")

    # Filtering the training data for this meter type
    train_rows = train_data[train_data['meter'] == meter_value]
    X_train = train_rows.drop(columns=['meter_reading', 'meter'])
    y_train = train_rows['meter_reading']
    X_train['site_id'] = X_train['site_id'].astype('category')  # Convert 'site_id' to categorical

    # Build fresh transformers for every meter type instead of reusing whatever
    # objects an earlier cell's loop happened to leave behind in the kernel.
    preprocessor = ColumnTransformer(transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])
    y_scaler = MinMaxScaler()

    X_train_processed = preprocessor.fit_transform(X_train)
    y_train_scaled = y_scaler.fit_transform(np.array(y_train).reshape(-1, 1))

    # Filtering the test data the same way as the training data. drop() returns
    # a new frame, so the site_id assignment below does not write into a slice
    # of test_data (which would be chained assignment).
    X_test = test_data[test_data['meter'] == meter_value].drop(columns=['meter_reading', 'meter'])
    X_test['site_id'] = X_test['site_id'].astype('category')  # Convert 'site_id' to categorical
    X_test_processed = preprocessor.transform(X_test)

    # Train the final model with the parameters tuned for this meter type. The
    # grid search tuned a 'gbdt' booster, so the same boosting type is used
    # here; these parameters were never selected for 'rf'.
    best_params = best_params_dict[meter_value]
    final_model = LGBMRegressor(boosting_type='gbdt', force_col_wise=True, **best_params)
    final_model.fit(X_train_processed, y_train_scaled.flatten())

    # Predict on the test dataset and convert predictions back to original scale
    y_pred_scaled = final_model.predict(X_test_processed)
    y_pred = y_scaler.inverse_transform(y_pred_scaled.reshape(-1, 1))

    results.append({
        'meter_value': meter_value,
        'predictions': y_pred.flatten()
    })


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score every meter type's test predictions against the true meter readings.
# `results` is expected to be a list of dicts with 'meter_value' and 'predictions'.
updated_results = []

for entry in results:
    meter_value, predictions = entry['meter_value'], entry['predictions']

    # True readings for this meter type, taken from the test frame
    y_true = test_data.loc[test_data['meter'] == meter_value, 'meter_reading'].values

    if len(predictions) == len(y_true):
        mse = mean_squared_error(y_true, predictions)
        updated_results.append({
            'meter_value': meter_value,
            'mse': mse,
            'r_squared': r2_score(y_true, predictions),
            'negative_mse': -mse
        })
    else:
        # Lengths disagree: report it and leave this meter type out of the summary
        print(f"Error: Mismatched number of predictions and true values for meter {meter_value}")

# Collect the per-meter metrics into a DataFrame and print it
updated_results_df = pd.DataFrame(updated_results)
print(updated_results_df)


In [14]:
# Display the column labels of the processed training feature frame
# (same inspection as the earlier "Checking the columns" cell).
X_train_processed_df.columns

Index(['num__sqm', 'num__sqft', 'num__airTemperature', 'num__cloudCoverage',
       'num__dewTemperature', 'num__precipDepth1HR', 'num__precipDepth6HR',
       'num__seaLvlPressure', 'num__windDirection', 'num__windSpeed',
       'cat__timezone_Europe/Dublin', 'cat__timezone_Europe/London',
       'cat__timezone_US/Central', 'cat__timezone_US/Eastern',
       'cat__timezone_US/Mountain', 'cat__timezone_US/Pacific',
       'cat__season_Fall', 'cat__season_Spring', 'cat__season_Summer',
       'cat__season_Winter', 'cat__sub_primaryspaceusage_Academic',
       'cat__sub_primaryspaceusage_Auditorium',
       'cat__sub_primaryspaceusage_Classroom',
       'cat__sub_primaryspaceusage_College Classroom',
       'cat__sub_primaryspaceusage_College Laboratory',
       'cat__sub_primaryspaceusage_Education',
       'cat__sub_primaryspaceusage_K-12 School',
       'cat__sub_primaryspaceusage_Other - Education',
       'cat__sub_primaryspaceusage_Primary/Secondary Classroom',
       'cat__sub_pri

In [15]:
# Display the current scaled training target.
# NOTE(review): y_train_scaled is assigned in several cells (the log1p
# transform and the MinMaxScaler calls inside the LightGBM loops), so the
# values shown depend on which of those cells ran last.
y_train_scaled

array([[1.36225778],
       [1.327075  ],
       [1.30087263],
       ...,
       [7.26893232],
       [7.18440168],
       [6.90135987]])