In [1]:
# Importing the required libraries 
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error,  r2_score
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import ast


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load datasets

# Train data: read just the header first so that the first column can be
# excluded when the full file is loaded.
temp_df = pd.read_csv("../../data/cleaned/train.csv", nrows=0)
total_columns = len(temp_df.columns)
columns_to_use = temp_df.columns[1:total_columns]
train_data = pd.read_csv("../../data/cleaned/train.csv", usecols=columns_to_use)

# Test data: loaded with the same column subset as the training file.
test_data = pd.read_csv("../../data/cleaned/test.csv", usecols=columns_to_use)

# Drop the columns that are not relevant to the analysis.
irrelevant_columns = ['building_name', 'site_name', 'date']
train_data = train_data.drop(columns=irrelevant_columns)
test_data = test_data.drop(columns=irrelevant_columns)

# Keep only the rows whose meter type is one of the five listed here.
kept_meters = ['electricity', 'chilledwater', 'steam', 'hotwater', 'gas']
train_data = train_data[train_data['meter'].isin(kept_meters)]
test_data = test_data[test_data['meter'].isin(kept_meters)]

# Index both frames on building_id for further assessment.
train_data = train_data.set_index('building_id')
test_data = test_data.set_index('building_id')

In [None]:
# Quick look at both frames: two random rows from each, separated by a rule.
print(
    train_data.sample(2),
    '-------------------------------------------------------------',
    test_data.sample(2),
    sep='\n',
)

In [4]:
# Feature groups consumed by the preprocessing ColumnTransformer.

# Columns passed to the numeric transformer.
numerical_features = [
    'sqm',
    'sqft',
    'airTemperature',
    'cloudCoverage',
    'dewTemperature',
    'precipDepth1HR',
    'precipDepth6HR',
    'seaLvlPressure',
    'windDirection',
    'windSpeed',
]

# Columns passed to the one-hot encoder.
categorical_features = [
    'timezone',
    'season',
    'sub_primaryspaceusage',
    'site_id',
]

### Random Forest

### Random Forest Parameter Tuning

In [6]:

# Meter types that get their own, independently tuned model.
unique_meters = ['electricity', 'chilledwater', 'steam', 'hotwater', 'gas']

# One record per meter type: hold-out R² and the best hyper-parameters found.
results = []

for meter_value in unique_meters:
    print(f"Processing meter value: {meter_value}")

    # Select this meter's rows once, then separate features from the target.
    meter_rows = train_data[train_data['meter'] == meter_value]
    X = meter_rows.drop(columns=['meter_reading', 'meter'])
    y = meter_rows['meter_reading']

    # Convert 'site_id' from numeric to categorical
    X['site_id'] = X['site_id'].astype('category')

    # Hold out 20% of this meter's rows to score the tuned model.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Min-max scale the numeric columns and one-hot encode the categorical
    # ones; columns in neither list are dropped by the ColumnTransformer.
    preprocessor = ColumnTransformer(transformers=[
        ('num', MinMaxScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

    # Fit the preprocessing on the training split only, then reuse it as-is
    # on the hold-out split.
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    # NOTE: the target is modelled on its original scale; no log transform
    # is applied, so the R² below is reported in meter_reading units.

    # Seed the forest so the search and the reported score are reproducible.
    rf = RandomForestRegressor(random_state=42)
    param_distributions = {
        'n_estimators': [100, 150],
        'max_depth': [None, 10, 20],
        'min_samples_split': [100],
        'min_samples_leaf': [100]
    }

    # Randomized search: 5 sampled candidates, each scored with 3-fold CV.
    rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_distributions,
                                   n_iter=5,
                                   cv=3,
                                   verbose=1, random_state=42, n_jobs=-1)

    rf_random.fit(X_train_processed, y_train)
    print(f"Best parameters for meter value {meter_value}: {rf_random.best_params_}")

    # Score the refitted best estimator on the hold-out split.
    y_pred = rf_random.best_estimator_.predict(X_test_processed)
    r2 = r2_score(y_test, y_pred)
    results.append({'meter_value': meter_value, 'r-squared': r2, 'best_params': rf_random.best_params_})

results_df = pd.DataFrame(results)
print(results_df[['meter_value', 'r-squared', 'best_params']])


Processing meter value: electricity
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters for meter value electricity: {'n_estimators': 150, 'min_samples_split': 100, 'min_samples_leaf': 100, 'max_depth': 20}
Processing meter value: chilledwater
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters for meter value chilledwater: {'n_estimators': 100, 'min_samples_split': 100, 'min_samples_leaf': 100, 'max_depth': 20}
Processing meter value: steam
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters for meter value steam: {'n_estimators': 100, 'min_samples_split': 100, 'min_samples_leaf': 100, 'max_depth': None}
Processing meter value: hotwater
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters for meter value hotwater: {'n_estimators': 100, 'min_samples_split': 100, 'min_samples_leaf': 100, 'max_depth': 20}
Processing meter value: gas
Fitting 3 folds for each of 5 candidates, totalling 15 fits
Be

In [10]:
# Show the hold-out R² per meter type (all rows, two columns).
results_df.loc[:, ['meter_value', 'r-squared']]

Unnamed: 0,meter_value,r-squared
0,electricity,0.900766
1,chilledwater,0.851433
2,steam,0.322394
3,hotwater,0.730532
4,gas,0.805817


# Predictions

In [25]:


# Convert the string representation of dictionaries to actual dictionaries;
# entries that are already dicts are left unchanged.
results_df['best_params'] = results_df['best_params'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Map each meter type to its best parameter set.
best_params_dict = dict(zip(results_df['meter_value'], results_df['best_params']))

# One record per meter type: test-set MSE, R² and the raw predictions.
results = []
for meter_value in unique_meters:
    print(f"Fitting model for meter type: {meter_value}")

    # Select this meter's training rows once, then separate features / target.
    train_rows = train_data[train_data['meter'] == meter_value]
    X_train = train_rows.drop(columns=['meter_reading', 'meter'])
    y_train = train_rows['meter_reading']

    # Column groups are derived from dtypes and kept in loop-local names so the
    # notebook-level `categorical_features` list is not overwritten.
    # NOTE(review): this differs from the preprocessing used during tuning
    # (explicit feature lists, MinMaxScaler, 'site_id' one-hot encoded) —
    # confirm that the tuned hyper-parameters are meant to carry over.
    numeric_cols = X_train.select_dtypes(include=['float64', 'int64']).columns
    categorical_cols = X_train.select_dtypes(include=['object']).columns

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),  # Impute missing values with median
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),  # Impute missing values with 'Missing'
        ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encode categorical features
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])

    # Build the final model from the best parameters found for this meter type.
    # A default seed makes the fit reproducible; a 'random_state' entry in the
    # stored parameters, if present, takes precedence.
    best_params = best_params_dict[meter_value]
    final_model = RandomForestRegressor(**{'random_state': 42, **best_params})
    model_pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', final_model)])
    model_pipeline.fit(X_train, y_train)

    # Select this meter's test rows once; preprocessing happens inside the
    # pipeline at predict time.
    test_rows = test_data[test_data['meter'] == meter_value]
    y_test = test_rows['meter_reading']
    X_test = test_rows.drop(columns=['meter_reading', 'meter'])

    # Predict on the test dataset
    y_pred = model_pipeline.predict(X_test)

    # Evaluate the model on the test data
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    results.append({
        'meter_value': meter_value,
        'mse': mse,
        'r_squared': r2,
        'predictions': y_pred
    })



Fitting model for meter type: electricity
Fitting model for meter type: chilledwater
Fitting model for meter type: steam
Fitting model for meter type: hotwater
Fitting model for meter type: gas


In [28]:
# Report the test-set R² of each per-meter model, one line per meter type.
for meter_value, r_squared in ((res['meter_value'], res['r_squared']) for res in results):
    print(f"Meter value: {meter_value}, R-squared: {r_squared}")

Meter value: electricity, R-squared: 0.8526786664889417
Meter value: chilledwater, R-squared: 0.6125232218931265
Meter value: steam, R-squared: -16013.703113977832
Meter value: hotwater, R-squared: 0.5622079015059647
Meter value: gas, R-squared: 0.73969688287177
