# Sci-kit Learn Models for Building Energy Modelling

Each regressor has its own strengths and weaknesses, making the choice dependent on the specific dataset characteristics, desired interpretability, and performance requirements.

In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import TimeSeriesSplit

In [None]:
## Regression model library from Scikit-Learn library
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import HuberRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import TheilSenRegressor
    
# Make array of models. Each model is an array of two elements.
# First element is a model-name, second is a model itself
models = [#['RandomForestRegressor', RandomForestRegressor(n_estimators = 1000, random_state = 42)],
#['AdaBoostRegressor', AdaBoostRegressor(n_estimators = 1000, random_state = 42)],
#['BaggingRegressor', BaggingRegressor(n_estimators = 1000, random_state = 42)],
#['DecisionTreeRegressor', DecisionTreeRegressor(random_state = 42)],
#['DummyRegressor', DummyRegressor()],
#['ExtraTreeRegressor', ExtraTreeRegressor(random_state = 42)],
#['ExtraTreesRegressor', ExtraTreesRegressor(n_estimators = 1000, random_state = 42)],
['GaussianProcessRegressor', GaussianProcessRegressor(random_state = 42)],
# ['GradientBoostingRegressor', GradientBoostingRegressor(n_estimators = 1000, random_state = 42)],
# ['HuberRegressor', HuberRegressor()],
# ['KNeighborsRegressor', KNeighborsRegressor()],
# ['MLPRegressor', MLPRegressor(random_state = 42)],
# ['PassiveAggressiveRegressor', PassiveAggressiveRegressor(random_state = 42)],
# ['RANSACRegressor', RANSACRegressor(random_state = 42)],
# ['SGDRegressor', SGDRegressor(random_state = 42)],
# ['TheilSenRegressor', TheilSenRegressor(random_state = 42)]
]

## 1. Load cleaned meter data, weather data and schedule data

In [8]:
# Define output file name and path
input_dir = 'data/chapter4/campus-bldg/cleaned'  # Directory where output files will be saved
input_file = os.path.join(input_dir, 'clean_meter_data.csv')     # Output CSV file name

# Load cleaned meter data
meter_data = pd.read_csv(input_file, index_col=0, parse_dates=True)

# Display the first few rows of the meter data
print("Meter Data:")
print(meter_data.head())
print(meter_data.info())

# Resample the meter data to hourly frequency
meter_data_hourly = meter_data['power_kw'].resample('H').mean()

# Display the first few rows of the resampled meter data
print("Resampled Meter Data:")
print(meter_data_hourly.head())

Meter Data:
                     power_kw         site_name
datetime                                       
2019-05-01 00:00:00    40.856  J Don Boney Bldg
2019-05-01 00:15:00    42.392  J Don Boney Bldg
2019-05-01 00:30:00    42.776  J Don Boney Bldg
2019-05-01 00:45:00    41.472  J Don Boney Bldg
2019-05-01 01:00:00    39.552  J Don Boney Bldg
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 35136 entries, 2019-05-01 00:00:00 to 2020-04-30 23:45:00
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   power_kw   35132 non-null  float64
 1   site_name  35136 non-null  object 
dtypes: float64(1), object(1)
memory usage: 823.5+ KB
None
Resampled Meter Data:
datetime
2019-05-01 00:00:00    41.874
2019-05-01 01:00:00    36.749
2019-05-01 02:00:00    33.984
2019-05-01 03:00:00    31.834
2019-05-01 04:00:00    31.642
Freq: h, Name: power_kw, dtype: float64


  meter_data_hourly = meter_data['power_kw'].resample('H').mean()


In [9]:
# Load weather data
weather_file = os.path.join(input_dir, '77051_2019-01-01_2020-12-31_Weather.csv')
weather_data = pd.read_csv(weather_file, index_col=['datetime'], parse_dates=True)

# Display the first few rows of the weather data
print("\nWeather Data:")
print(weather_data.head())
print(weather_data.info())

# Subset the weather data to the same time period as the meter data
weather_data = weather_data[meter_data_hourly.index.min():meter_data_hourly.index.max()]

# Select outdoor temperature and dewpoint
outdoor_temp = weather_data['temperature']
outdoor_dewpoint = weather_data['dewPoint']
outdoor_cloudcover = weather_data['cloudCover']

# Display the first few rows of the outdoor temperature and dewpoint data
print("\nOutdoor Temperature and Dewpoint:")
print(pd.DataFrame({'outdoor_temp': outdoor_temp, 'outdoor_dewpoint': outdoor_dewpoint, 'outdoor_cloudcover': outdoor_cloudcover}).head())


Weather Data:
                                          time        summary  \
datetime                                                        
2019-01-01 00:00:00  2019-01-01 00:00:00-06:00  Mostly Cloudy   
2019-01-01 01:00:00  2019-01-01 01:00:00-06:00  Mostly Cloudy   
2019-01-01 02:00:00  2019-01-01 02:00:00-06:00  Partly Cloudy   
2019-01-01 03:00:00  2019-01-01 03:00:00-06:00  Partly Cloudy   
2019-01-01 04:00:00  2019-01-01 04:00:00-06:00  Partly Cloudy   

                                    icon  precipIntensity  precipAccumulation  \
datetime                                                                        
2019-01-01 00:00:00  partly-cloudy-night              0.0                 0.0   
2019-01-01 01:00:00  partly-cloudy-night              0.0                 0.0   
2019-01-01 02:00:00  partly-cloudy-night              0.0                 0.0   
2019-01-01 03:00:00  partly-cloudy-night              0.0                 0.0   
2019-01-01 04:00:00  partly-cloudy-night   

In [15]:
# Load seasonal schedule data
schedule_file = os.path.join(input_dir, 'schedule.csv')
schedule_data = pd.read_csv(schedule_file, index_col=['Date'], parse_dates=True)

# Display the first few rows of the schedule data
print("\nSchedule Data:")
# print(schedule_data.head())
# print(schedule_data.info())

# Rename the columns for clarity
schedule_data.columns = ['seasonal']

# Resample the schedule data to hourly frequency
schedule_data = schedule_data.reindex(pd.date_range(start=meter_data_hourly.index.min(), 
                                                   end=meter_data_hourly.index.max(), 
                                                   freq='h')).ffill().bfill()

# Display the first few rows of the resampled schedule data
print("\nResampled Schedule Data:")
print(schedule_data.head())
print(schedule_data.info())


Schedule Data:

Resampled Schedule Data:
                    seasonal
2019-05-01 00:00:00  Regular
2019-05-01 01:00:00  Regular
2019-05-01 02:00:00  Regular
2019-05-01 03:00:00  Regular
2019-05-01 04:00:00  Regular
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 8784 entries, 2019-05-01 00:00:00 to 2020-04-30 23:00:00
Freq: h
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   seasonal  8784 non-null   object
dtypes: object(1)
memory usage: 137.2+ KB
None


In [18]:
# Create an array of months from the meter data
months = np.array([meter_data_hourly.index.month.unique()])[0]
n_splits = 3
tscv = TimeSeriesSplit(n_splits=n_splits)

# Display the months and number of splits
print(tscv)

for train_index, test_index in tscv.split(months):
    month_train, month_test = months[train_index], months[test_index]
    print(month_train, month_test)

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None)
[5 6 7] [ 8  9 10]
[ 5  6  7  8  9 10] [11 12  1]
[ 5  6  7  8  9 10 11 12  1] [2 3 4]


In [19]:
# Create train-test indices for time-series split and every-fourth-month split
def create_train_test_indices(months):
    train_test_lists = []
    
    #Get time-series split version
    n_splits = 3
    tscv = TimeSeriesSplit(n_splits=n_splits)
    for train_index, test_index in tscv.split(months):
        month_train, month_test = months[train_index], months[test_index]
        train_test_lists.append([month_train, month_test])
        
    #Add the 'every-fourth-month' version
    train_test_lists.append([np.concatenate([months[0:3], months[4:7], 
                                            months[8:11]]), np.array([months[4], months[7], months[11]])])
    
    return train_test_lists

In [20]:
# Create train-test indices
train_test_lists = create_train_test_indices(months)

# Display the train-test indices
print("Train-Test Indices:")
print("Each element is a list of two arrays: train and test indices")
for train_index, test_index in train_test_lists:  
    print(train_index, test_index)

Train-Test Indices:
Each element is a list of two arrays: train and test indices
[5 6 7] [ 8  9 10]
[ 5  6  7  8  9 10] [11 12  1]
[ 5  6  7  8  9 10 11 12  1] [2 3 4]
[ 5  6  7  9 10 11  1  2  3] [ 9 12  4]


In [22]:
train_index

array([ 5,  6,  7,  9, 10, 11,  1,  2,  3], dtype=int32)

In [21]:
# Create a list of training months
training_data = meter_data_hourly[meter_data_hourly.index.month.isin(train_index)]
# Display the first few rows of the training data
print("\nTraining Data:")
print(training_data.head())


Training Data:
datetime
2019-05-01 00:00:00    41.874
2019-05-01 01:00:00    36.749
2019-05-01 02:00:00    33.984
2019-05-01 03:00:00    31.834
2019-05-01 04:00:00    31.642
Name: power_kw, dtype: float64


In [24]:
# Merge the training data with weather and schedule data
training_data = pd.merge(pd.DataFrame({"energy":training_data}), outdoor_temp, right_index=True, left_index=True)
training_data = pd.merge(training_data, outdoor_dewpoint, right_index=True, left_index=True)
training_data = pd.merge(training_data, outdoor_cloudcover, right_index=True, left_index=True)

# Merge the training data with schedule data dummy variable
training_data = pd.merge(training_data, pd.get_dummies(schedule_data), right_index=True, left_index=True)

# Display the first few rows of the merged training data
print("\nMerged Training Data:")
print(training_data.head())
print(training_data.info())


Merged Training Data:
                     energy  temperature  dewPoint  cloudCover  \
2019-05-01 00:00:00  41.874        25.19     22.06        0.99   
2019-05-01 01:00:00  36.749        24.78     21.85        0.73   
2019-05-01 02:00:00  33.984        24.68     21.74        0.96   
2019-05-01 03:00:00  31.834        24.32     21.60        0.70   
2019-05-01 04:00:00  31.642        24.04     21.97        0.44   

                     seasonal_Break  seasonal_Holiday  seasonal_Regular  \
2019-05-01 00:00:00           False             False              True   
2019-05-01 01:00:00           False             False              True   
2019-05-01 02:00:00           False             False              True   
2019-05-01 03:00:00           False             False              True   
2019-05-01 04:00:00           False             False              True   

                     seasonal_Summer  
2019-05-01 00:00:00            False  
2019-05-01 01:00:00            False  
2019-05-01 0

In [27]:
# Create a function to create a list of training and testing data
def get_features_and_labels(meter_data_hourly, outdoor_temp, outdoor_dewpoint, outdoor_cloudcover, schedule, months):
    data = meter_data_hourly[meter_data_hourly.index.month.isin(months)]
    data = pd.merge(pd.DataFrame({"energy":data}), outdoor_temp, right_index=True, left_index=True)
    data = pd.merge(data, outdoor_dewpoint, right_index=True, left_index=True)
    data = pd.merge(data, outdoor_cloudcover, right_index=True, left_index=True)
    data = pd.merge(data, pd.get_dummies(schedule), right_index=True, left_index=True)
    
    features = pd.concat((pd.get_dummies(data.index.hour),
                            pd.get_dummies(data.index.dayofweek),
                            data.drop(["energy"], axis=1).reset_index(drop=True)),axis=1)
    
    features = features.ffill().bfill()
    
    # Convert features to numpy array
    labels = data["energy"].values
    features = np.array(features)
    
    return features, labels



In [28]:
# Create train-test indices for the first split
train_features, train_labels = get_features_and_labels(meter_data_hourly, outdoor_temp, outdoor_dewpoint, outdoor_cloudcover, schedule_data, train_index)
test_features, test_labels = get_features_and_labels(meter_data_hourly, outdoor_temp, outdoor_dewpoint, outdoor_cloudcover, schedule_data, test_index)

## 3. Training models

In [34]:
# Train a dummy regressor as a baseline model
testmodel = DummyRegressor()
testmodel.fit(train_features, train_labels)

# Predict using the trained model
predictions = testmodel.predict(test_features)

# Evaluate the model's performance
r2 = r2_score(test_labels, predictions)
print(f"R^2 Score: {r2}")

# Calculate the metrics for the model
metrics = {
    'R^2 Score': r2,
    'Mean Absolute Error': np.mean(np.abs(test_labels - predictions)),
    'Mean Squared Error': np.mean((test_labels - predictions) ** 2),
    'Root Mean Squared Error': np.sqrt(np.mean((test_labels - predictions) ** 2)),
    'CVRSME': np.sqrt(np.mean((test_labels - predictions) ** 2)) / np.mean(test_labels)
}
# Display the metrics
print("\nModel Metrics:")
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

# Save the model metrics to a CSV file
output_dir = 'data/chapter4/campus-bldg/models'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'model_metrics.csv')
metrics_df = pd.DataFrame(metrics, index=[0])
# Add model name to the metrics DataFrame
metrics_df.insert(0, 'Model', 'DummyRegressor')
# Add site name to the metrics DataFrame
metrics_df.insert(0, 'Site', 'Campus Building')

# Save the metrics DataFrame to a CSV file
metrics_df.to_csv(output_file, index=False)

R^2 Score: -0.0392760366399556

Model Metrics:
R^2 Score: -0.0393
Mean Absolute Error: 7.9542
Mean Squared Error: 104.7918
Root Mean Squared Error: 10.2368
CVRSME: 0.2452


In [35]:
## Loop through training data
index = 0
# Loop through each train-test split
for train_index, test_index in train_test_lists:
    # Get features and labels for training and testing data
    train_features, train_labels = get_features_and_labels(meter_data_hourly, outdoor_temp, outdoor_dewpoint, outdoor_cloudcover, schedule_data, train_index)
    test_features, test_labels = get_features_and_labels(meter_data_hourly, outdoor_temp, outdoor_dewpoint, outdoor_cloudcover, schedule_data, test_index)
    
    # Train a dummy regressor as a baseline model
    testmodel = DummyRegressor()
    testmodel.fit(train_features, train_labels)
    
    # Predict using the trained model
    predictions = testmodel.predict(test_features)
    
    # Evaluate the model's performance
    r2 = r2_score(test_labels, predictions)
    print(f"R^2 Score for train-test split {train_index} - {test_index}: {r2}")

    # Calculate the metrics for the model
    metrics = {
        'R^2 Score': r2,
        'Mean Absolute Error': np.mean(np.abs(test_labels - predictions)),
        'Mean Squared Error': np.mean((test_labels - predictions) ** 2),
        'Root Mean Squared Error': np.sqrt(np.mean((test_labels - predictions) ** 2)),
        'CVRSME': np.sqrt(np.mean((test_labels - predictions) ** 2)) / np.mean(test_labels)
    }

    # Save the model metrics to a CSV file
    output_file = os.path.join(output_dir, f'model_metrics_split_validation_{index}.csv')
    metrics_df = pd.DataFrame(metrics, index=[0])
    # Add model name to the metrics DataFrame
    metrics_df.insert(0, 'Model', 'DummyRegressor')
    # Add site name to the metrics DataFrame
    metrics_df.insert(0, 'Site', 'Campus Building')
    metrics_df.to_csv(output_file, index=False)

    # Increment the index for the next split
    index += 1

R^2 Score for train-test split [5 6 7] - [ 8  9 10]: -0.0030925894992683833
R^2 Score for train-test split [ 5  6  7  8  9 10] - [11 12  1]: -0.7170448993234819
R^2 Score for train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: -2.2216996262480748e-05
R^2 Score for train-test split [ 5  6  7  9 10 11  1  2  3] - [ 9 12  4]: -0.0392760366399556


In [41]:
# Make array of models. Each model is an array of two elements.
# First element is a model-name, second is the model parameters
models = [['RandomForestRegressor', RandomForestRegressor(n_estimators = 1000, random_state = 42)],
['AdaBoostRegressor', AdaBoostRegressor(n_estimators = 1000, random_state = 42)],
['BaggingRegressor', BaggingRegressor(n_estimators = 1000, random_state = 42)],
['DecisionTreeRegressor', DecisionTreeRegressor(random_state = 42)],
['DummyRegressor', DummyRegressor()],
['ExtraTreeRegressor', ExtraTreeRegressor(random_state = 42)],
['ExtraTreesRegressor', ExtraTreesRegressor(n_estimators = 1000, random_state = 42)],
['GaussianProcessRegressor', GaussianProcessRegressor(random_state = 42)],
['GradientBoostingRegressor', GradientBoostingRegressor(n_estimators = 1000, random_state = 42)],
['HuberRegressor', HuberRegressor()],
['KNeighborsRegressor', KNeighborsRegressor()],
['MLPRegressor', MLPRegressor(random_state = 42)],
['PassiveAggressiveRegressor', PassiveAggressiveRegressor(random_state = 42)],
['RANSACRegressor', RANSACRegressor(random_state = 42)],
['SGDRegressor', SGDRegressor(random_state = 42)],
['TheilSenRegressor', TheilSenRegressor(random_state = 42)]
]

In [42]:
# Loop through each train-test split
index = 0
# Loop through each train-test split
for train_index, test_index in train_test_lists:
    # Print the current train-test split indices
    print(f"\nTrain-Test Split {index}:")

    # Define an empty DataFrame to store all metrics for each model
    metrics_df_all = pd.DataFrame()

    # Get features and labels for training and testing data
    train_features, train_labels = get_features_and_labels(meter_data_hourly, outdoor_temp, outdoor_dewpoint, outdoor_cloudcover, schedule_data, train_index)
    test_features, test_labels = get_features_and_labels(meter_data_hourly, outdoor_temp, outdoor_dewpoint, outdoor_cloudcover, schedule_data, test_index)
    
    # Loop through each model in the models array
    for model_name, model in models:
        print(f"\nTraining model: {model_name}")
        # Train the model
        model.fit(train_features, train_labels)
        
        # Predict using the trained model
        predictions = model.predict(test_features)
        
        # Evaluate the model's performance
        r2 = r2_score(test_labels, predictions)
        print(f"R^2 Score for {model_name} on train-test split {train_index} - {test_index}: {r2}")

        # Calculate the metrics for the model
        metrics = {
            'R^2 Score': r2,
            'Mean Absolute Error': np.mean(np.abs(test_labels - predictions)),
            'Mean Squared Error': np.mean((test_labels - predictions) ** 2),
            'Root Mean Squared Error': np.sqrt(np.mean((test_labels - predictions) ** 2)),
            'CVRSME': np.sqrt(np.mean((test_labels - predictions) ** 2)) / np.mean(test_labels)
        }

        # Save the model metrics to a CSV file
        # output_file = os.path.join(output_dir, f'model_metrics_{model_name}_split_validation_{index}.csv')
        metrics_df = pd.DataFrame(metrics, index=[0])
        # Add model name to the metrics DataFrame
        metrics_df.insert(0, 'Model', model_name)
        # Add site name to the metrics DataFrame
        metrics_df.insert(0, 'Site', 'Campus Building')
        # metrics_df.to_csv(output_file, index=False)

        # Append the metrics DataFrame for the current model to the all metrics DataFrame
        # This will accumulate metrics for all models across all splits
        if metrics_df_all.empty:
            metrics_df_all = metrics_df
        else:
            # Concatenate the current model's metrics DataFrame with the all metrics DataFrame
            metrics_df_all = pd.concat([metrics_df_all, metrics_df])

    # Save all model metrics to a single CSV file
    all_metrics_output_file = os.path.join(output_dir, f'predict_metrics_all_model_validation_{index}.csv')
    metrics_df_all.to_csv(all_metrics_output_file, index=False)
    print(f"All model metrics saved to {all_metrics_output_file}")

    # Increment the index for the next split
    index += 1


Train-Test Split 0:

Training model: RandomForestRegressor
R^2 Score for RandomForestRegressor on train-test split [5 6 7] - [ 8  9 10]: 0.2064117851544479

Training model: AdaBoostRegressor
R^2 Score for AdaBoostRegressor on train-test split [5 6 7] - [ 8  9 10]: 0.1722833675949026

Training model: BaggingRegressor
R^2 Score for BaggingRegressor on train-test split [5 6 7] - [ 8  9 10]: 0.20683288849167225

Training model: DecisionTreeRegressor
R^2 Score for DecisionTreeRegressor on train-test split [5 6 7] - [ 8  9 10]: 0.06380737690898985

Training model: DummyRegressor
R^2 Score for DummyRegressor on train-test split [5 6 7] - [ 8  9 10]: -0.0030925894992683833

Training model: ExtraTreeRegressor
R^2 Score for ExtraTreeRegressor on train-test split [5 6 7] - [ 8  9 10]: -0.117488537124389

Training model: ExtraTreesRegressor
R^2 Score for ExtraTreesRegressor on train-test split [5 6 7] - [ 8  9 10]: 0.03742913840212991

Training model: GaussianProcessRegressor
R^2 Score for Gaussi

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


R^2 Score for KNeighborsRegressor on train-test split [5 6 7] - [ 8  9 10]: 0.11775927628922422

Training model: MLPRegressor




R^2 Score for MLPRegressor on train-test split [5 6 7] - [ 8  9 10]: 0.03906483705421382

Training model: PassiveAggressiveRegressor
R^2 Score for PassiveAggressiveRegressor on train-test split [5 6 7] - [ 8  9 10]: -0.1966216868383821

Training model: RANSACRegressor
R^2 Score for RANSACRegressor on train-test split [5 6 7] - [ 8  9 10]: -0.04693238409925149

Training model: SGDRegressor
R^2 Score for SGDRegressor on train-test split [5 6 7] - [ 8  9 10]: -8340323440882.027

Training model: TheilSenRegressor
R^2 Score for TheilSenRegressor on train-test split [5 6 7] - [ 8  9 10]: 0.11898717383519997
All model metrics saved to data/chapter4/campus-bldg/models/predict_metrics_all_model_validation_0.csv

Train-Test Split 1:

Training model: RandomForestRegressor
R^2 Score for RandomForestRegressor on train-test split [ 5  6  7  8  9 10] - [11 12  1]: -0.6750791146949848

Training model: AdaBoostRegressor
R^2 Score for AdaBoostRegressor on train-test split [ 5  6  7  8  9 10] - [11 12  1

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


R^2 Score for KNeighborsRegressor on train-test split [ 5  6  7  8  9 10] - [11 12  1]: -0.8819406995244847

Training model: MLPRegressor




R^2 Score for MLPRegressor on train-test split [ 5  6  7  8  9 10] - [11 12  1]: -1.1459528148159812

Training model: PassiveAggressiveRegressor
R^2 Score for PassiveAggressiveRegressor on train-test split [ 5  6  7  8  9 10] - [11 12  1]: -0.45630885762886697

Training model: RANSACRegressor
R^2 Score for RANSACRegressor on train-test split [ 5  6  7  8  9 10] - [11 12  1]: -1.0325826601774435

Training model: SGDRegressor




R^2 Score for SGDRegressor on train-test split [ 5  6  7  8  9 10] - [11 12  1]: -7247751.137543494

Training model: TheilSenRegressor
R^2 Score for TheilSenRegressor on train-test split [ 5  6  7  8  9 10] - [11 12  1]: -1.3331760956757264
All model metrics saved to data/chapter4/campus-bldg/models/predict_metrics_all_model_validation_1.csv

Train-Test Split 2:

Training model: RandomForestRegressor
R^2 Score for RandomForestRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: 0.16258099379039515

Training model: AdaBoostRegressor
R^2 Score for AdaBoostRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: 0.13027322864320523

Training model: BaggingRegressor
R^2 Score for BaggingRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: 0.16180994173302077

Training model: DecisionTreeRegressor
R^2 Score for DecisionTreeRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: -0.6458984753718537

Training model: DummyRegressor
R

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


R^2 Score for KNeighborsRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: 0.021049804594017507

Training model: MLPRegressor




R^2 Score for MLPRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: 0.3492834170699862

Training model: PassiveAggressiveRegressor
R^2 Score for PassiveAggressiveRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: 0.2702021476289409

Training model: RANSACRegressor
R^2 Score for RANSACRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: -0.22941448920237173

Training model: SGDRegressor
R^2 Score for SGDRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: -13008.266753595699

Training model: TheilSenRegressor
R^2 Score for TheilSenRegressor on train-test split [ 5  6  7  8  9 10 11 12  1] - [2 3 4]: 0.26698084279764644
All model metrics saved to data/chapter4/campus-bldg/models/predict_metrics_all_model_validation_2.csv

Train-Test Split 3:

Training model: RandomForestRegressor
R^2 Score for RandomForestRegressor on train-test split [ 5  6  7  9 10 11  1  2  3] - [ 9 12  4]: 0.3804353686442362

Training model: AdaB

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


R^2 Score for KNeighborsRegressor on train-test split [ 5  6  7  9 10 11  1  2  3] - [ 9 12  4]: 0.20162844479815822

Training model: MLPRegressor




R^2 Score for MLPRegressor on train-test split [ 5  6  7  9 10 11  1  2  3] - [ 9 12  4]: 0.24085793520781784

Training model: PassiveAggressiveRegressor
R^2 Score for PassiveAggressiveRegressor on train-test split [ 5  6  7  9 10 11  1  2  3] - [ 9 12  4]: 0.0085278760477977

Training model: RANSACRegressor
R^2 Score for RANSACRegressor on train-test split [ 5  6  7  9 10 11  1  2  3] - [ 9 12  4]: -0.33034710168355597

Training model: SGDRegressor
R^2 Score for SGDRegressor on train-test split [ 5  6  7  9 10 11  1  2  3] - [ 9 12  4]: -19460.586543522564

Training model: TheilSenRegressor
R^2 Score for TheilSenRegressor on train-test split [ 5  6  7  9 10 11  1  2  3] - [ 9 12  4]: 0.23505993916841272
All model metrics saved to data/chapter4/campus-bldg/models/predict_metrics_all_model_validation_3.csv
