In [2]:
import os

import pandas as pd

# Read in the csv.
# The location defaults to the original author's local copy; set the
# REFIT_HOUSE_CSV environment variable to point at the file when running
# this notebook on a different machine.
house_path = os.environ.get(
    'REFIT_HOUSE_CSV',
    'C:\\Users\\nicho\\OneDrive - The University of Western Ontario\\Ecolux\\Databases\\REFIT\\Regression Training Set\\house5.csv',
)
house_df = pd.read_csv(house_path)

In [3]:
# Converting the time to int with error handling
def convert_time_to_seconds(time_str):
    """Convert an ``hours:minutes:seconds`` string to a total number of seconds.

    Parameters
    ----------
    time_str : str or int
        Time formatted as three integer fields separated by ``:``. A value
        that is already an integer is returned unchanged, so applying the
        conversion to an already-converted column does not corrupt it.

    Returns
    -------
    int
        ``hours * 3600 + minutes * 60 + seconds``, or ``0`` when the value
        cannot be parsed (wrong number of fields, a non-numeric field, or a
        non-string value such as ``None``/``NaN``).
    """
    import numbers  # local import keeps this cell runnable on its own

    # Already converted (e.g. the cell was executed twice): pass through.
    if isinstance(time_str, numbers.Integral):
        return int(time_str)

    try:
        hours, minutes, seconds = [int(part) for part in time_str.split(':')]
    except (ValueError, AttributeError):
        # ValueError: malformed string or wrong field count.
        # AttributeError: value has no .split (None, float NaN, ...).
        return 0
    return hours * 3600 + minutes * 60 + seconds

# Replace each Time entry with its total-seconds value, element by element.
house_df['Time'] = house_df['Time'].map(convert_time_to_seconds)

In [12]:
# Creating the x and y inputs.
# The first column is dropped into a NEW frame instead of with inplace=True:
# the in-place version removed one more column from house_df every time this
# cell was re-run, silently changing the feature set between runs.
model_df = house_df.drop(columns=house_df.columns[:1])
target_cols = ['HVAC', 'AlwaysOn', 'Intermit']
X = model_df.drop(columns=target_cols)
y = model_df[target_cols].copy()

# Determining sums: express each category as a fraction of the 'Total' column.
# NOTE(review): rows where Total == 0 would produce inf/NaN fractions here —
# confirm the data contains none before fitting.
total_energy = X['Total']
for target_col in target_cols:
    y[f'{target_col}_frac'] = y[target_col] / total_energy

# New y with fractions (column order: AlwaysOn, Intermit, HVAC — every
# per-output metric printed later follows this order)
y_frac = y[['AlwaysOn_frac', 'Intermit_frac', 'HVAC_frac']]

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import ElasticNet

# Hold out 20% of the rows for evaluation; X is the feature frame and y_frac
# holds the three per-category fraction targets.
X_train, X_test, y_train, y_test = train_test_split(X, y_frac, test_size=0.2, random_state=42)

# y testing data for usage amounts.
# Convert the Series to a NumPy array before adding the broadcast axis:
# indexing a pandas Series with [:, None] was deprecated and raises in
# pandas >= 2.0.
total_energy_test = X_test['Total']
y_test_actuals = y_test * total_energy_test.to_numpy()[:, None]

In [14]:
# Build the multi-output ElasticNet wrapper, then train it on the fraction targets
linear_model = MultiOutputRegressor(ElasticNet(random_state=42))
linear_model.fit(X_train, y_train)

# Predicted fractions for the held-out rows
y_pred_fracs_linear = linear_model.predict(X_test)

# Scale each row's predicted fractions by that row's total to get energy values
y_pred_actuals_linear = y_pred_fracs_linear * total_energy_test.values.reshape(-1, 1)

In [15]:
# Calculate performance metrics
from sklearn.metrics import mean_squared_error, r2_score

# Score each output column separately on the energy-scale predictions
mse, r2 = (
    metric(y_test_actuals, y_pred_actuals_linear, multioutput='raw_values')
    for metric in (mean_squared_error, r2_score)
)

print("MSE per output:", mse)
print("R^2 per output:", r2)

MSE per output: [ 10682.51675107 212137.97634034 141786.21964131]
R^2 per output: [-8.3082404  -0.17197638  0.24855303]


In [None]:
# Now I'm going to try random forest
from sklearn.ensemble import RandomForestRegressor

# Initialize and fit the model (random forest wrapped for multiple outputs)
forest_model = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, random_state=42)).fit(X_train, y_train)

# Predict the fractions
y_pred_fracs_forest = forest_model.predict(X_test)

# Convert predictions back to energy values.
# The Series is converted to a NumPy array first: Series[:, None] was
# deprecated and raises in pandas >= 2.0.
y_pred_actuals_forest = y_pred_fracs_forest * total_energy_test.to_numpy()[:, None]

In [None]:
# Calculate performance metrics
from sklearn.metrics import mean_squared_error, r2_score

# Per-output error and fit quality for the random-forest predictions
mse, r2 = [
    score_fn(y_test_actuals, y_pred_actuals_forest, multioutput='raw_values')
    for score_fn in [mean_squared_error, r2_score]
]

print("MSE per output:", mse)
print("R^2 per output:", r2)

In [10]:
# Now I'm going to try gradient boosting
from xgboost import XGBRegressor

# Initialize and fit the model (XGBoost regressor wrapped for multiple outputs)
boost_model = MultiOutputRegressor(XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)).fit(X_train, y_train)

# Predict the fractions
y_pred_fracs_boost = boost_model.predict(X_test)

# Convert predictions back to energy values.
# The Series is converted to a NumPy array first: Series[:, None] was
# deprecated and raises in pandas >= 2.0.
y_pred_actuals_boost = y_pred_fracs_boost * total_energy_test.to_numpy()[:, None]

In [11]:
# Calculate performance metrics
from sklearn.metrics import mean_squared_error, r2_score

# Evaluate the gradient-boosting predictions, one score per output column
mse = mean_squared_error(
    y_true=y_test_actuals,
    y_pred=y_pred_actuals_boost,
    multioutput='raw_values',
)
r2 = r2_score(
    y_true=y_test_actuals,
    y_pred=y_pred_actuals_boost,
    multioutput='raw_values',
)

print("MSE per output:", mse)
print("R^2 per output:", r2)

MSE per output: [33623.66344407   994.7341691  33044.27320436]
R^2 per output: [0.82179932 0.13323564 0.81744378]
