In [1]:
import pandas as pd
# Show every column when a wide frame is displayed.
pd.options.display.max_columns = None

# Load the dataset and discard the 'Unnamed: 0' column
# (presumably a row index written by an earlier to_csv — verify).
df = pd.read_csv("total_with_new_outputs_updated_equipment.csv").drop(
    columns=['Unnamed: 0']
)

# Summarise column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3718 entries, 0 to 3717
Data columns (total 37 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   U-Factor Wall [Btu/hr-ft2-°F]   3718 non-null   float64
 1   U-Factor Roof [Btu/hr-ft2-°F]   3718 non-null   float64
 2   Net Wall Area [ft2]             3718 non-null   float64
 3   Net Roof Area [ft2]             3718 non-null   float64
 4   Glass U-Factor [Btu/hr-ft2-°F]  3718 non-null   float64
 5   Glass SHGC                      3718 non-null   float64
 6   Infiltration (cfm/ft2)          3718 non-null   float64
 7   Window-Wall Ratio               3718 non-null   float64
 8   Plug and Process [W/ft2]        3718 non-null   float64
 9   Lighting [W/ft2]                3718 non-null   float64
 10  Building Area [ft2]             3718 non-null   float64
 11  vpz (cfm)                       3718 non-null   float64
 12  vot (cfm)                       37

In [2]:
import pandas as pd

# Requires `df` as loaded by the first cell.

# Features for Subset A (HVAC system flags and one-hot building types)
features_a = [
    'Chiller', 'Boiler', 'DX Cooling', 'DX Heating', 'Building Type_Hospital', 
    'Building Type_Lodging', 'Building Type_Office', 'Building Type_Restaurant',
    'Building Type_Retail', 'Building Type_School', 'Building Type_Warehouse'
]

# Features for Subset B (continuous envelope/load variables and one-hot climate categories)
features_b = [
    'U-Factor Wall [Btu/hr-ft2-°F]', 'U-Factor Roof [Btu/hr-ft2-°F]', 
    'Net Wall Area [ft2]', 'Net Roof Area [ft2]', 'Glass U-Factor [Btu/hr-ft2-°F]', 
    'Glass SHGC', 'Infiltration (cfm/ft2)', 'Window-Wall Ratio', 
    'Plug and Process [W/ft2]', 'Lighting [W/ft2]', 'Building Area [ft2]', 
    'vpz (cfm)', 'vot (cfm)', 
    'Climate Category_cool', 'Climate Category_hot', 'Climate Category_temperate'
]

# Target variables (end-use energy columns predicted by both models)
targets = [
    'Heating (kBtu/ft2)', 'Cooling (kBtu/ft2)', 'Fans (kBtu/ft2)', 
    'Pumps (kBtu/ft2)', 'Heat Rejection (kBtu/ft2)','Total Lighting (kBtu/ft2)','Total Equipment (kBtu/ft2)'
]

# Final column layout: row id, Subset A features, Subset B features, targets
new_column_order = ['Index'] + features_a + features_b + targets

# Expose the row position as an explicit "Index" column.
# Dropping any existing 'Index' first keeps the cell safe to re-run: without it,
# a second run would add a duplicate 'Index' column.
df = (
    df.drop(columns=['Index'], errors='ignore')
      .reset_index()
      .rename(columns={'index': 'Index'})
)

# Fail fast on a misspelled or missing column. reindex(columns=...) would
# silently create an all-NaN column instead of reporting the problem.
missing_columns = [col for col in new_column_order if col not in df.columns]
if missing_columns:
    raise KeyError(f"Columns missing from the dataset: {missing_columns}")

# Keep only the listed columns, in the listed order
df = df[new_column_order]

# Display the first few rows of the reordered DataFrame
df.head()


Unnamed: 0,Index,Chiller,Boiler,DX Cooling,DX Heating,Building Type_Hospital,Building Type_Lodging,Building Type_Office,Building Type_Restaurant,Building Type_Retail,Building Type_School,Building Type_Warehouse,U-Factor Wall [Btu/hr-ft2-°F],U-Factor Roof [Btu/hr-ft2-°F],Net Wall Area [ft2],Net Roof Area [ft2],Glass U-Factor [Btu/hr-ft2-°F],Glass SHGC,Infiltration (cfm/ft2),Window-Wall Ratio,Plug and Process [W/ft2],Lighting [W/ft2],Building Area [ft2],vpz (cfm),vot (cfm),Climate Category_cool,Climate Category_hot,Climate Category_temperate,Heating (kBtu/ft2),Cooling (kBtu/ft2),Fans (kBtu/ft2),Pumps (kBtu/ft2),Heat Rejection (kBtu/ft2),Total Lighting (kBtu/ft2),Total Equipment (kBtu/ft2)
0,0,1,1,0,1,0,0,0,0,0,1,0,0.272,0.204,4770.95,7105.24,0.455905,0.526942,0.10745,26.08,1.099684,0.992195,184254.072142,56864.170672,35988.964912,1,0,0,10.948703,1.693893,2.469517,1.23998,0.227522,8.949458,7.371101
1,1,0,0,1,1,0,0,0,0,0,1,0,0.221042,0.151064,4770.95,7105.24,0.35714,0.328865,0.10745,26.08,1.099684,0.355772,184254.072142,34496.213952,34496.213952,1,0,0,5.28559,0.774698,1.399652,1.835715,0.0,3.387683,7.371101
2,2,0,0,1,1,0,0,0,0,0,1,0,0.221042,0.151064,4770.95,7105.24,0.356154,0.320267,0.10745,26.08,1.099684,0.355772,184254.072142,34496.213952,34496.213952,1,0,0,5.248758,0.763844,1.399189,1.822649,0.0,3.387889,7.371101
3,3,0,0,1,1,0,0,0,0,0,1,0,0.619,0.22,1756.82,4238.89,0.078116,0.793355,0.099661,27.73,0.76134,0.990002,59812.193686,96468.36864,20940.043488,1,0,0,21.100159,7.12968,17.022207,0.047381,0.0,6.662683,8.926996
4,4,0,0,1,1,0,0,0,0,0,1,0,0.619,0.22,1756.82,4238.89,0.078116,0.793355,0.099661,27.73,0.76134,0.990002,59812.193686,88931.936256,20940.043488,1,0,0,19.92688,5.545659,13.071028,0.047381,0.0,6.662683,8.926996


In [3]:
import pandas as pd

# Fixed seed so the hold-out split (and every metric computed on it)
# is identical on each Restart & Run All.
random_seed = 42

# Hold out 50 randomly chosen rows as the final test set
df_test = df.sample(n=50, random_state=random_seed)

# Everything not sampled above stays available for training / cross-validation
df_remaining = df.drop(df_test.index)

# Display the shape of df_test for verification
print("Shape of df_test:", df_test.shape)


Shape of df_test: (50, 35)


In [4]:
# Size of the training pool left after removing the sampled test rows
df_remaining.shape

(3668, 35)

In [5]:
# Size of the full dataset, for comparison with the train/test split
df.shape

(3718, 35)

In [6]:
import pandas as pd

# Assuming your DataFrame is named 'df'

# Subset A inputs: HVAC system flags and one-hot building types
features_a = [
    'Chiller',
    'Boiler',
    'DX Cooling',
    'DX Heating',
    'Building Type_Hospital',
    'Building Type_Lodging',
    'Building Type_Office',
    'Building Type_Restaurant',
    'Building Type_Retail',
    'Building Type_School',
    'Building Type_Warehouse',
]

# Subset B inputs: continuous variables and one-hot climate categories
features_b = [
    'U-Factor Wall [Btu/hr-ft2-°F]',
    'U-Factor Roof [Btu/hr-ft2-°F]',
    'Net Wall Area [ft2]',
    'Net Roof Area [ft2]',
    'Glass U-Factor [Btu/hr-ft2-°F]',
    'Glass SHGC',
    'Infiltration (cfm/ft2)',
    'Window-Wall Ratio',
    'Plug and Process [W/ft2]',
    'Lighting [W/ft2]',
    'Building Area [ft2]',
    'vpz (cfm)',
    'vot (cfm)',
    'Climate Category_cool',
    'Climate Category_hot',
    'Climate Category_temperate',
]

# Columns both models are trained to predict
targets = [
    'Heating (kBtu/ft2)',
    'Cooling (kBtu/ft2)',
    'Fans (kBtu/ft2)',
    'Pumps (kBtu/ft2)',
    'Heat Rejection (kBtu/ft2)',
    'Total Lighting (kBtu/ft2)',
    'Total Equipment (kBtu/ft2)',
]

# Each subset pairs its own inputs with the shared targets, taken from the
# training pool only. reset_index() moves the surviving df_remaining row labels
# into an 'index' column and renumbers the rows from 0.
subset_a = df_remaining[features_a + targets].reset_index()
subset_b = df_remaining[features_b + targets].reset_index()

# subset_a and subset_b now feed two separately trained models


In [7]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.metrics import r2_score, mean_squared_error

# Requires subset_a, subset_b and targets from the cells above.
# Each subset holds its features, the targets, and the 'index' column added by
# reset_index(); everything except the targets and 'index' is a model input.

# Subset A: inputs and outputs
X_a = subset_a.drop(columns=[*targets, 'index'])
y_a = subset_a[targets]

# Subset B: inputs and outputs
X_b = subset_b.drop(columns=[*targets, 'index'])
y_b = subset_b[targets]

# Train on log(1 + y); predictions are mapped back later with np.expm1
y_a_transformed, y_b_transformed = np.log1p(y_a), np.log1p(y_b)

def sanitize_col_names(cols):
    """Return identifier-friendly versions of the given column names.

    Each name is converted with ``str()`` and every character that is not
    alphanumeric (per ``str.isalnum``) is replaced by an underscore.

    Parameters
    ----------
    cols : iterable
        Column labels; non-string labels are stringified first.

    Returns
    -------
    list of str
        One sanitized name per input label, in the same order.
    """
    cleaned = []
    for col in cols:
        cleaned.append("".join(ch if ch.isalnum() else "_" for ch in str(col)))
    return cleaned
# Rename the feature columns to identifier-safe names. The fitted models
# remember these names, so prediction inputs must use them too.
X_a.columns = sanitize_col_names(X_a.columns)
X_b.columns = sanitize_col_names(X_b.columns)

# One forest per subset. A fixed seed makes the forests' randomised fitting
# repeatable, so CV scores and predictions do not drift between runs.
RANDOM_STATE = 42
rf_regressor_a = RandomForestRegressor(random_state=RANDOM_STATE)
rf_regressor_b = RandomForestRegressor(random_state=RANDOM_STATE)

def cross_validate(model, X, y, kf):
    """Score `model` fold by fold and summarise R^2 and MSE.

    For every (train, test) index pair yielded by ``kf.split(X)`` the model is
    refitted on the training rows and scored on the held-out rows. Scores are
    computed on `y` exactly as passed in, so a transformed target gives
    transformed-scale metrics.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place and is left fitted on the last fold's training rows.
    X, y : objects supporting positional ``.iloc`` indexing
        Features and targets with matching row order.
    kf : object
        Splitter exposing ``split(X)`` that yields positional index arrays.

    Returns
    -------
    dict
        Keys "Mean R^2", "Std R^2", "Mean MSE" and "Std MSE", taken across folds.
    """
    fold_metrics = []

    for fit_rows, eval_rows in kf.split(X):
        model.fit(X.iloc[fit_rows], y.iloc[fit_rows])

        held_out_truth = y.iloc[eval_rows]
        held_out_pred = model.predict(X.iloc[eval_rows])

        fold_metrics.append((
            r2_score(held_out_truth, held_out_pred),
            mean_squared_error(held_out_truth, held_out_pred),
        ))

    r2_per_fold = [r2 for r2, _ in fold_metrics]
    mse_per_fold = [mse for _, mse in fold_metrics]

    summary = {}
    summary["Mean R^2"] = np.mean(r2_per_fold)
    summary["Std R^2"] = np.std(r2_per_fold)
    summary["Mean MSE"] = np.mean(mse_per_fold)
    summary["Std MSE"] = np.std(mse_per_fold)
    return summary

# 5-fold cross-validation with shuffled rows. The seed pins the fold
# assignment so the reported scores can be reproduced and compared across runs.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Score each subset's forest against its log1p-transformed targets;
# the metrics below are therefore on the log1p scale.
results = {
    "Random Forest on Subset A": cross_validate(rf_regressor_a, X_a, y_a_transformed, kf),
    "Random Forest on Subset B": cross_validate(rf_regressor_b, X_b, y_b_transformed, kf)
}

results




{'Random Forest on Subset A': {'Mean R^2': 0.6857335758693945,
  'Std R^2': 0.011460351055352072,
  'Mean MSE': 0.2955180345431576,
  'Std MSE': 0.011908890125962126},
 'Random Forest on Subset B': {'Mean R^2': 0.9572389540213931,
  'Std R^2': 0.004249838119577103,
  'Mean MSE': 0.03771675799591377,
  'Std MSE': 0.002274753015162408}}

In [8]:
# Refit the Subset A forest on all of Subset A (cross-validation left it
# fitted on the last fold's training rows only).
rf_regressor_a.fit(X_a, y_a_transformed)

# Refit the Subset B forest on all of Subset B, for the same reason.
# Both models are trained on log1p-transformed targets.
rf_regressor_b.fit(X_b, y_b_transformed)


In [9]:
# Sanitized feature names the Subset A model was fitted with
X_a.columns

Index(['Chiller', 'Boiler', 'DX_Cooling', 'DX_Heating',
       'Building_Type_Hospital', 'Building_Type_Lodging',
       'Building_Type_Office', 'Building_Type_Restaurant',
       'Building_Type_Retail', 'Building_Type_School',
       'Building_Type_Warehouse'],
      dtype='object')

In [10]:
# Target names for Subset A (these were not sanitized)
y_a.columns

Index(['Heating (kBtu/ft2)', 'Cooling (kBtu/ft2)', 'Fans (kBtu/ft2)',
       'Pumps (kBtu/ft2)', 'Heat Rejection (kBtu/ft2)',
       'Total Lighting (kBtu/ft2)', 'Total Equipment (kBtu/ft2)'],
      dtype='object')

In [11]:
# Sanitized feature names the Subset B model was fitted with
X_b.columns

Index(['U_Factor_Wall__Btu_hr_ft2__F_', 'U_Factor_Roof__Btu_hr_ft2__F_',
       'Net_Wall_Area__ft2_', 'Net_Roof_Area__ft2_',
       'Glass_U_Factor__Btu_hr_ft2__F_', 'Glass_SHGC',
       'Infiltration__cfm_ft2_', 'Window_Wall_Ratio',
       'Plug_and_Process__W_ft2_', 'Lighting__W_ft2_', 'Building_Area__ft2_',
       'vpz__cfm_', 'vot__cfm_', 'Climate_Category_cool',
       'Climate_Category_hot', 'Climate_Category_temperate'],
      dtype='object')

In [12]:
# Target names for Subset B (these were not sanitized)
y_b.columns

Index(['Heating (kBtu/ft2)', 'Cooling (kBtu/ft2)', 'Fans (kBtu/ft2)',
       'Pumps (kBtu/ft2)', 'Heat Rejection (kBtu/ft2)',
       'Total Lighting (kBtu/ft2)', 'Total Equipment (kBtu/ft2)'],
      dtype='object')

In [13]:
# Importances reported by each fitted forest, one value per input column
feature_importances_a = rf_regressor_a.feature_importances_
feature_importances_b = rf_regressor_b.feature_importances_

# Pair every feature name with its importance, most influential first.
# The row labels keep each feature's original column position.
importances_a = pd.DataFrame(
    {'Feature': X_a.columns, 'Importance': feature_importances_a}
).sort_values('Importance', ascending=False)

importances_b = pd.DataFrame(
    {'Feature': X_b.columns, 'Importance': feature_importances_b}
).sort_values('Importance', ascending=False)

# Show the Subset A ranking
importances_a

Unnamed: 0,Feature,Importance
7,Building_Type_Restaurant,0.396411
4,Building_Type_Hospital,0.189929
1,Boiler,0.113357
10,Building_Type_Warehouse,0.076943
8,Building_Type_Retail,0.058624
6,Building_Type_Office,0.044181
9,Building_Type_School,0.03814
2,DX_Cooling,0.033144
3,DX_Heating,0.02296
0,Chiller,0.020116


In [14]:
# Show the Subset B feature-importance ranking
importances_b

Unnamed: 0,Feature,Importance
10,Building_Area__ft2_,0.270694
5,Glass_SHGC,0.230073
8,Plug_and_Process__W_ft2_,0.137896
6,Infiltration__cfm_ft2_,0.075174
9,Lighting__W_ft2_,0.05387
11,vpz__cfm_,0.050259
7,Window_Wall_Ratio,0.035187
13,Climate_Category_cool,0.02726
12,vot__cfm_,0.021808
15,Climate_Category_temperate,0.020726


In [15]:
# Column names, non-null counts and dtypes of Subset A (including the 'index' column)
subset_a.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3668 entries, 0 to 3667
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   index                       3668 non-null   int64  
 1   Chiller                     3668 non-null   int64  
 2   Boiler                      3668 non-null   int64  
 3   DX Cooling                  3668 non-null   int64  
 4   DX Heating                  3668 non-null   int64  
 5   Building Type_Hospital      3668 non-null   int64  
 6   Building Type_Lodging       3668 non-null   int64  
 7   Building Type_Office        3668 non-null   int64  
 8   Building Type_Restaurant    3668 non-null   int64  
 9   Building Type_Retail        3668 non-null   int64  
 10  Building Type_School        3668 non-null   int64  
 11  Building Type_Warehouse     3668 non-null   int64  
 12  Heating (kBtu/ft2)          3668 non-null   float64
 13  Cooling (kBtu/ft2)          3668 

In [16]:
# Column names, non-null counts and dtypes of Subset B (including the 'index' column)
subset_b.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3668 entries, 0 to 3667
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   index                           3668 non-null   int64  
 1   U-Factor Wall [Btu/hr-ft2-°F]   3668 non-null   float64
 2   U-Factor Roof [Btu/hr-ft2-°F]   3668 non-null   float64
 3   Net Wall Area [ft2]             3668 non-null   float64
 4   Net Roof Area [ft2]             3668 non-null   float64
 5   Glass U-Factor [Btu/hr-ft2-°F]  3668 non-null   float64
 6   Glass SHGC                      3668 non-null   float64
 7   Infiltration (cfm/ft2)          3668 non-null   float64
 8   Window-Wall Ratio               3668 non-null   float64
 9   Plug and Process [W/ft2]        3668 non-null   float64
 10  Lighting [W/ft2]                3668 non-null   float64
 11  Building Area [ft2]             3668 non-null   float64
 12  vpz (cfm)                       36

In [17]:
# Preview the held-out test rows; they still carry their original row labels here
df_test.head()

Unnamed: 0,Index,Chiller,Boiler,DX Cooling,DX Heating,Building Type_Hospital,Building Type_Lodging,Building Type_Office,Building Type_Restaurant,Building Type_Retail,Building Type_School,Building Type_Warehouse,U-Factor Wall [Btu/hr-ft2-°F],U-Factor Roof [Btu/hr-ft2-°F],Net Wall Area [ft2],Net Roof Area [ft2],Glass U-Factor [Btu/hr-ft2-°F],Glass SHGC,Infiltration (cfm/ft2),Window-Wall Ratio,Plug and Process [W/ft2],Lighting [W/ft2],Building Area [ft2],vpz (cfm),vot (cfm),Climate Category_cool,Climate Category_hot,Climate Category_temperate,Heating (kBtu/ft2),Cooling (kBtu/ft2),Fans (kBtu/ft2),Pumps (kBtu/ft2),Heat Rejection (kBtu/ft2),Total Lighting (kBtu/ft2),Total Equipment (kBtu/ft2)
1599,1599,0,1,1,0,0,0,0,0,0,1,0,0.278,0.182,1584.1,6857.62,0.351917,0.374301,0.259365,35.0,4.798904,0.684686,73958.7569,33860.76184,5265.204912,1,0,0,9.069515,2.065471,3.815937,0.053697,0.0,3.985999,19.258207
2849,2849,0,1,1,1,0,0,1,0,0,0,0,0.29,2.858,217.94,0.0,0.360145,0.394,0.282798,21.2,0.629882,0.738003,5502.075124,3095.895568,467.848704,1,0,0,6.501295,1.533162,3.453921,0.0,0.0,9.279935,9.095611
2082,2082,0,1,1,1,0,0,0,0,0,1,0,1.024288,0.235692,7371.31,12085.78,0.514419,0.458824,0.092772,16.36,0.643567,0.789991,229583.330739,110734.363904,70957.477216,1,0,0,13.913156,4.826745,8.06058,2.277568,0.0,5.975438,5.625265
1065,1065,0,1,1,0,0,0,0,1,0,0,0,0.505,2.858,157.92,0.0,0.610749,0.253,1.085253,14.0,55.31414,1.650004,2500.884526,9219.458768,1202.040624,0,0,1,19.014064,47.80993,61.753072,0.0,0.0,38.839173,285.775662
377,377,0,1,1,0,0,1,0,0,0,0,0,0.586059,0.273,3328.34,1978.84,0.620788,0.253,0.080191,30.16,3.817245,0.999265,122119.567031,53260.16768,0.0,0,0,1,16.677044,18.878871,4.971322,1.63416,0.0,13.650259,35.512839


In [18]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Requires the fitted estimators rf_regressor_a and rf_regressor_b.

# Sanitized column names for Subset A inputs
features_a_updated = [
    'Chiller',
    'Boiler',
    'DX_Cooling',
    'DX_Heating',
    'Building_Type_Hospital',
    'Building_Type_Lodging',
    'Building_Type_Office',
    'Building_Type_Restaurant',
    'Building_Type_Retail',
    'Building_Type_School',
    'Building_Type_Warehouse',
]

# Sanitized column names for Subset B inputs
features_b_updated = [
    'U_Factor_Wall__Btu_hr_ft2__F_',
    'U_Factor_Roof__Btu_hr_ft2__F_',
    'Net_Wall_Area__ft2_',
    'Net_Roof_Area__ft2_',
    'Glass_U_Factor__Btu_hr_ft2__F_',
    'Glass_SHGC',
    'Infiltration__cfm_ft2_',
    'Window_Wall_Ratio',
    'Plug_and_Process__W_ft2_',
    'Lighting__W_ft2_',
    'Building_Area__ft2_',
    'vpz__cfm_',
    'vot__cfm_',
    'Climate_Category_cool',
    'Climate_Category_hot',
    'Climate_Category_temperate',
]

# --- Subset A: one whitespace-separated input row, in features_a_updated order ---
data_str_a = "0	0	1	1	0	0	0	1	0	0	0	"
X_new_a = np.array([list(map(float, data_str_a.split()))])  # shape (1, n_features)
X_new_a_df = pd.DataFrame(X_new_a, columns=features_a_updated)

# The model predicts log1p(target); expm1 maps back to the original scale
predictions_a_transformed = rf_regressor_a.predict(X_new_a_df)
predictions_a = np.expm1(predictions_a_transformed)

# --- Subset B: one whitespace-separated input row, in features_b_updated order ---
data_str_b = "0.03	0.01637	38595.218	58123.88	0.209	0.278	0.1722	35.1	1.039	0.58	33740.087072	32052.564912	7066.105344	1	0	0"
X_new_b = np.array([list(map(float, data_str_b.split()))])  # shape (1, n_features)
X_new_b_df = pd.DataFrame(X_new_b, columns=features_b_updated)

predictions_b_transformed = rf_regressor_b.predict(X_new_b_df)
predictions_b = np.expm1(predictions_b_transformed)

# Blend the two models with a fixed weighted average.
# NOTE(review): the hold-out evaluation cell later in this notebook uses
# 0.3 / 0.7 instead of 0.15 / 0.85 — confirm which weighting is intended.
weight_a = 0.15
weight_b = 0.85
combined_predictions = weight_a * predictions_a + weight_b * predictions_b

# Show the blended prediction (one value per target)
print(combined_predictions)


[[5.32135071e+00 1.92366769e+01 7.60888999e+00 4.35727125e-01
  2.42441080e-02 7.67012325e+00 3.98784601e+01]]


In [19]:
# Subset A model's prediction for the single input row, back on the original scale
predictions_a

array([[0.00000000e+00, 1.14931154e+02, 3.70497002e+01, 4.10686071e-02,
        0.00000000e+00, 2.17581212e+01, 2.07864907e+02]])

In [20]:
# Subset B model's prediction for the single input row, back on the original scale
predictions_b

array([[ 6.2604126 ,  2.34941634,  2.4134529 ,  0.50537275,  0.02852248,
         5.18400596, 10.2337929 ]])

In [21]:
# subset_a.to_csv("dataset_A_for_energy_modelling_with_new_outputs.csv")

In [22]:
# subset_b.to_csv("dataset_B_for_energy_modelling_with_new_outputs.csv")

In [23]:
# Inspect the Subset A rows labelled 800 through 810 (.loc slices by label, end-inclusive)
subset_a.loc[800:810]

Unnamed: 0,index,Chiller,Boiler,DX Cooling,DX Heating,Building Type_Hospital,Building Type_Lodging,Building Type_Office,Building Type_Restaurant,Building Type_Retail,Building Type_School,Building Type_Warehouse,Heating (kBtu/ft2),Cooling (kBtu/ft2),Fans (kBtu/ft2),Pumps (kBtu/ft2),Heat Rejection (kBtu/ft2),Total Lighting (kBtu/ft2),Total Equipment (kBtu/ft2)
800,811,0,1,1,1,0,0,1,0,0,0,0,0.167726,9.690117,1.447853,0.002651,0.0,5.177595,13.463514
801,812,0,1,1,1,0,0,1,0,0,0,0,0.247259,7.655488,1.7066,0.002651,0.0,5.182367,13.463514
802,813,0,1,1,1,0,0,1,0,0,0,0,1.45392,5.732991,5.021535,0.0,0.0,16.50819,9.095611
803,814,0,1,1,1,0,0,1,0,0,0,0,1.634798,7.584844,4.604653,0.0,0.0,16.511636,9.095611
804,815,0,1,1,1,0,0,1,0,0,0,0,9.515939,2.857882,3.950045,0.0,0.0,16.506468,9.095611
805,816,0,1,1,1,0,0,1,0,0,0,0,5.481484,3.879416,4.740743,0.0,0.0,16.506468,9.095611
806,817,0,0,1,0,0,0,1,0,0,0,0,0.0,18.246348,5.197246,0.0,0.0,16.515081,9.095611
807,818,0,1,1,1,0,0,1,0,0,0,0,0.534023,7.06288,4.856161,0.0,0.0,16.509913,9.095611
808,819,0,1,1,1,0,0,1,0,0,0,0,33.8467,1.534884,4.859606,0.0,0.0,16.473737,9.095611
809,820,0,1,1,1,0,0,1,0,0,0,0,11.751943,2.35659,4.527134,0.0,0.0,16.5013,9.095611


In [24]:
# Subset A spot check: one whitespace-separated input row, in features_a_updated order
data_str_a = "0	1	1	1	0	0	1	0	0	0	0"
X_new_a = np.array([list(map(float, data_str_a.split()))])  # shape (1, n_features)
X_new_a_df = pd.DataFrame(X_new_a, columns=features_a_updated)

# The model predicts log1p(target); expm1 maps back to the original scale
predictions_a_transformed = rf_regressor_a.predict(X_new_a_df)
predictions_a = np.expm1(predictions_a_transformed)
predictions_a

array([[3.70842803e+00, 3.41898347e+00, 2.34858275e+00, 1.26034213e-03,
        0.00000000e+00, 9.73463388e+00, 1.16469029e+01]])

In [25]:
# Preview the first rows of Subset A
subset_a.head()

Unnamed: 0,index,Chiller,Boiler,DX Cooling,DX Heating,Building Type_Hospital,Building Type_Lodging,Building Type_Office,Building Type_Restaurant,Building Type_Retail,Building Type_School,Building Type_Warehouse,Heating (kBtu/ft2),Cooling (kBtu/ft2),Fans (kBtu/ft2),Pumps (kBtu/ft2),Heat Rejection (kBtu/ft2),Total Lighting (kBtu/ft2),Total Equipment (kBtu/ft2)
0,0,1,1,0,1,0,0,0,0,0,1,0,10.948703,1.693893,2.469517,1.23998,0.227522,8.949458,7.371101
1,1,0,0,1,1,0,0,0,0,0,1,0,5.28559,0.774698,1.399652,1.835715,0.0,3.387683,7.371101
2,2,0,0,1,1,0,0,0,0,0,1,0,5.248758,0.763844,1.399189,1.822649,0.0,3.387889,7.371101
3,3,0,0,1,1,0,0,0,0,0,1,0,21.100159,7.12968,17.022207,0.047381,0.0,6.662683,8.926996
4,4,0,0,1,1,0,0,0,0,0,1,0,19.92688,5.545659,13.071028,0.047381,0.0,6.662683,8.926996


In [26]:
# Preview the first rows of Subset B
subset_b.head()

Unnamed: 0,index,U-Factor Wall [Btu/hr-ft2-°F],U-Factor Roof [Btu/hr-ft2-°F],Net Wall Area [ft2],Net Roof Area [ft2],Glass U-Factor [Btu/hr-ft2-°F],Glass SHGC,Infiltration (cfm/ft2),Window-Wall Ratio,Plug and Process [W/ft2],Lighting [W/ft2],Building Area [ft2],vpz (cfm),vot (cfm),Climate Category_cool,Climate Category_hot,Climate Category_temperate,Heating (kBtu/ft2),Cooling (kBtu/ft2),Fans (kBtu/ft2),Pumps (kBtu/ft2),Heat Rejection (kBtu/ft2),Total Lighting (kBtu/ft2),Total Equipment (kBtu/ft2)
0,0,0.272,0.204,4770.95,7105.24,0.455905,0.526942,0.10745,26.08,1.099684,0.992195,184254.072142,56864.170672,35988.964912,1,0,0,10.948703,1.693893,2.469517,1.23998,0.227522,8.949458,7.371101
1,1,0.221042,0.151064,4770.95,7105.24,0.35714,0.328865,0.10745,26.08,1.099684,0.355772,184254.072142,34496.213952,34496.213952,1,0,0,5.28559,0.774698,1.399652,1.835715,0.0,3.387683,7.371101
2,2,0.221042,0.151064,4770.95,7105.24,0.356154,0.320267,0.10745,26.08,1.099684,0.355772,184254.072142,34496.213952,34496.213952,1,0,0,5.248758,0.763844,1.399189,1.822649,0.0,3.387889,7.371101
3,3,0.619,0.22,1756.82,4238.89,0.078116,0.793355,0.099661,27.73,0.76134,0.990002,59812.193686,96468.36864,20940.043488,1,0,0,21.100159,7.12968,17.022207,0.047381,0.0,6.662683,8.926996
4,4,0.619,0.22,1756.82,4238.89,0.078116,0.793355,0.099661,27.73,0.76134,0.990002,59812.193686,88931.936256,20940.043488,1,0,0,19.92688,5.545659,13.071028,0.047381,0.0,6.662683,8.926996


In [27]:
# Subset B spot check: one whitespace-separated input row, in features_b_updated order
data_str_b = "0.221042	0.151064	4770.95	7105.24	0.357140	0.328865	0.107450	26.08	1.099684	0.355772	184254.072142	34496.213952	34496.213952	1	0	0"
X_new_b = np.array([list(map(float, data_str_b.split()))])  # shape (1, n_features)
X_new_b_df = pd.DataFrame(X_new_b, columns=features_b_updated)

# The model predicts log1p(target); expm1 maps back to the original scale
predictions_b_transformed = rf_regressor_b.predict(X_new_b_df)
predictions_b = np.expm1(predictions_b_transformed)
predictions_b

array([[5.25863659, 0.76840457, 1.39950627, 1.832508  , 0.        ,
        3.38780226, 7.37110121]])

In [28]:
# Hold-out inputs for each model, selected with the original (unsanitized)
# column names. Explicit copies ensure the renaming below cannot affect df_test.
X_test_a = df_test[features_a].copy()
X_test_b = df_test[features_b].copy()

# Apply the same renaming that was used on the training features, via the
# shared helper, so train and test names cannot drift apart.
X_test_a.columns = sanitize_col_names(X_test_a.columns)
X_test_b.columns = sanitize_col_names(X_test_b.columns)


In [29]:
# Blend weights for the Subset A and Subset B models.
# NOTE(review): the single-row prediction cell earlier in this notebook uses
# 0.15 / 0.85 — confirm which weighting is intended.
weight_a = 0.3
weight_b = 0.7

# Predict the hold-out rows with each model. Both were trained on
# log1p(target), so expm1 maps the outputs back to the original scale.
predictions_a = np.expm1(rf_regressor_a.predict(X_test_a))
predictions_b = np.expm1(rf_regressor_b.predict(X_test_b))

# Weighted sum of the two models' back-transformed predictions
combined_predictions = weight_a * predictions_a + weight_b * predictions_b


In [30]:
# Ground-truth target values for the held-out test rows
y_test_actual  = df_test[targets]  # 'targets' are the target variables


In [31]:
import pandas as pd

# Requires df_test, y_test_actual, targets and combined_predictions
# from the cells above.

# One 'Predicted_<target>' column per target, with a default 0..n-1 index
predictions_df = pd.DataFrame(
    combined_predictions,
    columns=['Predicted_' + t for t in targets],
)

# Renumber the hold-out frames 0..n-1 so they line up row by row with
# predictions_df. Rebinding the names (rather than inplace=True) avoids
# mutating objects that earlier cells created and displayed.
y_test_actual = y_test_actual.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# concat(axis=1) aligns on the index and pads mismatched rows with NaN, so a
# row-count mismatch would go unnoticed. Make it an explicit error instead.
if len(predictions_df) != len(df_test):
    raise ValueError(
        f"Got {len(predictions_df)} prediction rows for {len(df_test)} test rows"
    )

# df_test already contains the actual target columns, so placing the
# predictions next to it gives actual vs. predicted side by side.
full_comparison_df = pd.concat([df_test, predictions_df], axis=1)

# Display the DataFrame
full_comparison_df


Unnamed: 0,Index,Chiller,Boiler,DX Cooling,DX Heating,Building Type_Hospital,Building Type_Lodging,Building Type_Office,Building Type_Restaurant,Building Type_Retail,Building Type_School,Building Type_Warehouse,U-Factor Wall [Btu/hr-ft2-°F],U-Factor Roof [Btu/hr-ft2-°F],Net Wall Area [ft2],Net Roof Area [ft2],Glass U-Factor [Btu/hr-ft2-°F],Glass SHGC,Infiltration (cfm/ft2),Window-Wall Ratio,Plug and Process [W/ft2],Lighting [W/ft2],Building Area [ft2],vpz (cfm),vot (cfm),Climate Category_cool,Climate Category_hot,Climate Category_temperate,Heating (kBtu/ft2),Cooling (kBtu/ft2),Fans (kBtu/ft2),Pumps (kBtu/ft2),Heat Rejection (kBtu/ft2),Total Lighting (kBtu/ft2),Total Equipment (kBtu/ft2),Predicted_Heating (kBtu/ft2),Predicted_Cooling (kBtu/ft2),Predicted_Fans (kBtu/ft2),Predicted_Pumps (kBtu/ft2),Predicted_Heat Rejection (kBtu/ft2),Predicted_Total Lighting (kBtu/ft2),Predicted_Total Equipment (kBtu/ft2)
0,1599,0,1,1,0,0,0,0,0,0,1,0,0.278,0.182,1584.1,6857.62,0.351917,0.374301,0.259365,35.0,4.798904,0.684686,73958.7569,33860.76184,5265.204912,1,0,0,9.069515,2.065471,3.815937,0.053697,0.0,3.985999,19.258207,10.32591,4.184397,4.587016,0.126836,0.0,5.757984,17.862481
1,2849,0,1,1,1,0,0,1,0,0,0,0,0.29,2.858,217.94,0.0,0.360145,0.394,0.282798,21.2,0.629882,0.738003,5502.075124,3095.895568,467.848704,1,0,0,6.501295,1.533162,3.453921,0.0,0.0,9.279935,9.095611,5.058971,2.197512,3.089125,0.000378,0.0,7.977725,9.457692
2,2082,0,1,1,1,0,0,0,0,0,1,0,1.024288,0.235692,7371.31,12085.78,0.514419,0.458824,0.092772,16.36,0.643567,0.789991,229583.330739,110734.363904,70957.477216,1,0,0,13.913156,4.826745,8.06058,2.277568,0.0,5.975438,5.625265,14.725342,3.940219,8.03088,1.632278,0.022325,5.822456,6.466529
3,1065,0,1,1,0,0,0,0,1,0,0,0,0.505,2.858,157.92,0.0,0.610749,0.253,1.085253,14.0,55.31414,1.650004,2500.884526,9219.458768,1202.040624,0,0,1,19.014064,47.80993,61.753072,0.0,0.0,38.839173,285.775662,26.168166,39.695389,51.165595,0.014497,0.0,30.006013,257.030688
4,377,0,1,1,0,0,1,0,0,0,0,0,0.586059,0.273,3328.34,1978.84,0.620788,0.253,0.080191,30.16,3.817245,0.999265,122119.567031,53260.16768,0.0,0,0,1,16.677044,18.878871,4.971322,1.63416,0.0,13.650259,35.512839,8.104236,12.087816,5.834434,1.194538,0.0,11.634712,31.79719
5,3660,1,0,0,1,0,0,0,0,0,1,0,0.731487,0.206498,17751.78,14577.56,0.350107,0.232,0.078493,22.05,0.78019,0.764443,310003.226253,118609.393312,99070.141392,1,0,0,1.308677,2.005743,1.738951,2.848527,0.06176,7.306278,10.956436,5.948413,2.836147,3.227429,2.469171,0.422119,7.985497,10.570748
6,1635,0,1,1,0,0,0,0,0,0,1,0,0.477,0.273,3816.8,11740.57,0.644735,0.261265,0.280946,35.0,3.013002,1.13377,210886.3288,136034.426768,26509.095792,0,0,1,1.333322,11.576657,6.73996,0.462433,0.0,15.783996,14.565193,3.599441,12.25313,6.844688,0.561708,0.0,13.914546,14.650858
7,2532,0,1,1,1,0,1,0,0,0,0,0,0.363,0.273,1504.54,1003.41,0.480604,0.394,0.278439,10.87,2.620766,1.048122,43201.773762,6550.729408,1087.197328,0,0,1,2.368787,4.738013,6.864151,0.005704,0.0,14.126281,22.453345,3.451085,6.026013,6.506197,0.007653,0.0,12.460091,22.233412
8,1597,0,1,1,0,0,0,0,0,0,1,0,0.437,0.221,1584.1,6857.62,0.432047,0.252661,0.259365,35.0,4.798904,0.684686,73958.7569,34817.224272,5265.204912,0,0,1,1.784684,6.883451,4.041618,0.011021,0.0,4.052639,19.258207,3.263944,6.954661,4.727036,0.094085,0.0,6.034629,17.851031
9,732,0,1,1,1,0,0,1,0,0,0,0,0.363,0.273,1313.34,1660.73,0.480604,0.394,0.203256,33.0,1.354516,0.999999,53627.794941,38701.131312,0.0,1,0,0,19.064754,4.195805,1.588185,0.002651,0.0,13.747888,15.02289,12.64191,3.585522,1.829359,0.002234,0.0,11.884329,14.000641


In [32]:
from sklearn.metrics import r2_score

# Targets to score
targets = [
    'Heating (kBtu/ft2)', 'Cooling (kBtu/ft2)', 'Fans (kBtu/ft2)',
    'Pumps (kBtu/ft2)', 'Heat Rejection (kBtu/ft2)',
    'Total Lighting (kBtu/ft2)', 'Total Equipment (kBtu/ft2)',
]

# R2 of the blended prediction against the actual value, per target
r2_scores = {}
for target in targets:
    actual = full_comparison_df[target]
    predicted = full_comparison_df['Predicted_' + target]

    # A column lookup can return a DataFrame rather than a Series;
    # squeeze such a result down before scoring.
    actual = actual.squeeze() if isinstance(actual, pd.DataFrame) else actual
    predicted = predicted.squeeze() if isinstance(predicted, pd.DataFrame) else predicted

    r2_scores[target] = r2_score(actual, predicted)

# Report one line per target
print("R-squared (R²) scores for each target:")
for name in r2_scores:
    print(f"{name}: {r2_scores[name]}")


R-squared (R²) scores for each target:
Heating (kBtu/ft2): 0.8412159381630449
Cooling (kBtu/ft2): 0.906534653296388
Fans (kBtu/ft2): 0.9652903298800521
Pumps (kBtu/ft2): 0.9291352449177879
Heat Rejection (kBtu/ft2): 0.9372911714917127
Total Lighting (kBtu/ft2): 0.8982455020384835
Total Equipment (kBtu/ft2): 0.9896338478359284


In [33]:
from cortex import CortexClient, CortexSignature


In [34]:
#     # Instantiate Cortex client
# cortex_token = os.environ["CORTEX_TOKEN"]  # token redacted: read it from the environment (needs `import os`); never commit credentials to the notebook
# client = CortexClient(cortex_token)
# #

In [35]:
# # default parameters 
# n_estimators = 100

# client.log_param("n_estimators", n_estimators)


In [36]:
# signature_a = CortexSignature()
# signature_b = CortexSignature()


# # Input feature descriptions
# input_descriptions_a = [
#     "Indicates if the building uses a chiller for cooling (1 for yes, 0 for no)",
#     "Indicates if the building has a boiler for heating purposes (1 for yes, 0 for no)",
#     "Indicates the presence of a Direct Expansion (DX) cooling system (1 for yes, 0 for no)",
#     "Indicates the presence of a Direct Expansion (DX) heating system (1 for yes, 0 for no)",
#     "Binary indicator if the building is a hospital (1 for yes, 0 for no)",
#     "Binary indicator if the building type is lodging/accommodation (1 for yes, 0 for no)",
#     "Binary indicator if the building is used as an office (1 for yes, 0 for no)",
#     "Binary indicator if the building is a restaurant (1 for yes, 0 for no)",
#     "Binary indicator if the building is a retail store (1 for yes, 0 for no)",
#     "Binary indicator if the building is a school (1 for yes, 0 for no)",
#     "Binary indicator if the building is a warehouse (1 for yes, 0 for no)"
# ]

# input_descriptions_b = [
#     "U-Factor for walls, measuring heat transfer (in Btu/hr*ft²*°F)",
#     "U-Factor for the roof, indicating heat transfer rate (in Btu/hr*ft²*°F)",
#     "Net wall area of the building (in square feet)",
#     "Net roof area of the building (in square feet)",
#     "U-Factor for glass, measuring heat transfer through glass (in Btu/hr*ft²*°F)",
#     "Solar Heat Gain Coefficient (SHGC) for glass, indicating how much solar radiation passes through",
#     "Infiltration rate, measuring air leakage (in cubic feet per minute per square foot)",
#     "Window to wall ratio, representing the proportion of window area to wall area",
#     "Plug and process power density (in Watts per square foot)",
#     "Lighting power density (in Watts per square foot)",
#     "Total building area (in square feet)",
#     "Ventilation per zone (in cubic feet per minute)",
#     "Volume of outdoor air required for temperature control (in cubic feet per minute)",
#     "Binary indicator for a cool climate category (1 for yes, 0 for no)",
#     "Binary indicator for a hot climate category (1 for yes, 0 for no)",
#     "Binary indicator for a temperate climate category (1 for yes, 0 for no)"
# ]


# # Output feature descriptions
# output_descriptions = [
#     "Energy usage for heating per square foot, measured in thousands of British Thermal Units (kBtu)",
#     "Energy usage for cooling per square foot, measured in kBtu",
#     "Energy consumption by fans per square foot, measured in kBtu",
#     "Energy usage by pumps per square foot, measured in kBtu",
#     "Energy used for heat rejection processes per square foot, measured in kBtu",
#     "Energy used for total lighting per square foot, measured in kBtu",
#     "Energy used for total equipment per square foot, measured in kBtu"
# ]

# # Assuming X_a and y_a are your input and output DataFrames respectively
# signature_a.inputs_from_dataframe(X_a, randomSample=True, descriptions=input_descriptions_a)
# signature_a.outputs_from_dataframe(y_a, randomSample=True, descriptions=output_descriptions)

# # Assuming X_b and y_b are your input and output DataFrames respectively
# signature_b.inputs_from_dataframe(X_b, randomSample=True, descriptions=input_descriptions_b)
# signature_b.outputs_from_dataframe(y_b, randomSample=True, descriptions=output_descriptions)


  for (colname, coldata) in df.iteritems():


In [37]:
# experiment_name = "Building Energy Usage Prediction"
# experiment_description = "This experiment focuses on predicting energy usage in various types of buildings, using a Random Forest model. It involves analyzing binary indicators of building types and energy systems. The goal is to understand how different factors contribute to overall energy consumption and to develop a model that can predict energy usage based on these indicators."

# # Check if the experiment exists
# exp = client.get_experiment_by_name(experiment_name)
# if exp is None:
#     exp = client.create_experiment(experiment_name, experiment_description)

# exp_id = exp["_id"]

In [38]:
# #'rf_regressor_a' is your trained Random Forest model
# client.sklearn.log_model(
#     rf_regressor_a,
#     run_name="Random Forest Model with new outputs Run A",
#     cortex_experiment_id = exp_id,  # replace with your actual experiment ID
#     run_description = "Random Forest model run for predicting energy usage in buildings. This run focuses on utilizing binary indicators of building types and energy systems (like chillers, boilers, etc.). Random Forest was chosen for its robustness in handling diverse features and complex interactions",
#     signature=signature_a
# )


# #'rf_regressor_b' is your trained Random Forest model
# client.sklearn.log_model(
#     rf_regressor_b,
#     run_name="Random Forest Model with new outputs Run B",
#     cortex_experiment_id = exp_id,  # replace with your actual experiment ID
#     run_description = "Random Forest model run for predicting energy usage in buildings. This run focuses on utilizing indicators that could affect the energy consumption like climate categories and others in buildings. Random Forest was chosen for its robustness in handling diverse features and complex interactions",
#     signature=signature_b
# )


{'_id': '6570a6a6da4054ed92e96142',
 'timeLogged': 1701881510,
 'name': 'Random Forest Model with new outputs Run B',
 'description': 'Random Forest model run for predicting energy usage in buildings. This run focuses on utilizing indicators that could affect the energy consumption like climate categories and others in buildings. Random Forest was chosen for its robustness in handling diverse features and complex interactions',
 'datasetVersion': None,
 'mode': '',
 'user': {'permissions': {'deployment': {'shared': False,
    'dedicated': False,
    'serverless': True},
   'admin': False},
  'bookmarked': {'experiments': [], 'datasets': []},
  '_id': '6564c6c04cf4a79cbd111b4b',
  'ssoId': '64f09fa68cb18e001fe5800f',
  'sso': {'profile': {'name': 'Korutla Rajashekar',
    'location': 'Portland',
    'picture': '',
    'title': 'Sustainability',
    'company': 'Thornton Tomasetti, Inc.'},
   'isOauth': True,
   'isCS': False,
   'tokens': [],
   'isVerified': True,
   '_id': '64f09fa68cb