In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("energy_dataset_processed.csv")

In [3]:
X = df.drop(['Heating_Load', 'Cooling_Load'], axis = 1)
y = df[['Heating_Load', 'Cooling_Load']]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
scaler = StandardScaler()

rf_model = RandomForestRegressor(random_state=42)

In [6]:
my_pipe = Pipeline(steps = [('scaler', scaler),
                        ('rf_model', rf_model)])

In [7]:
my_pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('rf_model', RandomForestRegressor(random_state=42))])

In [8]:
predictions = my_pipe.predict(X_test)

In [9]:
print('MAE for rf_model: ',mean_absolute_error(predictions, y_test))

MAE for rf_model:  0.7588386363636348


In [40]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 4)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Bootstrap
bootstrap = [True, False]

# Hyper-parameter tuning 

In [44]:
parameters = {'rf_model__n_estimators': n_estimators,
               'rf_model__max_features': max_features,
               'rf_model__max_depth': max_depth,
               'rf_model__min_samples_split': min_samples_split,
               'rf_model__min_samples_leaf': min_samples_leaf,
               'rf_model__bootstrap': bootstrap
            }

In [47]:
rf_grid = GridSearchCV(my_pipe, parameters, cv=3)

In [48]:
rf_grid.fit(X_train, y_train)

GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('rf_model',
                                        RandomForestRegressor(random_state=42))]),
             param_grid={'rf_model__bootstrap': [True, False],
                         'rf_model__max_depth': [10, 43, 76, 110, None],
                         'rf_model__max_features': ['auto', 'sqrt'],
                         'rf_model__min_samples_leaf': [1, 2, 4],
                         'rf_model__min_samples_split': [2, 5, 10],
                         'rf_model__n_estimators': [100, 200, 300, 400, 500,
                                                    600, 700, 800, 900, 1000]})

In [37]:
model.best_params_

{'rf_model__max_depth': 43,
 'rf_model__max_features': 'sqrt',
 'rf_model__min_samples_leaf': 1,
 'rf_model__min_samples_split': 2,
 'rf_model__n_estimators': 100}

In [38]:
model.best_score_

0.980425727682596

In [49]:
rf_grid.best_params_

{'rf_model__bootstrap': False,
 'rf_model__max_depth': 43,
 'rf_model__max_features': 'sqrt',
 'rf_model__min_samples_leaf': 1,
 'rf_model__min_samples_split': 2,
 'rf_model__n_estimators': 800}

In [50]:
rf_grid.best_score_

0.9806033213031776