# Building Energy Load Estimation With Random Forest

This study estimates the heating and cooling loads of buildings using random forest regression.

### Import libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor


import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_file_paths:
    print(input_path)

### Load Data

In [None]:
# Descriptive column names, applied to the loaded data set in positional order.
columns = [
    # Input features
    "Relative Compactness",
    "Surface Area",
    "Wall Area",
    "Roof Area",
    "Overall Height",
    "Orientation",
    "Glazing Area",
    "Glazing Area Distribution",
    # Prediction targets
    "Heating Load",
    "Cooling Load",
]

In [None]:
# Read the CSV, then map its existing headers (in order) onto the names in `columns`.
df = pd.read_csv("../input/eergy-efficiency-dataset/ENB2012_data.csv")
header_map = {old_name: new_name for old_name, new_name in zip(df.columns, columns)}
df.rename(columns=header_map, inplace=True)

### Data First Look

In [None]:
# Print the full per-column summary of the frame.
df.info(verbose=True)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Summary statistics, transposed so each column of the data becomes a row.
df.describe().transpose()

In [None]:
# Horizontal box plot of every column on a single 15x10 inch figure.
sns.set_style('ticks')
fig, ax = plt.subplots(figsize=(15, 10))
ax = sns.boxplot(data=df, orient="h", ax=ax)

### Train/Test Split

y-values for heating and cooling

In [None]:
# Remove both target columns from df (pop mutates the frame) and keep them as
# separate series; df then holds only the input features.
heating, cooling = df.pop("Heating Load"), df.pop("Cooling Load")

In [None]:
# Preview the frame again now that the two load columns have been popped.
df.head(n=5)

Data train and test split for heating

In [None]:
 # Hold out 30% of the rows as the heating test set; the seed is fixed at 42.
 X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(
     df,
     heating,
     test_size=0.3,
     random_state=42,
 )

Data train and test split for cooling

In [None]:
 # Hold out 33% of the rows as the cooling test set; the seed is fixed at 42.
 X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
     df,
     cooling,
     test_size=0.33,
     random_state=42,
 )

### Create Model

In [None]:
# Hyperparameter grid explored by the grid searches below:
# tree depth 1-9, fraction of features per split, and number of trees.
rf_params = {
    'max_depth': [*range(1, 10)],
    'max_features': [0.25, 0.50, 0.75],
    'n_estimators': [100, 200, 500, 1000, 2000],
}

In [None]:
# Base (untuned) forest handed to the grid searches as their estimator.
rf_model = RandomForestRegressor(
    n_jobs=-1,
    bootstrap=True,
    random_state=42,
)

#### For Heating

In [None]:
# 10-fold cross-validated grid search over rf_params for the heating model.
rf_cv_model_heating = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    cv=10,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the heating training split.
rf_cv_model_heating.fit(X=X_train_h, y=y_train_h)

In [None]:
# Show the hyperparameter combination selected by the heating grid search.
rf_cv_model_heating.best_params_

In [None]:
# Build a forest configured with the hyperparameters chosen by the heating grid search.
rf_model_heating = RandomForestRegressor(
    n_jobs=-1,
    bootstrap=True,
    random_state=42,
    **rf_cv_model_heating.best_params_,
)

In [None]:
# Train the tuned heating forest on the heating training split.
rf_model_heating.fit(X=X_train_h, y=y_train_h)

#### Feature Importance for Heating

In [None]:
# Feature importances of the heating forest as percentages, one row per feature.
heating_importance = pd.DataFrame(
    {"Importance": rf_model_heating.feature_importances_ * 100},
    index=X_train_h.columns,
)
# Sort ascending so the most important feature ends up at the top of the bar chart.
heating_importance = heating_importance.sort_values(
    by="Importance", axis=0, ascending=True
)
# `Importance` ends up holding the object returned by .plot(), not the table.
Importance = heating_importance.plot(kind="barh", color="r")

#### For Cooling

In [None]:
# 10-fold cross-validated grid search over rf_params, used to tune the cooling model.
rf_cv_model_cooling = GridSearchCV(
    estimator=rf_model,
    param_grid=rf_params,
    cv=10,
    n_jobs=-1,
)

In [None]:
# Tune the cooling grid search on the cooling training split. (It was previously
# fitted on the heating split, so the "cooling" search optimized the wrong target.)
rf_cv_model_cooling.fit(X_train_c, y_train_c)

In [None]:
# Show the hyperparameter combination selected by the rf_cv_model_cooling grid search.
rf_cv_model_cooling.best_params_

In [None]:
# Build a forest configured with the hyperparameters chosen by rf_cv_model_cooling.
rf_model_cooling = RandomForestRegressor(
    n_jobs=-1,
    bootstrap=True,
    random_state=42,
    **rf_cv_model_cooling.best_params_,
)

In [None]:
# Train the cooling forest on the cooling training split.
rf_model_cooling.fit(X=X_train_c, y=y_train_c)

#### Feature Importance for Cooling

In [None]:
# Feature importances of the COOLING forest as percentages, one row per feature.
# (This cell previously read rf_model_heating / X_train_h, so the chart under the
# cooling heading was a copy of the heating one.)
cooling_importance = pd.DataFrame(
    {"Importance": rf_model_cooling.feature_importances_ * 100},
    index=X_train_c.columns,
)
# Sort ascending so the most important feature ends up at the top of the bar chart.
cooling_importance = cooling_importance.sort_values(
    by="Importance", axis=0, ascending=True
)
# `Importance` ends up holding the object returned by .plot(), not the table.
Importance = cooling_importance.plot(kind="barh", color="b")

### Model Evaluation

In [None]:
def print_evaluation(model, xTrain, xTest, yTrain, yTest) -> None:
    """Print RMSE, R^2 and ``model.score`` for the train and test splits.

    Args:
        model: Fitted estimator exposing ``predict`` and ``score``.
        xTrain: Feature matrix of the training split.
        xTest: Feature matrix of the test split.
        yTrain: True target values for ``xTrain``.
        yTest: True target values for ``xTest``.
    """
    # Predict once per split and reuse the result for both metrics.
    train_pred = model.predict(xTrain)
    test_pred = model.predict(xTest)

    # Take the square root of the MSE explicitly: the ``squared=False`` shortcut
    # was deprecated in scikit-learn 1.4 and is rejected by later releases.
    train_rmse = np.sqrt(mean_squared_error(yTrain, train_pred))
    test_rmse = np.sqrt(mean_squared_error(yTest, test_pred))

    print(f"Train rmse: {train_rmse}")
    print(f"Test rmse: {test_rmse}")
    print(f"Train R^2: {r2_score(yTrain, train_pred)}")
    print(f"Test R^2: {r2_score(yTest, test_pred)}")
    # NOTE: the "accuracy" label is kept for output compatibility; for
    # scikit-learn regressors ``score`` returns R^2 by default, not an accuracy.
    print(f"Train accuracy: {model.score(xTrain, yTrain)}")
    print(f"Test accuracy: {model.score(xTest, yTest)}")

#### Heating Evaluation

In [None]:
# Report train/test metrics for the heating grid-search model.
print_evaluation(
    model=rf_cv_model_heating,
    xTrain=X_train_h,
    xTest=X_test_h,
    yTrain=y_train_h,
    yTest=y_test_h,
)

#### Cooling Evaluation

In [None]:
# Report train/test metrics for rf_cv_model_cooling on the cooling splits.
print_evaluation(
    model=rf_cv_model_cooling,
    xTrain=X_train_c,
    xTest=X_test_c,
    yTrain=y_train_c,
    yTest=y_test_c,
)

## Conclusion

Of the two random forest models we created, the one estimating the heating load was more successful.

To predict two different values from the same data set;

- Two different data splits were made.
- Two different model tunings were performed.
- The data contains no missing values or outliers.

**Note:** This is a study I did while learning data science. Please share your criticisms and comments with me.
**Note:** May the force be with you
