In [None]:
########################## Library Importing and Settings ###########################
from pathlib import Path

import boto3
import joblib
import numpy as np
import pandas as pd
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [None]:
########################## Data Loading From Local  ###########################
# Read the dataset from the working directory. `row_data` keeps the file
# contents untouched; every later transformation is applied to the `data` copy.
row_data = pd.read_csv("data.csv")
data = row_data.copy()

# Lower-case the header so the following cells can address columns with a
# single, consistent spelling.
data.columns = [column.lower() for column in data.columns]

# Only the last expression of a cell is rendered, so the preview goes last
# (and therefore shows the normalised column names).
data.head()

In [None]:
########################## AWS Process ###########################
#s3_bucket = "busonbucket"
#def upload_to_s3 (localpath, remotepath):
#    boto3.client("s3").upload_file(Filename=localpath, Bucket=s3_bucket, Key=remotepath)
#def download_from_s3(localpath, remotepath):
#    boto3.client("s3").download_file(s3_bucket, remotepath, localpath)

In [None]:
########################## Feature Engineering ###########################
# Columns excluded from the feature set before any further processing.
first_drop = ["id", "year_factor", "max_wind_speed",
              "days_with_fog", "direction_max_wind_speed", "direction_peak_wind_speed",
              "cooling_degree_days", "heating_degree_days", "precipitation_inches",
              "snowfall_inches", "snowdepth_inches", "days_below_20f",
              "days_below_10f", "days_below_0f", "days_above_80f",
              "days_above_100f", "days_above_110f"]

# Year against which the building age is measured.
REFERENCE_YEAR = 2023

data = data.drop(columns = first_drop)

# Impute missing construction years with the mean of the same state. A state
# without a single recorded year has a NaN mean, which would make the integer
# cast below raise, so those rows fall back to the overall mean.
state_mean_year = data.groupby("state_factor")["year_built"].transform("mean")
year_built = data["year_built"].fillna(state_mean_year)
year_built = year_built.fillna(data["year_built"].mean())

# The integer cast truncates the (possibly fractional) imputed year.
data["build_age"] = REFERENCE_YEAR - year_built.astype("int")
data = data.drop(columns = "year_built")

In [None]:
def get_manual_facility_groups():
    """Return the hand-maintained grouping of facility types.

    Returns:
        dict: maps each group label ("Living_Space", "Social_Institutions",
        "Business_Commercial_Venues", "Public", "Warehouse_Service", in that
        insertion order) to the set of facility-type strings assigned to it.
    """
    living_space = {
        "2to4_Unit_Building",
        "5plus_Unit_Building",
        "Mixed_Use_Predominantly_Residential",
        "Multifamily_Uncategorized",
        "Mixed_Use_Commercial_and_Residential",
        "Mixed_Use_Predominantly_Commercial",
    }

    social_institutions = {
        "Education_College_or_university",
        "Education_Other_classroom",
        "Education_Preschool_or_daycare",
        "Education_Uncategorized",
        "Health_Care_Inpatient",
        "Health_Care_Outpatient_Clinic",
        "Health_Care_Outpatient_Uncategorized",
        "Health_Care_Uncategorized",
        "Nursing_Home",
        "Religious_worship",
    }

    business_commercial_venues = {
        "Commercial_Other",
        "Commercial_Unknown",
        "Industrial",
        "Parking_Garage",
        "Food_Sales",
        "Food_Service_Other",
        "Food_Service_Restaurant_or_cafeteria",
        "Food_Service_Uncategorized",
        "Grocery_store_or_food_market",
        "Office_Bank_or_other_financial",
        "Office_Medical_non_diagnostic",
        "Office_Mixed_use",
        "Office_Uncategorized",
        "Retail_Enclosed_mall",
        "Retail_Strip_shopping_mall",
        "Retail_Uncategorized",
        "Retail_Vehicle_dealership_showroom",
        "Laboratory",
        "Data_Center",
        "Lodging_Dormitory_or_fraternity_sorority",
        "Lodging_Hotel",
        "Lodging_Other",
        "Lodging_Uncategorized",
    }

    public = {
        "Public_Assembly_Drama_theater",
        "Public_Assembly_Entertainment_culture",
        "Public_Assembly_Library",
        "Public_Assembly_Movie_Theater",
        "Public_Assembly_Other",
        "Public_Assembly_Recreation",
        "Public_Assembly_Social_meeting",
        "Public_Assembly_Stadium",
        "Public_Assembly_Uncategorized",
        "Public_Safety_Courthouse",
        "Public_Safety_Fire_or_police_station",
        "Public_Safety_Penitentiary",
        "Public_Safety_Uncategorized",
    }

    warehouse_service = {
        "Warehouse_Distribution_or_Shipping_center",
        "Warehouse_Nonrefrigerated",
        "Warehouse_Refrigerated",
        "Warehouse_Selfstorage",
        "Warehouse_Uncategorized",
        "Service_Drycleaning_or_Laundry",
        "Service_Uncategorized",
        "Service_Vehicle_service_repair_shop",
    }

    # Key order is kept stable because callers that scan the groups in order
    # resolve a facility type to the first group containing it.
    return {
        "Living_Space": living_space,
        "Social_Institutions": social_institutions,
        "Business_Commercial_Venues": business_commercial_venues,
        "Public": public,
        "Warehouse_Service": warehouse_service,
    }
facility_groups = get_manual_facility_groups()

# Invert the grouping into a facility-type -> group lookup. `setdefault` keeps
# the first group (in dict order) should a facility type ever be listed twice.
group_by_facility = {}
for group_name, members in facility_groups.items():
    for facility in members:
        group_by_facility.setdefault(facility, group_name)

# Facility types that belong to no group (including missing values) map to None.
data['category'] = data['facility_type'].apply(group_by_facility.get)

In [None]:
# Missing values: fill energy_star_rating with the median of its
# (state_factor, category) group first, then with the overall median of the
# partially filled column for rows whose group offers no median.
rating_group_median = (
    data.groupby(["state_factor", "category"])["energy_star_rating"].transform("median")
)
rating_filled = data["energy_star_rating"].fillna(rating_group_median)
data["energy_star_rating"] = rating_filled.fillna(rating_filled.median())

In [None]:
# facility_type has been condensed into `category`; it and building_class are
# removed from the frame here.
data = data.drop(columns = ["facility_type", "building_class"])

In [None]:
# Pick the monthly temperature columns by name instead of by position, so that
# a change in column order cannot silently add or remove inputs. Each column is
# aggregated per state according to the statistic encoded in its name.
monthly_temp_aggs = {}
for column in data.columns:
    if not column.endswith("_temp"):
        continue
    if "_min_" in column:
        monthly_temp_aggs[column] = "min"
    elif "_avg_" in column:
        monthly_temp_aggs[column] = "mean"
    elif "_max_" in column:
        monthly_temp_aggs[column] = "max"

if not monthly_temp_aggs:
    raise KeyError("no monthly *_min_/_avg_/_max_ temperature columns found in data")

# One row per state, one "<column>_for_state" column per monthly statistic.
temp_min_max = (
    data.groupby("state_factor")
    .agg(monthly_temp_aggs)
    .add_suffix("_for_state")
)

# Collapse the monthly statistics into the lowest and highest value per state.
state_extremes = pd.DataFrame({
    "min_temp_for_state": temp_min_max.min(axis = 1),
    "max_temp_for_state": temp_min_max.max(axis = 1),
})
temp_min_max = temp_min_max.join(state_extremes).reset_index()

# Attach the two state-level extremes to every building of that state.
data = pd.merge(data,
                temp_min_max[["state_factor", "min_temp_for_state", "max_temp_for_state"]],
                how = "left",
                on = "state_factor")

In [None]:
# Names of the 36 monthly temperature columns: january..december, each with a
# min, avg and max variant (in that order).
month_names = ("january", "february", "march", "april", "may", "june",
               "july", "august", "september", "october", "november", "december")
del_temp = [f"{month}_{stat}_temp"
            for month in month_names
            for stat in ("min", "avg", "max")]

# The monthly columns are no longer needed once removed here; only the
# remaining columns of `data` go on to the next cells.
data = data.drop(columns = del_temp)

In [None]:
# Integer-encode the categorical columns. Each column gets its own encoder,
# stored in `label_encoders`, so the fitted value -> code mapping of every
# column stays available (e.g. via `inverse_transform` or for encoding new
# rows) instead of being overwritten by the next column's fit.
label_cols = ["state_factor", "category"]
label_encoders = {}
for col in label_cols:
    label_encoder = LabelEncoder()
    data[col] = label_encoder.fit_transform(data[col])
    label_encoders[col] = label_encoder

In [None]:
################# Model Creation #########################
# Target as a 1-D Series: passing a single-column DataFrame makes several
# scikit-learn estimators emit a DataConversionWarning on every fit.
y = data["site_eui"]
X = data.drop(columns = "site_eui")

# Candidate regressors, all left at default hyper-parameters. Every model that
# accepts a seed uses the same one so the comparison is repeatable.
models = [('LR', LinearRegression()),
          ('KNN', KNeighborsRegressor()),
          ('CART', DecisionTreeRegressor(random_state = 17)),
          ('RF', RandomForestRegressor(random_state = 17)),
          ('GBM', GradientBoostingRegressor(random_state = 17)),
          ("XGBoost", XGBRegressor(objective = 'reg:squarederror', random_state = 17)),
          ("LightGBM", LGBMRegressor(random_state = 17))]

# 5-fold cross-validation; the reported figure is the mean of the per-fold RMSE.
for name, regressor in models:
    fold_mse = -cross_val_score(regressor, X, y, cv = 5, scoring = "neg_mean_squared_error")
    rmse = np.mean(np.sqrt(fold_mse))
    print(f"RMSE: {round(rmse, 4)} ({name}) ")


In [None]:
###### LightGBM
lgbm_model = LGBMRegressor(random_state = 17)
lgbm_params = {"learning_rate": [0.01, 0.05, 0.1],
               "n_estimators": [500, 1000, 2000, 2500],
               "colsample_bytree": [0.5, 0.7, 1]
               }

# Flatten the target so the search works whether `y` is a Series or a
# single-column DataFrame, without column-vector warnings.
y_1d = np.ravel(y)

# Select hyper-parameters on the same error measure that is reported below.
# Without an explicit `scoring`, GridSearchCV falls back to the estimator's
# own `score` method (R^2 for regressors).
lgbm_gs_best = GridSearchCV(lgbm_model,
                            lgbm_params,
                            scoring = "neg_mean_squared_error",
                            cv = 5,
                            n_jobs = -1,
                            verbose = True).fit(X, y_1d)
best = lgbm_gs_best.best_params_
# Recorded from an earlier run (may differ now that selection uses MSE):
# {'colsample_bytree': 0.7, 'learning_rate': 0.01, 'n_estimators': 500}

# GridSearchCV refits the best configuration on the full data by default, so
# the fitted model is taken from the search instead of being trained again.
final_model = lgbm_gs_best.best_estimator_

rmse = np.mean(np.sqrt(-cross_val_score(final_model, X, y_1d, cv = 5, scoring = "neg_mean_squared_error")))
print(f"RMSE: {round(rmse, 4)} (LightGBM tuned) ")
# Recorded from an earlier run: RMSE 50.41004


In [None]:
# Persist the fitted model. The target directory is created first because
# joblib.dump does not create missing parent directories.
MODEL_PATH = "datasets/model.pkl"
Path(MODEL_PATH).parent.mkdir(parents = True, exist_ok = True)
joblib.dump(final_model, MODEL_PATH)

In [None]:
# upload_to_s3("datasets/model.pkl", "model_energy.pkl")