In [19]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import RandomizedSearchCV

from datetime import datetime
import joblib
import json
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import copy 
import hashlib

### Load Dataset 

In [20]:
# Load the pre-processed features and targets, one (X, y) pair per split.
X_train, y_train = (
    joblib.load("data/processed/X_train_new.pkl"),
    joblib.load("data/processed/y_train.pkl"),
)
X_valid, y_valid = (
    joblib.load("data/processed/X_valid_new.pkl"),
    joblib.load("data/processed/y_valid.pkl"),
)
X_test, y_test = (
    joblib.load("data/processed/X_test_new.pkl"),
    joblib.load("data/processed/y_test.pkl"),
)

### Create Log Template 

In [21]:
def time_stamp():
    """Return the current local date and time as a ``datetime`` object."""
    now = datetime.now()
    return now

In [22]:
def create_log_template():
    """Build an empty training-log record.

    Returns
    -------
    dict
        One key per logged field, each mapped to its own fresh empty list.
    """
    log_fields = (
        "model_name",
        "model_uid",
        "training_time",
        "training_date",
        "r2_score",
        "mse_score",
        "mae_score",
        "data_configurations",
    )
    return {field: [] for field in log_fields}

In [23]:
def training_log_updater(current_log, log_path):
    """Append ``current_log`` to the JSON log file at ``log_path``.

    The file holds a JSON list with one entry per call. A missing or blank
    file is treated as an empty log, and the parent directory of
    ``log_path`` is created when it does not exist yet.

    Parameters
    ----------
    current_log : dict
        Entry to append; it must be JSON-serialisable.
    log_path : str
        Path of the JSON log file.

    Returns
    -------
    list
        Every entry now stored in the file, with the new one last.

    Raises
    ------
    TypeError
        If the log cannot be serialised to JSON. The file on disk is left
        untouched in that case.
    json.JSONDecodeError
        If the existing file is non-blank but does not contain valid JSON.
    """
    import os  # local import: the notebook's import cell does not provide it

    current_log = current_log.copy()

    try:
        with open(log_path, "r") as file:
            raw_log = file.read()
    except FileNotFoundError:
        raw_log = ""

    # A blank file is a log with no entries; anything else must parse.
    last_log = json.loads(raw_log) if raw_log.strip() else []
    last_log.append(current_log)

    # Serialise before opening for writing, so a failure here cannot
    # truncate the log that is already on disk.
    payload = json.dumps(last_log)

    parent_dir = os.path.dirname(log_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(log_path, "w") as file:
        file.write(payload)

    return last_log

### Training and Evaluation

#### Create Model Object 

In [24]:
# Untuned ("vanilla") estimators, all with default hyper-parameters.
# The tree-based models are randomised, so they get a fixed seed to make
# their scores reproducible across kernel restarts.
RANDOM_STATE = 42

lr_vanilla = LinearRegression()
dct_vanilla = DecisionTreeRegressor(random_state=RANDOM_STATE)
rfr_vanilla = RandomForestRegressor(random_state=RANDOM_STATE)
knn_vanilla = KNeighborsRegressor()

In [25]:
# Registry of candidate models, grouped by data configuration. Each entry
# carries the estimator, its class name and a uid that is filled in after
# training.
list_of_model = {
    "vanilla": [
        {
            "model_name": type(estimator).__name__,
            "model_object": estimator,
            "model_uid": "",
        }
        for estimator in (lr_vanilla, dct_vanilla, rfr_vanilla, knn_vanilla)
    ]
}

#### Baseline Model 

In [26]:
# Naive baseline: a constant prediction equal to the mean of the training target.
baseline_model = np.mean(y_train)
print(baseline_model)

0.5287046241903901


In [27]:
# MSE of the constant baseline, measured on the training target itself.
baseline_pred = np.full(len(y_train), baseline_model)
baseline_mse = mean_squared_error(y_train, baseline_pred)
print(baseline_mse)

1.9716680463575895


#### Training Model 

In [28]:
def train_eval_model(list_of_model, prefix_model_name, X_train, y_train, data_configuration_name, X_valid, y_valid, log_path):
    """Fit each model, score it on the validation data and log the run.

    Parameters
    ----------
    list_of_model : list of dict
        Entries with "model_name", "model_object" and "model_uid" keys.
        The list is deep-copied, so the caller's objects are not fitted.
    prefix_model_name : str
        Prefix joined to each model name with "-" in the log.
    X_train, y_train
        Data passed to each model's ``fit``.
    data_configuration_name : str
        Label stored under "data_configurations" for every model.
    X_valid, y_valid
        Data used to compute the R2, MSE and MAE scores.
    log_path : str
        JSON log file handed to ``training_log_updater``.

    Returns
    -------
    tuple
        ``(training_log, list_of_model)``: the full log returned by
        ``training_log_updater`` and the copied entries, now holding fitted
        models and their uids.
    """
    fitted_models = copy.deepcopy(list_of_model)
    run_log = create_log_template()

    for entry in tqdm(fitted_models):
        logged_name = prefix_model_name + "-" + entry["model_name"]
        estimator = entry["model_object"]

        fit_started = time_stamp()
        estimator.fit(X_train, y_train)
        fit_finished = time_stamp()
        fit_seconds = (fit_finished - fit_started).total_seconds()

        valid_pred = estimator.predict(X_valid)
        scores = {
            "r2_score": r2_score(y_valid, valid_pred),
            "mse_score": mean_squared_error(y_valid, valid_pred),
            "mae_score": mean_absolute_error(y_valid, valid_pred),
        }

        # The uid is the MD5 digest of the two fit timestamps concatenated.
        uid_source = str(fit_started) + str(fit_finished)
        uid = hashlib.md5(uid_source.encode()).hexdigest()
        entry["model_uid"] = uid

        record = {
            "model_name": logged_name,
            "model_uid": uid,
            "training_time": fit_seconds,
            "training_date": str(fit_started),
            "data_configurations": data_configuration_name,
        }
        record.update(scores)
        for field, value in record.items():
            run_log[field].append(value)

    training_log = training_log_updater(run_log, log_path)

    return training_log, fitted_models

In [29]:
# Fit and score the vanilla models, appending the run to the JSON training log.
training_log, list_of_model_vanilla = train_eval_model(
    list_of_model=list_of_model["vanilla"],
    prefix_model_name="vanilla_model",
    X_train=X_train,
    y_train=y_train,
    data_configuration_name="vanilla",
    X_valid=X_valid,
    y_valid=y_valid,
    log_path="log/training_log.json",
)

100%|█████████████████████████████████████████████| 4/4 [00:02<00:00,  1.81it/s]


In [30]:
# Replace the "vanilla" entries with a deep copy of the entries returned by
# train_eval_model, which hold the fitted models and their uids.
list_of_model["vanilla"] = copy.deepcopy(list_of_model_vanilla)

### Best performance 

In [31]:
def training_log_to_df(training_log):
    """Flatten the training log into one DataFrame, best result first.

    Parameters
    ----------
    training_log : list of dict
        Log entries; each one is passed to ``pd.DataFrame``, so it is
        expected to map column names to equal-length lists.

    Returns
    -------
    pandas.DataFrame
        All logged rows, sorted by "r2_score" descending and then
        "training_time" ascending, with a fresh 0..n-1 index. An empty log
        yields an empty DataFrame.
    """
    frames = [pd.DataFrame(log) for log in training_log]
    if not frames:
        # Nothing logged yet: return an empty frame instead of failing on
        # the missing sort columns.
        return pd.DataFrame()

    # Concatenate once; growing a frame inside a loop copies it every pass.
    training_res = (
        pd.concat(frames)
        .sort_values(["r2_score", "training_time"], ascending=[False, True])
        .reset_index(drop=True)
    )

    return training_res

In [32]:
# Turn the accumulated training log into a DataFrame sorted best-first.
training_res = training_log_to_df(training_log)

100%|████████████████████████████████████████████| 1/1 [00:00<00:00, 820.48it/s]


In [33]:
# Display the ranked results table.
training_res

Unnamed: 0,model_name,model_uid,training_time,training_date,r2_score,mse_score,mae_score,data_configurations
0,vanilla_model-LinearRegression,70489a57b46e5e069ddb869646fd3aca,0.012005,2023-04-10 13:52:48.537645,0.999988,2.7e-05,0.002945,vanilla
1,vanilla_model-RandomForestRegressor,fdbd8574e6bf6519fe7767feba7ec0f8,2.084238,2023-04-10 13:52:48.600617,0.975385,0.05461,0.027902,vanilla
2,vanilla_model-DecisionTreeRegressor,86fc11e8f0a34bf01707010b9a36c254,0.04382,2023-04-10 13:52:48.554178,0.966616,0.074065,0.044936,vanilla
3,vanilla_model-KNeighborsRegressor,009768a97d92e3e7269fd9cc3b2346be,0.011252,2023-04-10 13:52:50.722792,0.965375,0.07682,0.055829,vanilla


#### Best Performance Model 

In [34]:
def get_best_model(training_log_df, list_of_model):
    """Return the model object that belongs to the best logged result.

    The best row is the one with the highest "r2_score", with ties broken
    by the lowest "training_time". Its "model_uid" is then looked up in
    ``list_of_model``.

    Parameters
    ----------
    training_log_df : pandas.DataFrame
        Results table with "r2_score", "training_time" and "model_uid"
        columns.
    list_of_model : dict
        Maps a configuration name to a list of entries, each holding
        "model_uid" and "model_object" keys.

    Returns
    -------
    object
        The "model_object" of the first entry whose uid matches.

    Raises
    ------
    RuntimeError
        If no entry in ``list_of_model`` carries the best row's uid.
    """
    best_model_info = training_log_df.sort_values(
        ["r2_score", "training_time"], ascending=[False, True]
    ).iloc[0]
    best_uid = best_model_info["model_uid"]

    for models in list_of_model.values():
        for model_data in models:
            if model_data["model_uid"] == best_uid:
                # Return immediately: no need to scan the remaining
                # configurations, and no reliance on comparing the model
                # itself against None.
                return model_data["model_object"]

    raise RuntimeError("The best model not found in your list of model.")
    

In [35]:
# Pick the model object behind the top-ranked row of the results table.
model = get_best_model(training_res, list_of_model)

In [36]:
# Display the selected model.
model

In [38]:
# Persist the selected model to disk with joblib.
joblib.dump(model, "model/best_vanilla_model.pkl")

['model/best_vanilla_model.pkl']

In [39]:
# Predict on X_test with the selected model.
y_predd = model.predict(X_test)

In [40]:
# Display the predictions for X_test.
y_predd

array([0.07038663, 0.02037861, 0.12039022, ..., 0.31038247, 0.02037572,
       1.24030943])

In [41]:
# Pair the y_test values with the squeezed predictions for a side-by-side table.
data_res = dict(actual=y_test, pred=y_predd.squeeze())

In [43]:
# Mean squared error of the predictions against y_test.
mean_squared_error(y_test, y_predd)

2.7589220341679503e-05

In [44]:
# Mean absolute error of the predictions against y_test.
mean_absolute_error(y_test, y_predd)

0.0029793241269037126

In [45]:
# Root mean squared error: the square root of the MSE against y_test.
np.sqrt(mean_squared_error(y_test, y_predd))

0.005252544177984561

In [46]:
# Show the y_test values and the predictions side by side.
pd.DataFrame(data_res)

Unnamed: 0,actual,pred
11832,0.07,0.070387
14655,0.03,0.020379
9669,0.12,0.120390
7862,0.19,0.190367
9059,0.14,0.140359
...,...,...
9542,0.13,0.120360
2425,0.86,0.850336
5766,0.31,0.310382
15702,0.02,0.020376
