## Read training and test data

In [11]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import cross_validate
from helper import *
# The imports below deliberately stay AFTER the star import so that nothing
# exported by `helper` can shadow these module names.
import itertools
import pickle
import time
import mlflow

# Load the pre-split training and test sets; in both files the `unique_id`
# column is used as the DataFrame index.
training_set, test_set = (
    pd.read_csv(csv_path, index_col='unique_id')
    for csv_path in (
        '../data/model_dataset/training_set.csv',
        '../data/model_dataset/test_set.csv',
    )
)

## Set features and target

In [12]:
# Columns that must not be fed to the model: the target itself plus two
# non-feature columns ('Unnamed: 0' is presumably a leftover saved index and
# 'full_name' an identifier -- TODO confirm against the dataset builder).
non_feature_columns = ['target_value', 'Unnamed: 0', 'full_name']

# Training split: feature matrix and target vector
y_train = training_set['target_value']
X_train = training_set.drop(columns=non_feature_columns)

# Test (holdout) split: same columns removed so both matrices line up
y_test = test_set['target_value']
X_test = test_set.drop(columns=non_feature_columns)

## Perform hyperparameter tuning

In [13]:
# Exhaustive grid search over RandomForestRegressor hyperparameters.
# Each combination is one MLflow run that records:
#   * the hyperparameters and the feature list,
#   * 5-fold cross-validated R2 / MSE / MAE on the training set,
#   * R2 / MSE / MAE on the holdout (test) set.
experiment_start = time.time()

mlflow.set_tracking_uri("../mlruns")
mlflow.set_experiment('RandomForest')

n_estimators_range = [1000]
max_depth_range = [2, 10, 15, 20, 40, 50]
min_samples_split_range = [2, 5, 10, 20]
min_samples_leaf_range = [1, 2, 5, 10, 20]
# The int 1 means "one feature per split"; the float 1.0 means "all features".
# 1.0 replaces the former 'auto' option, which for regressors meant exactly
# that but was deprecated in scikit-learn 1.1 and removed in 1.3 (where it
# raises InvalidParameterError).
max_features_range = [1, 1.0, 'sqrt', 'log2']
random_state = 42

# Loop-invariant: the scorers evaluated by cross_validate for every candidate.
scoring = ['neg_mean_absolute_error', 'r2', 'neg_mean_squared_error']

features = str(list(X_train.columns))

# itertools.product iterates in the same order as the equivalent nested loops
# (rightmost range varies fastest).
parameter_grid = itertools.product(
    n_estimators_range,
    max_depth_range,
    min_samples_split_range,
    min_samples_leaf_range,
    max_features_range,
)

count = 0  # running tally of evaluated configurations
for count, (n_estimators, max_depth, min_samples_split,
            min_samples_leaf, max_features) in enumerate(parameter_grid, start=1):
    rf_parameters = {"n_estimators": n_estimators,
                     "max_depth": max_depth,
                     "min_samples_split": min_samples_split,
                     "min_samples_leaf": min_samples_leaf,
                     "max_features": max_features,
                     "random_state": random_state}

    with mlflow.start_run():
        # log hyperparameters (random_state is fixed and intentionally not logged)
        mlflow.log_params({"n_estimators": n_estimators,
                           "max_depth": max_depth,
                           "min_samples_split": min_samples_split,
                           "min_samples_leaf": min_samples_leaf,
                           "max_features": max_features,
                           "features": features})

        rf_model = RandomForestRegressor(**rf_parameters)

        # 5-fold cross-validation on the training set. cross_validate fits
        # clones of the estimator, so rf_model itself is still unfitted
        # afterwards and can be reused for the holdout fit below.
        scores = cross_validate(estimator=rf_model, X=X_train, y=y_train,
                                scoring=scoring, cv=5, n_jobs=-1)

        # fit on the full training set to evaluate test set scores
        rf_model.fit(X=X_train, y=y_train)
        y_pred = pd.Series(rf_model.predict(X_test))
        # round the targets to the nearest 5
        y_pred = y_pred.apply(round_to_nearest_5)

        # NOTE(review): the holdout metrics are computed on ROUNDED
        # predictions while the cross-validation metrics use raw predictions,
        # so the two groups are not directly comparable. Selecting
        # hyperparameters by the *_holdout metrics would also leak the test
        # set into model selection -- prefer the CV metrics for that.
        mlflow.log_metrics({
            # scorers are negated by sklearn, flip the sign back for MSE/MAE
            "R2": scores["test_r2"].mean(),
            "MSE": -1.0 * (scores["test_neg_mean_squared_error"].mean()),
            "MAE": -1.0 * (scores["test_neg_mean_absolute_error"].mean()),
            "R2_holdout": r2_score(y_true=y_test, y_pred=y_pred),
            "MSE_holdout": mean_squared_error(y_true=y_test, y_pred=y_pred),
            "MAE_holdout": mean_absolute_error(y_true=y_test, y_pred=y_pred),
        })

print("Experiment Duration {} s".format(str(round((time.time() - experiment_start)))))

2023/09/03 23:50:05 INFO mlflow.tracking.fluent: Experiment with name 'RandomForest' does not exist. Creating a new experiment.


Experiment Duration 1241 s


## Set and test final model 

In [17]:
# Final model configuration (presumably picked from the MLflow runs of the
# tuning step -- TODO confirm which metric drove the choice).
rf_final_parameters = {
    "n_estimators": 1000,
    "max_depth": 20,
    "min_samples_split": 20,
    "min_samples_leaf": 2,
    # 1.0 = consider all features at every split. This is what 'auto' meant
    # for regressors; 'auto' was deprecated in scikit-learn 1.1 and removed
    # in 1.3, so the explicit value keeps the cell runnable on current versions.
    "max_features": 1.0,
    "random_state": 42,
}

rf_final_model = RandomForestRegressor(**rf_final_parameters)
rf_final_model.fit(X=X_train, y=y_train)

y_pred = pd.Series(rf_final_model.predict(X_test))
# round the targets to the nearest 5
y_pred = y_pred.apply(round_to_nearest_5)

# Side-by-side table of predictions and ground truth. Both frames use a fresh
# 0..n-1 index (y_test.values drops the unique_id index) so they align by row
# position when concatenated.
pred = y_pred.to_frame(name='Predicted')
actual = pd.DataFrame(data=y_test.values, columns=['Actual'])
results = pd.concat([pred, actual], axis=1)

# Holdout metrics, computed on the rounded predictions.
print('R2: ', r2_score(y_true=y_test, y_pred=y_pred))
print('MSE: ', mean_squared_error(y_true=y_test, y_pred=y_pred))
print('MAE: ', mean_absolute_error(y_true=y_test, y_pred=y_pred))

print(results)


R2:  0.7599956520951466
MSE:  39.130434782608695
MAE:  3.9130434782608696
     Predicted  Actual
0         45.0    45.0
1         45.0    45.0
2         50.0    50.0
3         45.0    50.0
4        105.0    90.0
..         ...     ...
179       60.0    65.0
180       50.0    55.0
181       45.0    50.0
182       85.0    80.0
183       50.0    50.0

[184 rows x 2 columns]


## Save model

In [18]:
# Serialize the fitted final model to disk so it can be reused without retraining.
with open('../models/rf_v0.0.1.pkl', 'wb') as file:
    serialized_model = pickle.dumps(rf_final_model)
    file.write(serialized_model)

In [19]:
# Read the pickled model back from disk as a round-trip check.
# Unpickling executes arbitrary code, so only ever do this with files you
# trust (the same path is written by the save cell of this notebook).
with open('../models/rf_v0.0.1.pkl', 'rb') as file:
    serialized_model = file.read()
    loaded_model = pickle.loads(serialized_model)