In [34]:
import pandas as pd
import sklearn
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_validate
from sklearn.metrics import r2_score, max_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.base import BaseEstimator

import mlflow

import os
import pathlib
import joblib

current_path = os.getcwd()
abs_current_path = os.path.abspath(current_path)
abs_one_level_up = os.path.split(abs_current_path)[0]
abs_experiments_path = os.path.join(abs_one_level_up, "experiments")
experiments_folder_uri = pathlib.Path(abs_experiments_path).as_uri()

print(experiments_folder_uri)

mlflow.set_tracking_uri(experiments_folder_uri)




file:///Users/rusernyeoh/projects/datium_data_science_test/DDST/experiments


### Load cleaned dataset

In [29]:
cleaned_dataset_save_path = r"../datasets/cleaned_train.csv"

train_dataset = pd.read_csv(cleaned_dataset_save_path)

In [30]:
train_dataset.head()

Unnamed: 0,YearGroup,MonthGroup,SequenceNum,GearNum,DoorNum,EngineSize,EngineDescription,Cylinders,FuelCapacity,WheelBase,...,Branch_Tamworth (NSW),Branch_Townsville (QLD),Branch_Tullamarine (VIC),Branch_Welshpool (WA),SaleCategory_Auction,SaleCategory_Dealer Only Auction,SaleCategory_Fixed Price,SaleCategory_Pickles Online,SaleCategory_Special Fixed Price,SaleCategory_Tender
0,0.72,0.0,0.0,0.375,0.666667,0.496967,0.003628,0.333333,0.272109,0.519591,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,0.12,0.583333,0.341667,0.375,1.0,0.536141,0.003878,0.333333,0.238095,0.487905,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,0.88,0.0,0.05,0.5,1.0,0.288438,0.002127,0.111111,0.183673,0.432709,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.68,0.0,0.091667,0.375,1.0,0.496967,0.003628,0.333333,0.285714,0.527428,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.68,0.0,0.0,0.375,1.0,0.288438,0.002127,0.111111,0.217687,0.531516,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Start model training

Split features and targets

In [31]:
y_train = train_dataset['Sold_Amount']
x_train = train_dataset.drop('Sold_Amount', axis=1)

In [6]:
#verify dataset shape
print(y_train.shape)
print(x_train.shape)

(46855,)
(46855, 790)


In [7]:
x_train.head()

Unnamed: 0.1,Unnamed: 0,YearGroup,MonthGroup,SequenceNum,GearNum,DoorNum,EngineSize,EngineDescription,Cylinders,FuelCapacity,...,Branch_Tamworth (NSW),Branch_Townsville (QLD),Branch_Tullamarine (VIC),Branch_Welshpool (WA),SaleCategory_Auction,SaleCategory_Dealer Only Auction,SaleCategory_Fixed Price,SaleCategory_Pickles Online,SaleCategory_Special Fixed Price,SaleCategory_Tender
0,0,0.72,0.0,0.0,0.375,0.666667,0.496967,0.003628,0.333333,0.272109,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1,0.12,0.583333,0.341667,0.375,1.0,0.536141,0.003878,0.333333,0.238095,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,2,0.88,0.0,0.05,0.5,1.0,0.288438,0.002127,0.111111,0.183673,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,3,0.68,0.0,0.091667,0.375,1.0,0.496967,0.003628,0.333333,0.285714,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,4,0.68,0.0,0.0,0.375,1.0,0.288438,0.002127,0.111111,0.217687,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


Get benchmark model


In [8]:
linear_reg = LinearRegression()
score_r2 = r2_score
score = cross_validate(linear_reg, x_train, y_train, scoring=('r2', 'neg_mean_absolute_error'))


In [None]:
score


{'fit_time': array([2.40661597, 2.39453912, 5.3964529 , 2.24974298, 2.13588309]),
 'score_time': array([0.006001  , 0.00788879, 0.0151701 , 0.01510406, 0.01539874]),
 'test_r2': array([-1.82863711e+17, -3.11932899e+18, -4.86882249e+16, -1.31438616e+20,
        -3.25087925e+18]),
 'test_neg_mean_absolute_error': array([  -321345.66294246,  -1414651.08501717,   -118998.44597744,
        -10849904.23106647,  -1121785.96472795])}

### Train models

select model family

In [32]:
def train_score_model(estimator_name:str, estimator:BaseEstimator, x_train, y_train):
    """
        Train an input estimator and score
    """

    print(f"scoring {estimator_name}")
    score = cross_validate(estimator=estimator, X=x_train, y=y_train, scoring=('r2', 'neg_mean_absolute_error'))

    return {estimator_name: score}


create model ID

In [None]:
EXPERIMENT_NAME = "model_training_general"
EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)

train and log model metrics

In [21]:
candidate_estimators = {"random_forest":RandomForestRegressor(), "gradient_boosted":GradientBoostingRegressor(), "decision_tree":DecisionTreeRegressor(), "mlp":MLPRegressor()}
candidate_scores = {}

RUN_NAME = "Initial_run_3"


for estimator_name, estimator in candidate_estimators.items():
    with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME + f"_{estimator_name}") as run:
        res = train_score_model(estimator_name, estimator, x_train, y_train)
        for name, scores in res.items():
            for metric, val in scores.items():
                mlflow.log_metric(metric, val.mean())
        candidate_scores = candidate_scores | res

scoring random_forest
scoring gradient_boosted
scoring decision_tree
scoring mlp


In [20]:
with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name="test_run_2") as run:
        res = train_score_model("linear_reg", LinearRegression(), x_train, y_train)
        for name, scores in res.items():
            for metric, val in scores.items():
                mlflow.log_metric(metric, val.mean())

scoring linear_reg


mlflow ui --backend-store-uri file:///Users/rusernyeoh/projects/datium_data_science_test/DDST/experiments

Choose random forest based on metrics

Skipping hyperparameter tuning here

Train RandomForest

create model ID

In [38]:
EXPERIMENT_NAME = "model_training_hyperparameter"
current_experiment=dict(mlflow.get_experiment_by_name(EXPERIMENT_NAME))
EXPERIMENT_ID=current_experiment['experiment_id']
# EXPERIMENT_ID = mlflow.create_experiment(EXPERIMENT_NAME)


Traceback (most recent call last):
  File "/opt/homebrew/Caskroom/miniforge/base/envs/datium_test_py3.9/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 285, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/datium_test_py3.9/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 378, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/datium_test_py3.9/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1082, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/datium_test_py3.9/lib/python3.9/site-packages/mlflow/store/tracking/file_store.py", line 1075, in _read_helper
    result = read_yaml(root, file_name)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/datium_test_py3.9/lib/p

In [39]:
RUN_NAME = "selected_model"
with mlflow.start_run(experiment_id=EXPERIMENT_ID, run_name=RUN_NAME + f"random_forest") as run:
    mlflow.sklearn.autolog()
    rf = RandomForestRegressor()
    rf.fit(x_train, y_train)




In [40]:
model_save_path = r"../assets/model.save"
joblib.dump(rf, model_save_path)

['../assets/model.save']