## Experiment tracking using MlFlow Part 1

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR
import pickle
%matplotlib inline
# dataset source : https://www.kaggle.com/datasets/ujjwalwadhwa/cars24com-used-cars-dataset

In [2]:
df = pd.read_csv("cars_24_combined.csv", index_col=None).drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,Car Name,Year,Distance,Owner,Fuel,Location,Drive,Type,Price
0,Maruti S PRESSO,2022.0,3878,1,PETROL,HR-98,Manual,HatchBack,514000
1,Hyundai Xcent,2018.0,32041,1,PETROL,TN-22,Manual,Sedan,674000
2,Tata Safari,2021.0,96339,1,DIESEL,TS-08,Automatic,SUV,1952000
3,Maruti Vitara Brezza,2019.0,51718,1,DIESEL,WB-24,Manual,SUV,690000
4,Tata Tiago,2021.0,19811,1,PETROL,HR-51,Manual,HatchBack,526000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8015 entries, 0 to 8014
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Car Name  8014 non-null   object 
 1   Year      8014 non-null   float64
 2   Distance  8015 non-null   int64  
 3   Owner     8015 non-null   int64  
 4   Fuel      8015 non-null   object 
 5   Location  7802 non-null   object 
 6   Drive     8015 non-null   object 
 7   Type      8015 non-null   object 
 8   Price     8015 non-null   int64  
dtypes: float64(1), int64(3), object(5)
memory usage: 563.7+ KB


#### Checking for Null values

In [4]:
df.isna().sum()

Car Name      1
Year          1
Distance      0
Owner         0
Fuel          0
Location    213
Drive         0
Type          0
Price         0
dtype: int64

Location is an important factor in determining the price as the price of the same vehicle varies with the state it was bought in (owing to taxation rules) and therefore imputing the missing values using mode or other similar imputation methods won't be a good ideas as it might distort the patterns that the ML model will learn. So, we will drop the null item rows.

In [5]:
print(df.shape)
df.dropna(inplace=True)
print(f'Shape of dataset after dropping the null item rows {df.shape}')

(8015, 9)
Shape of dataset after dropping the null item rows (7801, 9)


### Data Cleaning and Featurization

#### Clean and featurize 'Location' column

In [6]:
len(dict(df.Location.value_counts()))

436

As we can see the Location feature is a categorical feature but still has too many different/unique values. One way to resolve this is to consider only state rather than the district of the vehicle. As the price of the vehicle is almost the same across different cities in the state, it is okay if we drop the last two digits (indicating district) and consider only the first two letter (indicating the state). 

In [7]:
df.Location = df.Location.str.split('-').apply(lambda x : x[0])

In [8]:
len(dict(df.Location.value_counts()))

17

Now, there are only 17 different values in this categorical column and can be easily label encoded.

In [9]:
# For the price column, we see that the prices are in multiple of 1000. So, let's divide them all by thousand and keep it 
# in terms of 1000
df['Price'] = df.Price.apply(lambda x : x/1000)

In [18]:
df.columns

Index(['Car Name', 'Year', 'Distance', 'Owner', 'Fuel', 'Location', 'Drive',
       'Type', 'Price'],
      dtype='object')

## Vectorization

In [12]:
categorical_features = ['Car Name', 'Fuel', 'Location', 'Drive', 'Type']
target_feature = 'Price'

X = df.drop(columns='Price')
y = df['Price'].values

oe = OrdinalEncoder()
oe.fit(X=X[categorical_features])
X[categorical_features] = oe.transform( X[categorical_features] )

scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [13]:
X_train, X_ref, y_train, y_ref = train_test_split(X_train, y_train, test_size=0.1, random_state=42)

In [17]:
with open('data/dataset.bin', 'wb') as f_out :
    pickle.dump( (X, y), f_out)

In [15]:
with open('data/train.bin', 'wb') as f_out :
    pickle.dump( (X_train, y_train), f_out)

with open('data/test.bin', 'wb' ) as f_out : 
    pickle.dump( (X_test, y_test), f_out) 

with open('data/reference_data.bin', 'wb' ) as f_out :
    pickle.dump( (X_ref, y_ref), f_out)

#with open('preprocess/preprocess.bin', 'wb') as f_out : 
#    pickle.dump( (oe, scaler), f_out)

In [12]:
import mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("first_experiment")

2023/07/20 03:37:53 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2023/07/20 03:37:53 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
2023/07/20 03:37:53 INFO mlflow.tracking.fluent: Experiment with name 'first_experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///D:/0 mlopscamp/mlruns/1', creation_time=1689838673556, experiment_id='1', last_update_time=1689838673556, lifecycle_stage='active', name='first_experiment', tags={}>

### SGDRegressor

In [23]:
def objective_linReg(search_space):
    with mlflow.start_run():
        mlflow.set_tag("developer", "rishikesh")
        
        linReg = SGDRegressor(**search_space, random_state=101)
        linReg.fit(X_train, y_train)

        pred_train = linReg.predict(X_train)
        pred_test = linReg.predict(X_test)

        train_error = mean_squared_error(y_train, pred_train)
        test_error = mean_squared_error(y_test, pred_test)
        
        mlflow.log_metric("train_mse", train_error)
        mlflow.log_metric("test_mse", test_error)
        mlflow.log_artifact("preprocess/preprocess.bin", artifact_path="preprocess")
        mlflow.sklearn.log_model(linReg, artifact_path = "models")

    return { 'loss' : train_error, 'status': STATUS_OK }
    
search_space = {
    'penalty' : hp.choice( "penalty", ['l2', 'l1', 'elasticnet']),
    'alpha' : hp.loguniform('alpha', -5, -1),
}

best_params = fmin(
    fn = objective_linReg,
    space = search_space,
    algo = tpe.suggest,
    max_evals = 25
)

100%|█████████████████████████████████████████████████| 25/25 [01:26<00:00,  3.47s/trial, best loss: 36871.32817847238]


In [24]:
best_params

{'alpha': 0.0685546505156344, 'penalty': 2}

In [None]:
# with open('models/linReg.bin', 'wb') as f_out : 
#    pickle.dump( (oe, scaler, linReg), f_out)

### XGBoost 

In [26]:
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test, label=y_test)

mlflow.xgboost.autolog()

def objective_xgboost(params):
    with mlflow.start_run():
        booster = xgb.train(
            params=params,
            dtrain=xg_train,
            num_boost_round=100,
            evals=[(xg_test, 'testing')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(xg_test)
        mse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_artifact("preprocess/preprocess.bin", artifact_path="preprocess")
    return {'loss': mse, 'status': STATUS_OK}

search_space_1 = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 50, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:squarederror',
    'seed': 42
}

best_result = fmin(
    fn=objective_xgboost,
    space=search_space_1,
    algo=tpe.suggest,
    max_evals=25
)



[0]	testing-rmse:227.43897                                                                                             
[1]	testing-rmse:132.55855                                                                                             
[2]	testing-rmse:117.93487                                                                                             
[3]	testing-rmse:120.31805                                                                                             
[4]	testing-rmse:120.04165                                                                                             
[5]	testing-rmse:121.80466                                                                                             
[6]	testing-rmse:122.39184                                                                                             
[7]	testing-rmse:122.58770                                                                                             
[8]	testing-rmse:123.74518              

[11]	testing-rmse:118.61476                                                                                            
[12]	testing-rmse:118.95464                                                                                            
[13]	testing-rmse:119.04586                                                                                            
[14]	testing-rmse:119.21656                                                                                            
[15]	testing-rmse:118.46714                                                                                            
[16]	testing-rmse:118.01149                                                                                            
[17]	testing-rmse:119.51019                                                                                            
[18]	testing-rmse:120.23491                                                                                            
[19]	testing-rmse:121.39043             

[19]	testing-rmse:112.89515                                                                                            
[20]	testing-rmse:112.11991                                                                                            
[21]	testing-rmse:111.34286                                                                                            
[22]	testing-rmse:111.04828                                                                                            
[23]	testing-rmse:110.50853                                                                                            
[24]	testing-rmse:110.43421                                                                                            
[25]	testing-rmse:110.13009                                                                                            
[26]	testing-rmse:110.25088                                                                                            
[27]	testing-rmse:110.05623             

[87]	testing-rmse:109.78341                                                                                            
[88]	testing-rmse:109.78298                                                                                            
[89]	testing-rmse:109.79731                                                                                            
[90]	testing-rmse:109.79604                                                                                            
[91]	testing-rmse:109.79262                                                                                            
[92]	testing-rmse:109.79024                                                                                            
[93]	testing-rmse:109.80181                                                                                            
[94]	testing-rmse:109.79730                                                                                            
[95]	testing-rmse:109.79619             

[54]	testing-rmse:127.35711                                                                                            
[55]	testing-rmse:127.35748                                                                                            
[56]	testing-rmse:127.35800                                                                                            
[57]	testing-rmse:127.35810                                                                                            
[58]	testing-rmse:127.35825                                                                                            
[59]	testing-rmse:127.35869                                                                                            
[60]	testing-rmse:127.35864                                                                                            
[61]	testing-rmse:127.35875                                                                                            
[62]	testing-rmse:127.35854             

[52]	testing-rmse:116.37795                                                                                            
[53]	testing-rmse:116.12064                                                                                            
[54]	testing-rmse:115.81579                                                                                            
[55]	testing-rmse:115.51529                                                                                            
[56]	testing-rmse:115.30084                                                                                            
[57]	testing-rmse:115.10816                                                                                            
[58]	testing-rmse:114.94559                                                                                            
[59]	testing-rmse:114.86816                                                                                            
[60]	testing-rmse:114.76506             

[18]	testing-rmse:127.60654                                                                                            
[19]	testing-rmse:127.60658                                                                                            
[20]	testing-rmse:127.60658                                                                                            
[21]	testing-rmse:127.60659                                                                                            
[22]	testing-rmse:127.60660                                                                                            
[23]	testing-rmse:127.60660                                                                                            
[24]	testing-rmse:127.60662                                                                                            
[25]	testing-rmse:127.60662                                                                                            
[26]	testing-rmse:127.60664             

[32]	testing-rmse:141.23592                                                                                            
[33]	testing-rmse:137.81115                                                                                            
[34]	testing-rmse:134.69748                                                                                            
[35]	testing-rmse:131.90263                                                                                            
[36]	testing-rmse:129.36074                                                                                            
[37]	testing-rmse:127.20934                                                                                            
[38]	testing-rmse:125.06207                                                                                            
[39]	testing-rmse:123.00783                                                                                            
[40]	testing-rmse:121.27398             

[0]	testing-rmse:297.21624                                                                                             
[1]	testing-rmse:169.32431                                                                                             
[2]	testing-rmse:125.41985                                                                                             
[3]	testing-rmse:115.67050                                                                                             
[4]	testing-rmse:115.01430                                                                                             
[5]	testing-rmse:113.09234                                                                                             
[6]	testing-rmse:113.17452                                                                                             
[7]	testing-rmse:114.85368                                                                                             
[8]	testing-rmse:115.56712              

[8]	testing-rmse:115.66619                                                                                             
[9]	testing-rmse:115.65777                                                                                             
[10]	testing-rmse:115.48034                                                                                            
[11]	testing-rmse:116.28645                                                                                            
[12]	testing-rmse:116.46110                                                                                            
[13]	testing-rmse:116.88826                                                                                            
[14]	testing-rmse:117.20411                                                                                            
[15]	testing-rmse:117.56477                                                                                            
[16]	testing-rmse:118.28801             

[17]	testing-rmse:123.10685                                                                                            
[18]	testing-rmse:123.10134                                                                                            
[19]	testing-rmse:123.10610                                                                                            
[20]	testing-rmse:123.09772                                                                                            
[21]	testing-rmse:123.08095                                                                                            
[22]	testing-rmse:123.06898                                                                                            
[23]	testing-rmse:123.06572                                                                                            
[24]	testing-rmse:123.05839                                                                                            
[25]	testing-rmse:123.05210             

[30]	testing-rmse:105.20379                                                                                            
[31]	testing-rmse:104.79165                                                                                            
[32]	testing-rmse:104.69430                                                                                            
[33]	testing-rmse:104.63574                                                                                            
[34]	testing-rmse:104.53609                                                                                            
[35]	testing-rmse:103.97344                                                                                            
[36]	testing-rmse:103.46939                                                                                            
[37]	testing-rmse:103.71985                                                                                            
[38]	testing-rmse:103.82303             

[98]	testing-rmse:103.14288                                                                                            
[99]	testing-rmse:103.22115                                                                                            
[0]	testing-rmse:576.11113                                                                                             
[1]	testing-rmse:530.10811                                                                                             
[2]	testing-rmse:488.51817                                                                                             
[3]	testing-rmse:450.54163                                                                                             
[4]	testing-rmse:415.59998                                                                                             
[5]	testing-rmse:384.44487                                                                                             
[6]	testing-rmse:355.86233              

[64]	testing-rmse:103.47937                                                                                            
[65]	testing-rmse:103.22238                                                                                            
[66]	testing-rmse:103.30683                                                                                            
[67]	testing-rmse:103.38722                                                                                            
[68]	testing-rmse:103.31667                                                                                            
[69]	testing-rmse:103.47395                                                                                            
[70]	testing-rmse:103.59210                                                                                            
[71]	testing-rmse:103.58934                                                                                            
[72]	testing-rmse:103.53970             

[30]	testing-rmse:182.24754                                                                                            
[31]	testing-rmse:178.06789                                                                                            
[32]	testing-rmse:174.33465                                                                                            
[33]	testing-rmse:170.74681                                                                                            
[34]	testing-rmse:167.80961                                                                                            
[35]	testing-rmse:165.10162                                                                                            
[36]	testing-rmse:162.28341                                                                                            
[37]	testing-rmse:160.01635                                                                                            
[38]	testing-rmse:157.85205             

[98]	testing-rmse:117.63573                                                                                            
[99]	testing-rmse:117.54475                                                                                            
[0]	testing-rmse:514.36756                                                                                             
[1]	testing-rmse:424.49046                                                                                             
[2]	testing-rmse:351.45339                                                                                             
[3]	testing-rmse:293.41594                                                                                             
[4]	testing-rmse:247.52935                                                                                             
[5]	testing-rmse:211.90116                                                                                             
[6]	testing-rmse:184.18362              

[64]	testing-rmse:107.97402                                                                                            
[65]	testing-rmse:107.95408                                                                                            
[66]	testing-rmse:107.92372                                                                                            
[67]	testing-rmse:107.96125                                                                                            
[68]	testing-rmse:107.95715                                                                                            
[69]	testing-rmse:107.99641                                                                                            
[70]	testing-rmse:107.97947                                                                                            
[0]	testing-rmse:563.82594                                                                                             
[1]	testing-rmse:507.49975              

[59]	testing-rmse:112.79961                                                                                            
[60]	testing-rmse:112.85984                                                                                            
[61]	testing-rmse:112.89679                                                                                            
[62]	testing-rmse:112.85108                                                                                            
[63]	testing-rmse:112.88393                                                                                            
[64]	testing-rmse:112.87981                                                                                            
[65]	testing-rmse:112.88515                                                                                            
[66]	testing-rmse:112.89327                                                                                            
[67]	testing-rmse:112.90508             

[35]	testing-rmse:114.87467                                                                                            
[36]	testing-rmse:114.84773                                                                                            
[37]	testing-rmse:114.81913                                                                                            
[38]	testing-rmse:114.83373                                                                                            
[39]	testing-rmse:114.82048                                                                                            
[40]	testing-rmse:114.80535                                                                                            
[41]	testing-rmse:114.81123                                                                                            
[42]	testing-rmse:114.82182                                                                                            
[43]	testing-rmse:114.82195             

[10]	testing-rmse:171.78583                                                                                            
[11]	testing-rmse:158.64101                                                                                            
[12]	testing-rmse:147.96262                                                                                            
[13]	testing-rmse:139.17620                                                                                            
[14]	testing-rmse:132.30764                                                                                            
[15]	testing-rmse:126.57196                                                                                            
[16]	testing-rmse:121.76436                                                                                            
[17]	testing-rmse:117.92944                                                                                            
[18]	testing-rmse:115.27628             

[78]	testing-rmse:106.65102                                                                                            
[79]	testing-rmse:106.64472                                                                                            
[80]	testing-rmse:106.64597                                                                                            
[81]	testing-rmse:106.56550                                                                                            
[82]	testing-rmse:106.53865                                                                                            
[83]	testing-rmse:106.64850                                                                                            
[84]	testing-rmse:106.64918                                                                                            
[0]	testing-rmse:568.34194                                                                                             
[1]	testing-rmse:516.21305              

[59]	testing-rmse:108.86092                                                                                            
[60]	testing-rmse:108.96103                                                                                            
[61]	testing-rmse:108.95235                                                                                            
[62]	testing-rmse:109.01915                                                                                            
[63]	testing-rmse:109.06922                                                                                            
[64]	testing-rmse:109.11728                                                                                            
[65]	testing-rmse:109.16204                                                                                            
[66]	testing-rmse:109.15134                                                                                            
[67]	testing-rmse:109.15323             

[31]	testing-rmse:99.49665                                                                                             
[32]	testing-rmse:99.64548                                                                                             
[33]	testing-rmse:99.70318                                                                                             
[34]	testing-rmse:99.32660                                                                                             
[35]	testing-rmse:99.10072                                                                                             
[36]	testing-rmse:99.39130                                                                                             
[37]	testing-rmse:99.40063                                                                                             
[38]	testing-rmse:99.77783                                                                                             
[39]	testing-rmse:99.68254              

[99]	testing-rmse:98.57517                                                                                             
[0]	testing-rmse:419.95502                                                                                             
[1]	testing-rmse:286.10300                                                                                             
[2]	testing-rmse:205.97635                                                                                             
[3]	testing-rmse:159.78771                                                                                             
[4]	testing-rmse:135.76155                                                                                             
[5]	testing-rmse:123.34659                                                                                             
[6]	testing-rmse:117.88899                                                                                             
[7]	testing-rmse:115.61911              

[3]	testing-rmse:171.65783                                                                                             
[4]	testing-rmse:142.11002                                                                                             
[5]	testing-rmse:125.47941                                                                                             
[6]	testing-rmse:118.37769                                                                                             
[7]	testing-rmse:115.39952                                                                                             
[8]	testing-rmse:114.88783                                                                                             
[9]	testing-rmse:114.43159                                                                                             
[10]	testing-rmse:113.83263                                                                                            
[11]	testing-rmse:113.07592             

[71]	testing-rmse:114.19780                                                                                            
[0]	testing-rmse:434.97865                                                                                             
[1]	testing-rmse:309.82702                                                                                             
[2]	testing-rmse:226.99681                                                                                             
[3]	testing-rmse:175.96250                                                                                             
[4]	testing-rmse:144.83731                                                                                             
[5]	testing-rmse:128.11080                                                                                             
[6]	testing-rmse:119.72815                                                                                             
[7]	testing-rmse:113.52459              

[65]	testing-rmse:106.43899                                                                                            
[66]	testing-rmse:106.51391                                                                                            
[67]	testing-rmse:106.44080                                                                                            
[68]	testing-rmse:106.46196                                                                                            
[69]	testing-rmse:106.48586                                                                                            
[70]	testing-rmse:106.45231                                                                                            
[71]	testing-rmse:106.49692                                                                                            
[72]	testing-rmse:106.61585                                                                                            
[73]	testing-rmse:106.66455             

[53]	testing-rmse:114.96122                                                                                            
[54]	testing-rmse:114.95325                                                                                            
[55]	testing-rmse:114.95356                                                                                            
[56]	testing-rmse:114.96094                                                                                            
[0]	testing-rmse:471.38592                                                                                             
[1]	testing-rmse:357.90229                                                                                             
[2]	testing-rmse:276.22777                                                                                             
[3]	testing-rmse:218.94194                                                                                             
[4]	testing-rmse:178.30780              

[62]	testing-rmse:106.40675                                                                                            
[63]	testing-rmse:106.38694                                                                                            
[64]	testing-rmse:106.58690                                                                                            
[65]	testing-rmse:106.64021                                                                                            
[66]	testing-rmse:106.66280                                                                                            
[67]	testing-rmse:106.77884                                                                                            
[68]	testing-rmse:106.91505                                                                                            
[69]	testing-rmse:106.84771                                                                                            
[70]	testing-rmse:107.00363             

[34]	testing-rmse:109.33873                                                                                            
[35]	testing-rmse:109.35407                                                                                            
[36]	testing-rmse:109.30369                                                                                            
[37]	testing-rmse:109.44246                                                                                            
[38]	testing-rmse:109.39213                                                                                            
[39]	testing-rmse:109.65219                                                                                            
[40]	testing-rmse:109.49934                                                                                            
[41]	testing-rmse:109.84385                                                                                            
[42]	testing-rmse:110.15832             

### Ensemble models

In [32]:
mlflow.sklearn.autolog()

for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor):
    with mlflow.start_run():
        mlflow.log_artifact("preprocess/preprocess.bin", artifact_path="preprocess")
        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)
        y_pred = mlmodel.predict(X_test)
        rmse = mean_squared_error(y_test, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)