In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR
import pickle
from pathlib import Path
from mlflow.tracking import MlflowClient
import mlflow.pyfunc
import mlflow
%matplotlib inline

In [14]:
# Run whose logged 'preprocess' artifacts (pickled encoder/scaler pair) are reused below.
run_id = "9ea7408ce7224421b1e72937f5ff18e7"

# Local SQLite-backed MLflow tracking store. Both the explicit client and the
# module-level mlflow API are pointed at the same database.
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

def read_data(filename):
    """Load the car-listings CSV and normalise it for evaluation.

    Steps:
      1. Read the CSV and discard the ``Unnamed: 0`` column (typically a saved
         DataFrame index) when it is present.
      2. Drop every row that contains a missing value.
      3. Keep only the part of ``Location`` that precedes the first ``-``.
      4. Rescale ``Price`` by dividing it by 1000.

    Parameters
    ----------
    filename : str or path-like
        CSV file to read; forwarded to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame. Surviving rows keep their original row labels and the
        column order of the file is preserved.
    """
    df = (
        pd.read_csv(filename, index_col=None)
        # errors='ignore' keeps the loader usable on files that were saved
        # without the stray index column instead of failing with a KeyError.
        .drop(columns=['Unnamed: 0'], errors='ignore')
        .dropna()
    )

    # Vectorised accessors replace the per-element lambdas; assign() returns a
    # new frame, so nothing is modified in place.
    return df.assign(
        Location=df['Location'].str.split('-').str[0],
        Price=df['Price'] / 1000,
    )

def preprocess(df, run_id):
    """Encode and scale features with the transformers logged under an MLflow run.

    The run's ``preprocess`` artifact directory is downloaded into the current
    working directory through the module-level ``client``, and
    ``preprocess/preprocess.bin`` is unpickled into an ``(encoder, scaler)``
    pair. The encoder is applied to the categorical columns, then the scaler
    to the whole feature frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. A ``Price`` column, when present, is split off as the
        target. The caller's frame is not modified.
    run_id : str
        MLflow run whose ``preprocess`` artifacts are fetched.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the output of ``scaler.transform`` and ``y``
        holds the ``Price`` values, or ``None`` when ``df`` has no ``Price``
        column.
    """
    categorical_features = ['Car Name', 'Fuel', 'Location', 'Drive', 'Type']

    if 'Price' in df.columns:
        y = df['Price'].values
        X = df.drop(columns='Price')
    else:
        # Work on a copy: the encoded columns are written back into X below,
        # and aliasing df here would silently overwrite the caller's data.
        X, y = df.copy(), None

    # Always re-download so the transformers match run_id even when a file
    # from a different run is already on disk.
    client.download_artifacts(run_id=run_id, path='preprocess', dst_path='.')
    path = Path("preprocess/preprocess.bin")

    # NOTE(security): pickle.load can execute arbitrary code; only use this
    # with artifacts from a trusted tracking store.
    with open(path, 'rb') as f_in:
        oe, scaler = pickle.load(f_in)

    X[categorical_features] = oe.transform(X[categorical_features])
    X = scaler.transform(X)
    return X, y
    
def test_model(name, stage, X_test, y_test):
    """Score a registered MLflow model on a held-out set.

    Parameters
    ----------
    name : str
        Registered model name used in the ``models:/<name>/<stage>`` URI.
    stage : str
        Registry stage to load (for example ``'Production'``).
    X_test : array-like
        Preprocessed features passed to the model's ``predict``.
    y_test : array-like
        True target values (single target).

    Returns
    -------
    dict
        ``{"rmse": <root mean squared error>}``.
    """
    # Build the URI from the `name` argument. The previous body read a global
    # `model_name`, so the parameter was ignored and the call failed with a
    # NameError whenever that global had not been defined.
    model = mlflow.pyfunc.load_model(model_uri=f"models:/{name}/{stage}")
    y_pred = model.predict(X_test)
    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn releases
    # deprecate and remove; the two are identical for a single target.
    rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
    return {"rmse": rmse}

In [15]:
# Load the combined listings file, then encode/scale it with the run's transformers.
df = read_data(filename="cars_24_combined.csv")
X_test, y_test = preprocess(df=df, run_id=run_id)


  client.download_artifacts(run_id=run_id, path='preprocess', dst_path='.')


In [17]:
# Registered model name and registry stage to evaluate.
model_name = "sklearn"
stage = 'Production'

# %time reports CPU and wall-clock time for this single evaluation call.
%time test_model(name=model_name, stage=stage, X_test=X_test, y_test=y_test)

 - mlflow (current: 2.4.1, required: mlflow==2.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: total: 438 ms
Wall time: 452 ms


{'rmse': 52.427804245239926}