In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.svm import LinearSVR
import pickle
from pathlib import Path
from mlflow.tracking import MlflowClient
import mlflow.pyfunc
import mlflow
%matplotlib inline

In [10]:
MLFLOW_TRACKING_URI = "sqlite:///mlflow.db"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
client = MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)
run_id = 'd766a676d45146e3912e38740b3b5036'

def read_data(filename):
    df = pd.read_csv(filename, index_col=None).drop(columns=['Unnamed: 0'])
    df.dropna(inplace=True)
    
    df.Location = df.Location.str.split('-').apply(lambda x : x[0])
    df['Price'] = df.Price.apply(lambda x : x/1000)
    
    return df

def preprocess(df, run_id ):
    categorical_features = ['Car Name', 'Fuel', 'Location', 'Drive', 'Type']
    
    if 'Price' in df.columns : 
        y = df['Price'].values
        X = df.drop(columns='Price')
    else : 
        X,y = df, None
        
    path = Path("preprocess/preprocess.bin")
    #if not path.isfile():
    client.download_artifacts(run_id=run_id, path='preprocess', dst_path='.')
    with open(path, 'rb') as f_in : 
        oe, scaler = pickle.load(f_in)
    X[categorical_features] = oe.transform( X[categorical_features] )
    X = scaler.transform(X)
    return X,y
    
def test_model(name, stage, X_test, y_test):
    # Load model as a PyFuncModel.
    model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{stage}")
    y_pred = model.predict(X_test)
    rmse = mean_squared_error(y_test, y_pred, squared=False)
    return {"rmse" : rmse}

In [11]:
df = read_data("cars_24_combined.csv")
X_test, y_test = preprocess(df, run_id)


  client.download_artifacts(run_id=run_id, path='preprocess', dst_path='.')


In [13]:
model_name = "xgboost"
stage = 'Production'

%time test_model(name=model_name, stage=stage, X_test=X_test, y_test=y_test)

 - numpy (current: 1.25.0, required: numpy==1.25.1)
 - typing-extensions (current: 4.7.0, required: typing-extensions==4.7.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


CPU times: total: 2.38 s
Wall time: 5.57 s


{'rmse': 35.29596994403189}

#### Deployment code

In [14]:
df = read_data("cars_24_combined.csv")
X_test, y_test = preprocess(df, run_id)

  client.download_artifacts(run_id=run_id, path='preprocess', dst_path='.')


In [16]:
pd.read_csv('cars_24_combined.csv').head(2)

Unnamed: 0.1,Unnamed: 0,Car Name,Year,Distance,Owner,Fuel,Location,Drive,Type,Price
0,0,Maruti S PRESSO,2022.0,3878,1,PETROL,HR-98,Manual,HatchBack,514000
1,1,Hyundai Xcent,2018.0,32041,1,PETROL,TN-22,Manual,Sedan,674000


In [21]:
def read_data(filename):
    df = pd.read_csv(filename, index_col=None).drop(columns=['Unnamed: 0'])
    df.dropna(inplace=True)
    
    df.Location = df.Location.str.split('-').apply(lambda x : x[0])
    df['Price'] = df.Price.apply(lambda x : x/1000)
    
    return df

def preprocess(df, run_id ):
    categorical_features = ['Car Name', 'Fuel', 'Location', 'Drive', 'Type']
    
    if 'Price' in df.columns : 
        y = df['Price'].values
        X = df.drop(columns='Price')
    else : 
        X,y = df, None
        
    path = Path("preprocess/preprocess.bin")
    #if not path.isfile():
    client.download_artifacts(run_id=run_id, path='preprocess', dst_path='.')
    print(X.head(2))
    with open(path, 'rb') as f_in : 
        oe, scaler = pickle.load(f_in)
    X[categorical_features] = oe.transform( X[categorical_features] )
    X = scaler.transform(X)
    return X,y

In [22]:
df = read_data("cars_24_combined.csv")
X_test, y_test = preprocess(df, run_id)

          Car Name    Year  Distance  Owner    Fuel Location   Drive   
0  Maruti S PRESSO  2022.0      3878      1  PETROL       HR  Manual  \
1    Hyundai Xcent  2018.0     32041      1  PETROL       TN  Manual   

        Type  
0  HatchBack  
1      Sedan  


  client.download_artifacts(run_id=run_id, path='preprocess', dst_path='.')


In [43]:
sample = pd.DataFrame({'Car Name' : [ 'Maruti S PRESSO' ],
              'Year' : [ 2022.0 ], 
              'Distance' : [ 3878 ], 
              'Owner' : [ 1 ],
              'Fuel' : [ 'PETROL' ], 
              'Location' : [ 'HR' ], 
              'Drive' : [ 'Manual' ], 
              'Type' : [ 'HatchBack' ]
             })

In [44]:
input_ = preprocess_(sample)

In [45]:
model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{stage}")
y_pred = model.predict(input_)

 - numpy (current: 1.25.0, required: numpy==1.25.1)
 - typing-extensions (current: 4.7.0, required: typing-extensions==4.7.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [46]:
y_pred[0]

510.5563

In [47]:
#pickle.dump(model, open('model.pkl', 'wb'))