In [2]:
import pandas as pd

In [3]:
# Show the pandas version this notebook runs against.
pd.__version__

'2.2.3'

In [4]:
!pip install pyarrow



In [5]:
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [6]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso,Ridge
from sklearn.metrics import root_mean_squared_error

In [7]:
import mlflow
# Store run metadata in a local SQLite file and group all runs of this
# notebook under a single named experiment.
mlflow.set_tracking_uri(uri="sqlite:///mlflow.db")
mlflow.set_experiment(experiment_name="nyc__taxi_experiment")

<Experiment: artifact_location='/workspaces/MLOps-zoomcamp/02_experiment_tracking/mlruns/1', creation_time=1742415919471, experiment_id='1', last_update_time=1742415919471, lifecycle_stage='active', name='nyc__taxi_experiment', tags={}>

#### Green taxi trip-2021

In [8]:
def read_dataframe(filename):
    """Load green-taxi trip records and prepare them for duration modelling.

    Parameters
    ----------
    filename : str
        Path or URL ending in ``.csv`` or ``.parquet``. CSV input has its
        ``lpep_pickup_datetime`` / ``lpep_dropoff_datetime`` columns parsed
        to datetimes; parquet input is used as read.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the trips whose ``duration`` (minutes between
        pickup and drop-off) lies in ``[1, 60]``, with ``PULocationID`` and
        ``DOLocationID`` converted to strings.

    Raises
    ------
    ValueError
        If ``filename`` has neither a ``.csv`` nor a ``.parquet`` extension.
    """
    if filename.endswith('.csv'):
        df = pd.read_csv(filename)
        df['lpep_dropoff_datetime'] = pd.to_datetime(df['lpep_dropoff_datetime'])
        df['lpep_pickup_datetime'] = pd.to_datetime(df['lpep_pickup_datetime'])
    elif filename.endswith('.parquet'):
        df = pd.read_parquet(filename)
    else:
        # Without this branch `df` would be unbound and the next statement
        # would fail with a confusing UnboundLocalError.
        raise ValueError(
            f"Unsupported file type for {filename!r}: expected '.csv' or '.parquet'"
        )

    # Vectorised timedelta -> minutes conversion (no per-row Python lambda).
    trip_time = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so the column
    # assignment below (and later ones by callers) does not write to a slice.
    df = df[(df['duration'] >= 1) & (df['duration'] <= 60)].copy()

    # Location IDs are identifiers, not quantities: keep them as strings so a
    # DictVectorizer one-hot encodes them.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df

In [9]:
# January 2021 is the training month, February 2021 the validation month.
df_train, df_val = map(
    read_dataframe,
    (
        'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet',
        'https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet',
    ),
)

In [10]:
# Row counts of the training and validation frames after filtering.
df_train.shape[0], df_val.shape[0]

(73908, 61921)

In [11]:
# Preview the first rows of the prepared training frame.
df_train.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,...,0.5,0.0,0.0,,0.3,6.8,2.0,1.0,0.0,3.933333
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,...,0.5,2.81,0.0,,0.3,16.86,1.0,1.0,2.75,8.75
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.0,...,0.5,1.0,0.0,,0.3,8.3,1.0,1.0,0.0,5.966667
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.0,...,0.5,0.0,0.0,,0.3,9.3,2.0,1.0,0.0,7.083333
7,2,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1.0,75,75,6.0,0.45,3.5,...,0.5,0.96,0.0,,0.3,5.76,1.0,1.0,0.0,2.316667


In [12]:
# Combine pickup and drop-off zone into one "<pickup>_<dropoff>" route feature.
df_train['PU_DO'] = df_train['PULocationID'].str.cat(df_train['DOLocationID'], sep='_')
df_val['PU_DO'] = df_val['PULocationID'].str.cat(df_val['DOLocationID'], sep='_')

In [13]:
# Model inputs: the combined pickup/drop-off route (categorical) and the
# trip distance (numeric).
categorical = ['PU_DO']
numerical = ['trip_distance']

dv = DictVectorizer()

# The vectorizer is fitted on the training records only; the validation
# records are encoded with that already-fitted vectorizer.
train_dicts = df_train.loc[:, categorical + numerical].to_dict('records')
X_train = dv.fit_transform(train_dicts)

val_dicts = df_val.loc[:, categorical + numerical].to_dict('records')
X_val = dv.transform(val_dicts)

In [14]:
# Regression target: the `duration` column computed in read_dataframe.
target = 'duration'
y_train, y_val = (frame[target].to_numpy() for frame in (df_train, df_val))

### Linear Regression

In [17]:
# Baseline: ordinary least squares on the vectorised features.
lr = LinearRegression().fit(X_train, y_train)

y_pred = lr.predict(X_val)

# Validation RMSE, in the same unit as the target.
root_mean_squared_error(y_true=y_val, y_pred=y_pred)

7.758715209663881

In [18]:
# Persist the fitted vectorizer together with the linear model so both can be
# restored as a pair. Create the output directory first: on a fresh checkout
# `models/` may not exist and open() would raise FileNotFoundError.
os.makedirs('models', exist_ok=True)
with open('models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

### Lasso Regression

In [19]:
# Train a Lasso model inside an MLflow run and record its hyperparameter,
# validation RMSE and the pickled (vectorizer, model) pair.
with mlflow.start_run():
    mlflow.set_tag("developer", "slv")

    alpha = 0.1
    mlflow.log_param("alpha", alpha)

    lasso_reg = Lasso(alpha)
    lasso_reg.fit(X_train, y_train)

    y_pred = lasso_reg.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    mlflow.log_metric("rmse", rmse)

    # Log the model this run actually trained. Previously the run attached
    # models/lin_reg.bin (the LinearRegression pickle), so the artifact did
    # not correspond to the alpha and rmse recorded above.
    os.makedirs("models", exist_ok=True)
    lasso_path = "models/lasso_reg.bin"
    with open(lasso_path, "wb") as f_out:
        pickle.dump((dv, lasso_reg), f_out)
    mlflow.log_artifact(local_path=lasso_path, artifact_path="models_pickle")

### XGBoost - Hyperparameter optimization

In [15]:
import xgboost as xgb
from hyperopt import fmin,tpe,hp,STATUS_OK,Trials
from hyperopt.pyll import scope


In [16]:
# Wrap the feature matrices and targets in XGBoost's DMatrix container.
train = xgb.DMatrix(data=X_train, label=y_train)
valid = xgb.DMatrix(data=X_val, label=y_val)

In [18]:
def objective(params):
    """Train one XGBoost booster for `params` and report its validation RMSE.

    Each call opens its own MLflow run, in which it logs the parameters, the
    RMSE on the `valid` DMatrix, the pickled `dv` vectorizer and the booster.

    Returns a dict with the RMSE under 'loss' and STATUS_OK under 'status',
    the shape hyperopt's fmin expects from an objective.
    """
    preprocessor_path = "models/preprocessor.b"

    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)

        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=50,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50,
        )

        rmse = root_mean_squared_error(y_val, booster.predict(valid))
        mlflow.log_metric("rmse", rmse)

        # Store the vectorizer next to the model so logged runs carry the
        # preprocessing that produced their inputs.
        with open(preprocessor_path, "wb") as f_out:
            pickle.dump(dv, f_out)
        mlflow.log_artifact(preprocessor_path, artifact_path="models.xgboost")

        mlflow.xgboost.log_model(booster, artifact_path="models_mlflow")

    return {'loss': rmse, 'status': STATUS_OK}
            


In [19]:
# Hyperopt search space for the XGBoost parameters.
# hp.loguniform(label, low, high) draws exp(uniform(low, high)), so for
# example learning_rate is sampled from [exp(-3), exp(0)].
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    # 'reg:linear' is a deprecated alias; 'reg:squarederror' is the current
    # name of the same squared-error regression objective.
    'objective': 'reg:squarederror',
    'seed': 42
}

In [20]:
# Run the TPE search over `search_space`, minimising the loss returned by
# `objective`. `rstate` seeds hyperopt's sampler so the sequence of sampled
# configurations is repeatable across runs.
result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=10,
    trials=Trials(),
    rstate=np.random.default_rng(42),
)

  0%|                                                 | 0/10 [00:00<?, ?trial/s, best loss=?]




[0]	validation-rmse:11.23800                                                                 
[1]	validation-rmse:10.41830                                                                 
[2]	validation-rmse:9.73496                                                                  
[3]	validation-rmse:9.16844                                                                  
[4]	validation-rmse:8.70253                                                                  
[5]	validation-rmse:8.31102                                                                  
[6]	validation-rmse:7.99435                                                                  
[7]	validation-rmse:7.73927                                                                  
[8]	validation-rmse:7.52495                                                                  
[9]	validation-rmse:7.35931                                                                  
[10]	validation-rmse:7.22000                                





 10%|██▍                     | 1/10 [01:13<11:04, 73.81s/trial, best loss: 6.520541607598035]




[0]	validation-rmse:7.55004                                                                  
[1]	validation-rmse:6.72080                                                                  
[2]	validation-rmse:6.56210                                                                  
[3]	validation-rmse:6.50399                                                                  
[4]	validation-rmse:6.47769                                                                  
[5]	validation-rmse:6.46800                                                                  
[6]	validation-rmse:6.46170                                                                  
[7]	validation-rmse:6.45349                                                                  
[8]	validation-rmse:6.44585                                                                  
[9]	validation-rmse:6.44219                                                                  
[10]	validation-rmse:6.43272                                





 20%|████▊                   | 2/10 [01:38<05:57, 44.66s/trial, best loss: 6.385060067565677]




[0]	validation-rmse:9.72924                                                                  
[1]	validation-rmse:8.29021                                                                  
[2]	validation-rmse:7.49467                                                                  
[3]	validation-rmse:7.06614                                                                  
[4]	validation-rmse:6.83316                                                                  
[5]	validation-rmse:6.70506                                                                  
[6]	validation-rmse:6.62811                                                                  
[7]	validation-rmse:6.58155                                                                  
[8]	validation-rmse:6.55187                                                                  
[9]	validation-rmse:6.53518                                                                  
[10]	validation-rmse:6.52072                                





 30%|███████▏                | 3/10 [01:56<03:49, 32.85s/trial, best loss: 6.385060067565677]




[0]	validation-rmse:11.71704                                                                 
[1]	validation-rmse:11.26114                                                                 
[2]	validation-rmse:10.84154                                                                 
[3]	validation-rmse:10.45693                                                                 
[4]	validation-rmse:10.10459                                                                 
[5]	validation-rmse:9.78221                                                                  
[6]	validation-rmse:9.48737                                                                  
[7]	validation-rmse:9.21817                                                                  
[8]	validation-rmse:8.97217                                                                  
[9]	validation-rmse:8.74936                                                                  
[10]	validation-rmse:8.54654                                





 40%|█████████▌              | 4/10 [02:25<03:06, 31.13s/trial, best loss: 6.385060067565677]




[0]	validation-rmse:10.08167                                                                 
[1]	validation-rmse:8.73778                                                                  
[2]	validation-rmse:7.91948                                                                  
[3]	validation-rmse:7.43324                                                                  
[4]	validation-rmse:7.14847                                                                  
[5]	validation-rmse:6.98112                                                                  
[6]	validation-rmse:6.87849                                                                  
[7]	validation-rmse:6.81399                                                                  
[8]	validation-rmse:6.77437                                                                  
[9]	validation-rmse:6.74670                                                                  
[10]	validation-rmse:6.72814                                





 50%|████████████            | 5/10 [02:37<02:01, 24.20s/trial, best loss: 6.385060067565677]




[0]	validation-rmse:7.17094                                                                  
[1]	validation-rmse:6.70872                                                                  
[2]	validation-rmse:6.62491                                                                  
[3]	validation-rmse:6.60138                                                                  
[4]	validation-rmse:6.59061                                                                  
[5]	validation-rmse:6.58346                                                                  
[6]	validation-rmse:6.57669                                                                  
[7]	validation-rmse:6.56105                                                                  
[8]	validation-rmse:6.55618                                                                  
[9]	validation-rmse:6.55245                                                                  
[10]	validation-rmse:6.54829                                





 60%|██████████████▍         | 6/10 [02:49<01:20, 20.18s/trial, best loss: 6.385060067565677]




[0]	validation-rmse:6.76962                                                                  
[1]	validation-rmse:6.58849                                                                  
[2]	validation-rmse:6.55715                                                                  
[3]	validation-rmse:6.54244                                                                  
[4]	validation-rmse:6.52551                                                                  
[5]	validation-rmse:6.51903                                                                  
[6]	validation-rmse:6.50597                                                                  
[7]	validation-rmse:6.49796                                                                  
[8]	validation-rmse:6.48576                                                                  
[9]	validation-rmse:6.48152                                                                  
[10]	validation-rmse:6.47478                                





 70%|████████████████▊       | 7/10 [03:07<00:58, 19.47s/trial, best loss: 6.385060067565677]




[0]	validation-rmse:9.55619                                                                  
[1]	validation-rmse:8.08001                                                                  
[2]	validation-rmse:7.30281                                                                  
[3]	validation-rmse:6.90230                                                                  
[4]	validation-rmse:6.69295                                                                  
[5]	validation-rmse:6.58623                                                                  
[6]	validation-rmse:6.52587                                                                  
[7]	validation-rmse:6.48547                                                                  
[8]	validation-rmse:6.46308                                                                  
[9]	validation-rmse:6.44478                                                                  
[10]	validation-rmse:6.43243                                





 80%|████████████████████▊     | 8/10 [03:54<00:56, 28.32s/trial, best loss: 6.3559243555761]




[0]	validation-rmse:10.38692                                                                 
[1]	validation-rmse:9.11741                                                                  
[2]	validation-rmse:8.26108                                                                  
[3]	validation-rmse:7.69171                                                                  
[4]	validation-rmse:7.31721                                                                  
[5]	validation-rmse:7.07249                                                                  
[6]	validation-rmse:6.90907                                                                  
[7]	validation-rmse:6.80046                                                                  
[8]	validation-rmse:6.72646                                                                  
[9]	validation-rmse:6.67390                                                                  
[10]	validation-rmse:6.63393                                





 90%|███████████████████████▍  | 9/10 [04:13<00:25, 25.32s/trial, best loss: 6.3559243555761]




[0]	validation-rmse:9.31573                                                                  
[1]	validation-rmse:7.91418                                                                  
[2]	validation-rmse:7.27818                                                                  
[3]	validation-rmse:6.99363                                                                  
[4]	validation-rmse:6.86013                                                                  
[5]	validation-rmse:6.78886                                                                  
[6]	validation-rmse:6.74954                                                                  
[7]	validation-rmse:6.72639                                                                  
[8]	validation-rmse:6.70606                                                                  
[9]	validation-rmse:6.69219                                                                  
[10]	validation-rmse:6.68523                                





100%|█████████████████████████| 10/10 [04:36<00:00, 27.68s/trial, best loss: 6.3559243555761]


[91]	validation-rmse:6.61585                                                                
[92]	validation-rmse:6.61567                                                                
[93]	validation-rmse:6.61508                                                                
[94]	validation-rmse:6.61449                                                                
[95]	validation-rmse:6.61426                                                                
[96]	validation-rmse:6.61440                                                                
[97]	validation-rmse:6.61399                                                                
[98]	validation-rmse:6.61311                                                                
[99]	validation-rmse:6.61312                                                                
 90%|████████████████████▋  | 9/10 [04:01<00:15, 15.54s/trial, best loss: 6.337907152115211]




[0]	validation-rmse:7.75121                                                                 
[1]	validation-rmse:6.85662                                                                 
[2]	validation-rmse:6.66708                                                                 
[3]	validation-rmse:6.61213                                                                 
[4]	validation-rmse:6.59282                                                                 
[5]	validation-rmse:6.57541                                                                 
[6]	validation-rmse:6.56804                                                                 
[7]	validation-rmse:6.56145                                                                 
[8]	validation-rmse:6.55657                                                                 
[9]	validation-rmse:6.55288                                                                 
[10]	validation-rmse:6.54839                                          

[88]	validation-rmse:6.46464                                                                
[89]	validation-rmse:6.46371                                                                
[90]	validation-rmse:6.46321                                                                
[91]	validation-rmse:6.46334                                                                
[92]	validation-rmse:6.46378                                                                
[93]	validation-rmse:6.46383                                                                
[94]	validation-rmse:6.46393                                                                
[95]	validation-rmse:6.46399                                                                
[96]	validation-rmse:6.46317                                                                
[97]	validation-rmse:6.46289                                                                
[98]	validation-rmse:6.46392                                          

## Select the hyperparameters from the best model

In [None]:
# Hard-coded hyperparameters, presumably copied from the best trial of the
# search above — TODO confirm against the MLflow UI.
best_params = {
    'learning_rate': 0.47727240630121637,
    'max_depth': 80,
    'min_child_weight': 1.0018416171991325,
    # 'reg:linear' is a deprecated alias of the squared-error objective.
    'objective': 'reg:squarederror',
    'reg_alpha': 0.035975037379223605,
    'reg_lambda': 0.0030249775225013233,
    'seed': 42
}

# Autologging creates the MLflow run and records params, metrics and the
# trained model for the xgb.train call below.
mlflow.xgboost.autolog()

# NOTE(review): with num_boost_round=10 and early_stopping_rounds=50, early
# stopping can never trigger; raise num_boost_round if it is meant to apply.
booster = xgb.train(
    params=best_params,
    dtrain=train,
    num_boost_round=10,
    evals=[(valid, 'validation')],
    early_stopping_rounds=50,
)

2025/03/19 21:54:42 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '994be2d562c44c04a643358d861f832f', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current xgboost workflow


## Making predictions from the logged model

In [23]:
# URI of the model artifact ("models_mlflow") stored under a specific MLflow run.
logged_model = 'runs:/37ae44a196b84617b79979721a4b700f/models_mlflow'

# Load it back in its native XGBoost flavour.
xgb_model = mlflow.xgboost.load_model(model_uri=logged_model)

In [25]:
# Score the validation DMatrix with the reloaded model and show the first ten predictions.
y_pred=xgb_model.predict(valid)
y_pred[:10]

array([13.930652,  6.461905, 21.47559 , 24.854925,  9.170853, 17.101048,
       14.701725,  9.104583,  9.281493, 18.616581], dtype=float32)

### MLflow-autologging

In [None]:
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,ExtraTreesRegressor
from sklearn.svm import LinearSVR

# Train several scikit-learn regressors with default settings, one MLflow run
# each. Autologging records each estimator's parameters and model; the
# validation RMSE is logged explicitly.
mlflow.sklearn.autolog()
for model_class in (RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, LinearSVR):
    with mlflow.start_run():
        # Attach the pickled vectorizer so each run carries its preprocessing.
        mlflow.log_artifact("models/preprocessor.b", artifact_path="preprocessor")
        mlmodel = model_class()
        mlmodel.fit(X_train, y_train)
        # Estimators expose `predict`; the previous `pred` call raised
        # AttributeError before any metric was logged.
        y_pred = mlmodel.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        mlflow.log_metric("rmse", rmse)