In [3]:
# Show the installed scikit-learn version (presumably to compare it with the version that pickled model.bin — confirm).
!pip freeze | grep scikit-learn

In [2]:
# Print the version of the `python` executable found on the shell PATH.
!python -V

Python 3.12.10


In [13]:
import pickle
import pandas as pd

In [6]:
# model.bin holds a pickled 2-tuple that is unpacked into (dv, model).
# NOTE(review): unpickling runs arbitrary code embedded in the file, so only
# load a model.bin that comes from a trusted source.
with open('model.bin', 'rb') as model_file:
    dv, model = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
# Columns that are treated as categorical features and converted to strings.
categorical = ['PULocationID', 'DOLocationID']


def read_data(filename):
    """Read a trip parquet file and return the rows usable for scoring.

    Steps performed:
      1. Load ``filename`` with ``pd.read_parquet``.
      2. Add a ``duration`` column: drop-off minus pick-up time, in minutes.
      3. Keep only trips lasting between 1 and 60 minutes (both inclusive).
      4. For every column listed in the module-level ``categorical``:
         replace missing values with -1, cast to int, then to str.

    The returned frame keeps the row labels of the original file, so rows
    removed by the duration filter leave gaps in the index.
    """
    trips = pd.read_parquet(filename)

    elapsed = trips['tpep_dropoff_datetime'] - trips['tpep_pickup_datetime']
    trips['duration'] = elapsed.dt.total_seconds() / 60

    # between() is inclusive on both ends, i.e. 1 <= duration <= 60.
    trips = trips[trips['duration'].between(1, 60)].copy()

    as_labels = trips[categorical].fillna(-1).astype('int').astype('str')
    trips[categorical] = as_labels

    return trips

In [8]:
# The same month's file is also available remotely at:
#   https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet
# The local copy below is the one actually read (absolute path — adjust when running elsewhere).
input_file = '/workspaces/mlops-zoomcamp-1/datasets/raw/yellow_tripdata_2023-03.parquet'
df = read_data(input_file)

In [9]:
# One {column: value} dict per trip, restricted to the categorical columns.
dicts = df[categorical].to_dict(orient='records')
# Encode with the already-fitted vectorizer loaded from model.bin (transform only, no refit).
X_val = dv.transform(dicts)
# Model predictions for every trip; presumably durations in minutes, like read_data's 'duration' — confirm.
y_pred = model.predict(X_val)

### Q1

In [24]:
import numpy as np
# Q1: standard deviation of the predictions.
# Assumes y_pred is a NumPy array, for which np.std(y_pred) and y_pred.std() are the same call.
np.std(y_pred)

np.float64(6.247488852238703)

### Q2

In [10]:
year = 2023
month = 3

# ride_id is "<YYYY>/<MM>_<row label>", where the row label is the trip's index value in df.
ride_ids = f'{year:04d}/{month:02d}_' + df.index.astype('str')

# Build the result frame in one step: one row per prediction, in the same order as df.
df_result = pd.DataFrame({
    'ride_id': ride_ids,
    'predicted_duration': y_pred,
})

In [11]:
# Where the predictions are written (absolute path — adjust when running elsewhere).
output_file = '/workspaces/mlops-zoomcamp-1/datasets/04/result.parquet'

# Uncompressed parquet via pyarrow, without storing the DataFrame index.
parquet_options = dict(engine='pyarrow', compression=None, index=False)
df_result.to_parquet(output_file, **parquet_options)

In [12]:
# Show the size and modification time of the result parquet file.
!ls -larth /workspaces/mlops-zoomcamp-1/datasets/04/result.parquet

-rw-r--r-- 1 vscode vscode 66M May 30 13:04 /workspaces/mlops-zoomcamp-1/datasets/04/result.parquet


### Rough: send an HTTP request to the served model

In [None]:
import json
import requests

# Score a single row through the model server's /invocations endpoint.
#
# Assumes X_val is a CSR matrix, so X_val[1,:].toarray() has shape (1, n_features):
# .tolist()[0] is the flat feature list and the outer list makes the payload
# 2-D, i.e. [[val1, val2, ...]] (one inner list per row to score).
#
# Alternative payloads kept for reference:
# data = {"instances": X_val[1,:].toarray().tolist()}
# data = {"inputs": X_val[1,:].toarray().tolist()}
data = {"inputs": [X_val[1,:].toarray().tolist()[0]]}

res = requests.post(
    "http://127.0.0.1:1234/invocations",
    headers={"Content-Type": "application/json"},
    json=data,
    # requests applies no timeout by default; without one an unresponsive
    # server would block this cell forever.
    timeout=30,
)

# Last expression of the cell: show the HTTP status and the raw response body.
res.status_code, res.text

In [40]:
import requests

# The payload must be 2-D: [[val1, val2, ...]] (one inner list per row to score).
# X_val[1,:].toarray() is presumably shape (1, n_features) (CSR row slice), so
# .tolist() is already [[...]]; wrapping that directly in another list would send
# a 3-D payload ([[[...]]]). Take element [0] first, then wrap once.
data = {"inputs": [X_val[1,:].toarray().tolist()[0]]}

res = requests.post(
    "http://127.0.0.1:1234/invocations",
    headers={"Content-Type": "application/json"},
    json=data,
    # requests applies no timeout by default; without one an unresponsive
    # server would block this cell forever.
    timeout=30,
)

# Show the HTTP status and the raw response body so failures are visible.
res.status_code, res.text

# Documentation for serving
- [For serve & docker creation](https://mlflow.org/docs/latest/api_reference/cli.html#mlflow-models-serve)
- [tutorial](https://mlflow.org/docs/latest/deployment/deploy-model-to-kubernetes/tutorial)
- This command worked: `mlflow models serve --env-manager local -m "runs:/d9f0f7a395174a398eb6215fb9c74df5/model" -p 1234`
- Building a Docker image with the following command did NOT work: `mlflow models build-docker --model-uri "runs:/d9f0f7a395174a398eb6215fb9c74df5/model" --name "mlflow-model"`

### Rough: load model in notebook for inference

In [1]:
import mlflow.sklearn

# Load the scikit-learn flavor of the model logged under this MLflow run.
# NOTE(review): the recorded output of this cell is "MlflowException: Run ... not found";
# presumably the tracking URI was not pointing at the server that holds the run — confirm.
model_uri = "runs:/d9f0f7a395174a398eb6215fb9c74df5/model"
sk_model = mlflow.sklearn.load_model(model_uri)

# Once loaded, the model can score a pandas DataFrame:
# pandas_df = ...
# predictions = sk_model.predict(pandas_df)

MlflowException: Run 'd9f0f7a395174a398eb6215fb9c74df5' not found

In [5]:
import mlflow
import sys

# List runs from all experiments (not just the active one); the result is displayed as the cell output.
mlflow.search_runs(search_all_experiments=True)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.intercept,metrics.input_length,params.data_url,tags.mlflow.source.type,tags.mlflow.runName,tags.mlflow.log-model.history,tags.mlflow.source.name,tags.mlflow.user,tags.mlflow.source.git.commit
0,d9f0f7a395174a398eb6215fb9c74df5,366603160111467826,FINISHED,mlflow-artifacts:/366603160111467826/d9f0f7a39...,2025-05-28 21:43:36.024000+00:00,2025-05-28 21:44:23.333000+00:00,24.778603,3403766.0,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,shapeless-nightingale,"[{""run_id"": ""d9f0f7a395174a398eb6215fb9c74df5""...",/usr/local/lib/python3.12/site-packages/prefec...,root,
1,2af8e3d0dcdc4e7c90b3807e7d7b250e,366603160111467826,FINISHED,mlflow-artifacts:/366603160111467826/2af8e3d0d...,2025-05-28 21:33:30.436000+00:00,2025-05-28 21:34:17.195000+00:00,24.778603,3403766.0,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,hissing-bettong,"[{""run_id"": ""2af8e3d0dcdc4e7c90b3807e7d7b250e""...",/usr/local/lib/python3.12/site-packages/prefec...,root,
2,0b7985570bfa468e98496361b379d06f,366603160111467826,FINISHED,mlflow-artifacts:/366603160111467826/0b7985570...,2025-05-28 14:30:32.916000+00:00,2025-05-28 14:31:25.969000+00:00,24.778603,3403766.0,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,green-nyala,"[{""run_id"": ""0b7985570bfa468e98496361b379d06f""...",../03-orchestration/hw.py,vscode,5c623218d83f7dcf69f0c94bd4e2d5423e2efe5e
3,630182229a374d43bf539cd0b3429d0b,366603160111467826,FINISHED,mlflow-artifacts:/366603160111467826/630182229...,2025-05-28 14:28:04.457000+00:00,2025-05-28 14:28:55.408000+00:00,24.778603,3403766.0,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,cautious-spaniel,"[{""run_id"": ""630182229a374d43bf539cd0b3429d0b""...",hw.py,vscode,5c623218d83f7dcf69f0c94bd4e2d5423e2efe5e
4,d6bf16f23b3c45bc8b1e390b6f239dd1,366603160111467826,FINISHED,mlflow-artifacts:/366603160111467826/d6bf16f23...,2025-05-28 14:18:52.013000+00:00,2025-05-28 14:19:41.203000+00:00,24.778603,3403766.0,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,onyx-owl,"[{""run_id"": ""d6bf16f23b3c45bc8b1e390b6f239dd1""...",hw.py,vscode,5c623218d83f7dcf69f0c94bd4e2d5423e2efe5e
5,8ea24be2c9db4952bec1c0b19534fd0f,366603160111467826,FINISHED,mlflow-artifacts:/366603160111467826/8ea24be2c...,2025-05-28 14:02:02.572000+00:00,2025-05-28 14:03:08.080000+00:00,24.778603,3403766.0,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,bulky-asp,"[{""run_id"": ""8ea24be2c9db4952bec1c0b19534fd0f""...",hw.py,vscode,d275c9451d95adf5de1575435e32b7b4966ce732
