In [3]:
!pip freeze | grep scikit-learn

In [2]:
!python -V

Python 3.12.10


In [1]:
import pickle
import pandas as pd

In [2]:
with open('model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [3]:
categorical = ['PULocationID', 'DOLocationID']

def read_data(filename):
    df = pd.read_parquet(filename)
    
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df['duration'] = df.duration.dt.total_seconds() / 60

    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    df[categorical] = df[categorical].fillna(-1).astype('int').astype('str')
    
    return df

In [4]:
# df = read_data('https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-03.parquet')
df = read_data('/workspaces/mlops-zoomcamp-1/datasets/raw/yellow_tripdata_2023-03.parquet')

In [5]:
dicts = df[categorical].to_dict(orient='records')
X_val = dv.transform(dicts)
y_pred = model.predict(X_val)

#### Q1

In [6]:
import numpy as np
# np.std(y_pred)
y_pred.std()

np.float64(6.247488852238703)

### Q2

In [7]:
year=2023
month=3
df_result = pd.DataFrame()
df_result['ride_id'] = f'{year:04d}/{month:02d}_' + df.index.astype('str')
df_result['predicted_duration'] = y_pred

In [8]:
output_file = '/workspaces/mlops-zoomcamp-1/datasets/04/result.parquet'
df_result.to_parquet(
    output_file,
    engine='pyarrow',
    compression=None,
    index=False
)

In [9]:
!ls -larth /workspaces/mlops-zoomcamp-1/datasets/04/result.parquet

-rw-r--r-- 1 vscode vscode 66M Jun  3 06:20 /workspaces/mlops-zoomcamp-1/datasets/04/result.parquet


### Rough: send http request to model

In [None]:
X_val[1,:].toarray()

import json
import requests

# Assuming X_val is a predefined CSR matrix
# data = {"instances": X_val[1,:].toarray().tolist()}
# data = {"inputs": X_val[1,:].toarray().tolist()}
data = {"inputs": [X_val[1,:].toarray().tolist()[0]]}
res = requests.post("http://127.0.0.1:1234/invocations", 
                        headers={"Content-Type": "application/json"},
                        json=data)
res.status_code, res.text

In [40]:
import requests

data = {"inputs": [X_val[1,:].toarray().tolist()]}  # Ensure [[val1, val2, ...]]
res = requests.post("http://127.0.0.1:1234/invocations", 
                    headers={"Content-Type": "application/json"},
                    json=data)

# Documentaion for serving
- [For serve & docker creation](https://mlflow.org/docs/latest/api_reference/cli.html#mlflow-models-serve)
- [tutorial](https://mlflow.org/docs/latest/deployment/deploy-model-to-kubernetes/tutorial)
- this cmd worked `mlflow models serve --env-manager local -m "runs:/d9f0f7a395174a398eb6215fb9c74df5/model" -p 1234`
- creating docker image with following did NOT work `mlflow models build-docker --model-uri "runs:/d9f0f7a395174a398eb6215fb9c74df5/model" --name "mlflow-model"`

### Rough: load model in notebook for inference

In [8]:
import mlflow
mlflow.search_runs(search_all_experiments=True)

Unnamed: 0,run_id,experiment_id,status,artifact_uri,start_time,end_time,metrics.input_length,metrics.intercept,params.data_url,tags.mlflow.source.type,tags.mlflow.source.git.commit,tags.mlflow.runName,tags.mlflow.source.name,tags.mlflow.log-model.history,tags.mlflow.user
0,b2d02ff1bd444133a970e1ba4b7b340b,228318985062494346,FINISHED,mlflow-artifacts:/228318985062494346/b2d02ff1b...,2025-06-02 20:53:18.259000+00:00,2025-06-02 20:54:54.242000+00:00,3403766.0,24.778603,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,a89f2137712696440adc16967178bcb250465a4b,just-swift,hw.py,"[{""run_id"": ""b2d02ff1bd444133a970e1ba4b7b340b""...",vscode
1,482fd76e6ea440319c7e4eb7ddb03c2d,228318985062494346,FINISHED,mlflow-artifacts:/228318985062494346/482fd76e6...,2025-06-02 20:23:04.152000+00:00,2025-06-02 20:24:39.201000+00:00,3403766.0,24.778603,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,a89f2137712696440adc16967178bcb250465a4b,unbiased-chital,hw.py,"[{""run_id"": ""482fd76e6ea440319c7e4eb7ddb03c2d""...",vscode
2,b1a25f32db574bfebfbf4fbe69f89199,228318985062494346,FINISHED,mlflow-artifacts:/228318985062494346/b1a25f32d...,2025-06-02 20:12:49.861000+00:00,2025-06-02 20:14:30.355000+00:00,3403766.0,24.778603,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,a89f2137712696440adc16967178bcb250465a4b,horned-chachalaca,hw.py,"[{""run_id"": ""b1a25f32db574bfebfbf4fbe69f89199""...",vscode
3,564ec923ff2f4e6ea371db504b1b08fa,228318985062494346,FAILED,mlflow-artifacts:/228318985062494346/564ec923f...,2025-06-02 20:11:00.132000+00:00,2025-06-02 20:11:23.950000+00:00,3403766.0,,https://d37ci6vzurychx.cloudfront.net/trip-dat...,LOCAL,a89f2137712696440adc16967178bcb250465a4b,magnificent-gecko,hw.py,,vscode


In [27]:
import pandas as pd
import mlflow

logged_model = 'runs:/b2d02ff1bd444133a970e1ba4b7b340b/pipeline'

def transform_data(df: pd.DataFrame):
    
    df.tpep_dropoff_datetime = pd.to_datetime(df.tpep_dropoff_datetime)
    df.tpep_pickup_datetime = pd.to_datetime(df.tpep_pickup_datetime)
    df['duration'] = df.tpep_dropoff_datetime - df.tpep_pickup_datetime
    df.duration = df.duration.dt.total_seconds() / 60
    df = df[(df.duration >= 1) & (df.duration <= 60)]
    
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    X = df[categorical].to_dict(orient='records')

    return X

# Load model
loaded_model = mlflow.sklearn.load_model(logged_model)

# Load data
df = pd.read_parquet('/workspaces/mlops-zoomcamp-1/datasets/raw/yellow_tripdata_2023-03.parquet')
df.iloc[ 1:100,:]
X = transform_data(df)

# Predict
loaded_model.predict(X)

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[categorical] = df[categorical].astype(str)


array([16.67233016, 28.66815635, 12.53358183, ..., 11.64097115,
       13.18296659, 13.28087802], shape=(3316216,))

### Rough: make MLFlow create docker file & use that 
Create docker file from MLFlow: 
`mlflow models build-docker --model-uri "runs:/b2d02ff1bd444133a970e1ba4b7b340b/pipeline" --name "mlflow-model"`