# Explore Data
We use NYC green taxi trip data for this scenario.

In [None]:
# Install tqdm into the kernel's environment; it draws the download progress bar in the next cell.
%pip install tqdm

In [None]:
from tqdm import tqdm
import os

import requests
import datetime
import pandas as pd

# Monthly green-taxi trip files (Parquet) to download.
files = ['green_tripdata_2022-02.parquet', 'green_tripdata_2022-01.parquet']

path = './data'
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
os.makedirs(path, exist_ok=True)

print("Download files:")
for file in files:
    url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{file}"
    save_path = f"{path}/{file}"
    # Stream the body so the whole file is never held in memory; the context
    # manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=60) as resp:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .parquet file.
        resp.raise_for_status()
        # Content-Length may be absent; tqdm then shows a running count without a total.
        total_bytes = int(resp.headers.get("Content-Length", 0)) or None
        with open(save_path, "wb") as handle, tqdm(
            desc=f"{file}",
            postfix=f"save to {save_path}",
            total=total_bytes,
            unit="B",
            unit_scale=True,
        ) as progress:
            # iter_content() defaults to 1-byte chunks (one Python iteration per byte);
            # read in 1 MiB chunks and advance the bar by the bytes actually written.
            for data in resp.iter_content(chunk_size=1024 * 1024):
                handle.write(data)
                progress.update(len(data))

In [None]:
# Load the January file written by the download cell and show summary statistics.
jan_parquet = 'data/green_tripdata_2022-01.parquet'
jan_data = pd.read_parquet(jan_parquet)
jan_data.describe()

In [None]:
# create target: trip duration in minutes (drop-off time minus pick-up time)
trip_duration = jan_data.lpep_dropoff_datetime - jan_data.lpep_pickup_datetime
# The vectorised .dt accessor replaces the per-row Python lambda and yields the same float minutes.
jan_data["duration_min"] = trip_duration.dt.total_seconds() / 60

In [None]:
# filter out outliers: keep trips lasting 0-60 minutes that carried 1-8 passengers
duration_ok = (jan_data.duration_min >= 0) & (jan_data.duration_min <= 60)
passengers_ok = (jan_data.passenger_count > 0) & (jan_data.passenger_count <= 8)
jan_data = jan_data[duration_ok & passengers_ok]

In [None]:
# Distribution of the target after filtering.
jan_data["duration_min"].hist()

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

# data labeling
target = "duration_min"
num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
cat_features = ["PULocationID", "DOLocationID"]
# Single feature list reused for fitting and both predictions.
# NOTE: the "categorical" location IDs are fed to the model as plain numbers (no encoding step here).
features = num_features + cat_features

# Positional split: the first 30000 rows train the model, the remainder validates it.
# .copy() detaches both frames from jan_data, so adding the 'prediction' column below is a
# plain assignment on an owned frame rather than a chained assignment on a slice
# (which raises SettingWithCopyWarning).
train_data = jan_data.iloc[:30000].copy()
val_data = jan_data.iloc[30000:].copy()

model = LinearRegression()
model.fit(train_data[features], train_data[target])

train_preds = model.predict(train_data[features])
train_data['prediction'] = train_preds

val_preds = model.predict(val_data[features])
val_data['prediction'] = val_preds

# Mean absolute error on the training split, then on the validation split.
print(mean_absolute_error(train_data.duration_min, train_data.prediction))
print(mean_absolute_error(val_data.duration_min, val_data.prediction))

# Set up the project

In [None]:
import digitalhub as dh
import os

# The project name is built from the USER environment variable plus a fixed suffix.
project_name = f"{os.environ['USER']}-ml-service"
project = dh.get_or_create_project(project_name)

In [None]:
# Store the prepared January frame in the project as a table data item named "train-data".
di = project.log_dataitem(
    "train-data",
    kind="table",
    data=jan_data,
)

In [None]:
# Create the folder that will hold the training script; exist_ok makes this a no-op when the
# directory already exists and avoids the race between checking and creating.
os.makedirs('src', exist_ok=True)

In [None]:
%%writefile "src/train.py"

from tqdm import tqdm
import os
import mlflow
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from digitalhub import from_mlflow_run, get_mlflow_model_metrics

import requests
import datetime
import pandas as pd

def train(project, train_data):
    """Fit a linear trip-duration model and register it in the project.

    Args:
        project: Project handle; only ``log_model`` is called on it.
        train_data: Data item exposing ``as_df()``. The resulting frame must contain
            the feature columns listed below and the ``duration_min`` target.

    Returns:
        None. The model is registered as ``taxi-predictor`` (kind ``mlflow``) with the
        metrics read back from the MLflow run attached to it.
    """
    # Enable MLflow autologging for sklearn
    mlflow.sklearn.autolog(log_datasets=True)

    # data labeling
    target = "duration_min"
    num_features = ["passenger_count", "trip_distance", "fare_amount", "total_amount"]
    cat_features = ["PULocationID", "DOLocationID"]
    features = num_features + cat_features

    df = train_data.as_df()
    # Positional split: first 30000 rows for fitting, the remainder for validation.
    train_df = df.iloc[:30000]
    val_df = df.iloc[30000:]

    regressor = LinearRegression()
    regressor.fit(train_df[features], train_df[target])

    # Score the hold-out rows. The array returned by predict() is passed to the metric
    # unchanged: MLflow's autologging documentation states that post-training metrics are
    # only attributed to the run when the metric receives the original prediction object,
    # not something derived from it (such as a DataFrame column holding a copy). This also
    # avoids assigning a new column onto a slice of `df`.
    val_features = val_df[features]
    val_preds = regressor.predict(val_features)
    mae = mean_absolute_error(val_df[target], val_preds)
    print(f"validation MAE: {mae}")

    # Get MLflow run information
    run_id = mlflow.last_active_run().info.run_id

    # Extract MLflow run artifacts and metadata for DigitalHub integration
    model_params = from_mlflow_run(run_id)
    metrics = get_mlflow_model_metrics(run_id)

    # Register model in DigitalHub with MLflow metadata. A separate name keeps the fitted
    # estimator and the registered model entity from sharing one variable.
    logged_model = project.log_model(name="taxi-predictor", kind="mlflow", **model_params)
    logged_model.log_metrics(metrics)
    

In [None]:
# Requirement specifiers passed along with the function definition.
train_requirements = ["numpy<2", "mlflow<3", "scikit-learn <= 1.6.1", "tqdm"]

# Register src/train.py as a Python function whose handler is `train`.
func = project.new_function(
    name="train",
    kind="python",
    python_version="PYTHON3_10",
    code_src="src/train.py",
    handler="train",
    requirements=train_requirements,
)

In [None]:
# Run the training function as a job (wait=True, non-local execution), feeding it the
# logged data item by key.
run = func.run(
    action="job",
    inputs={"train_data": di.key},
    wait=True,
    local_execution=False,
)

Note the `path` property of the `mlflowserve` function: it must point to the folder that contains the model artifact, i.e. the `model/` subfolder of the logged model's path.

In [None]:
# Resolve the artifact location from the model registered by the training job instead of
# hard-coding one user's project name and one specific model id, which breaks for every
# other user and after every re-training.
taxi_model = project.get_model("taxi-predictor")
# mlflowserve needs the folder holding the MLflow model files: the 'model/' subfolder
# of the logged model's path.
model_dir = taxi_model.spec.path.rstrip("/") + "/model/"
service = project.new_function("service", kind="mlflowserve", model_name="taxi-predictor", path=model_dir)

In [None]:
# Start the serving function; the returned run object is used further down to look up
# the service URL.
service_run = service.run(
    action="serve",
)

In [None]:
# Read the stored data item back and keep only the model's input columns.
df = di.as_df()[num_features + cat_features]
df

In [None]:
# Datatype tag sent for each input tensor, in the order the tensors appear in a request.
tensor_datatypes = {
    'passenger_count': 'FP32',
    'trip_distance': 'FP32',
    'fare_amount': 'FP32',
    'total_amount': 'FP32',
    'PULocationID': 'UINT32',
    'DOLocationID': 'UINT32',
}

# Take the first three rows of the stored data, restricted to the feature columns.
df2 = di.as_df()[0:3][num_features + cat_features]
recs = df2.to_dict(orient='records')

# One request payload per row: a list with one single-value tensor per feature.
inputs = [
    [
        {'name': column, 'shape': [1], 'datatype': datatype, 'data': [rec[column]]}
        for column, datatype in tensor_datatypes.items()
    ]
    for rec in recs
]

In [None]:
# Display the request payloads built in the previous cell.
inputs

In [None]:
import requests

# The endpoint is the same for every payload: resolve it once instead of refreshing the
# run (a remote call) on each loop iteration.
service_url = service_run.refresh().status.service['url']
endpoint = f"http://{service_url}/v2/models/taxi-predictor/infer"

for payload in inputs:
    inference_request = {
        "inputs": payload
    }

    # The timeout keeps the cell from hanging forever if the service is not reachable.
    response = requests.post(endpoint, json=inference_request, timeout=30)
    # Surface HTTP errors directly instead of failing later while decoding an error body.
    response.raise_for_status()
    print(response.json())
