In [1]:
import os
import mlflow
import pickle
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import make_pipeline

In [2]:
# Load model
# Point the Google Cloud client libraries at the service-account key *before*
# touching the gs:// artifact location. Previously this was set after the load,
# so on a fresh kernel the model download ran without these credentials
# (unless they happened to be exported in the environment already).
service_account_key_path = os.path.expanduser('~') + "/service_account_key.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = service_account_key_path

EXPERIMENT_ID = 1
# RUN_ID may be overridden through the environment; otherwise a fixed run is used.
RUN_ID = os.getenv("RUN_ID", "553def03f5224f649fe56bc1567daccc")
logged_model = f"gs://pytholic-mlops-zoomcamp-artifacts/{EXPERIMENT_ID}/{RUN_ID}/artifacts/model"
model = mlflow.pyfunc.load_model(logged_model)

 - psutil (current: 5.9.0, required: psutil==5.9.7)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [3]:
def read_dataframe(filename: str) -> pd.DataFrame:
    """Read a trip parquet file and keep trips lasting between 1 and 60 minutes.

    A ``duration`` column (float, minutes) is added, computed as
    ``lpep_dropoff_datetime - lpep_pickup_datetime``.

    Args:
        filename: Location of the parquet file, passed to ``pd.read_parquet``.

    Returns:
        An independent DataFrame holding only the rows whose duration lies in
        the inclusive range [1, 60] minutes. Row labels of the surviving rows
        are kept as they were in the file.
    """
    df = pd.read_parquet(filename)

    # Use bracket assignment for columns; attribute-style access is only
    # reliable for reading.
    elapsed = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
    df['duration'] = elapsed.dt.total_seconds() / 60

    in_range = (df['duration'] >= 1) & (df['duration'] <= 60)
    # Return an explicit copy: callers add and overwrite columns on the result,
    # which must not be treated as a write to a slice of the unfiltered frame.
    return df[in_range].copy()


def prepare_dictionaries(df: pd.DataFrame):
    """Build one feature dict per row with keys ``PU_DO`` and ``trip_distance``.

    Note that ``df`` is modified in place: ``PULocationID`` and
    ``DOLocationID`` are converted to strings, and a ``PU_DO`` column holding
    ``"<PULocationID>_<DOLocationID>"`` is added.

    Args:
        df: Frame with ``PULocationID``, ``DOLocationID`` and ``trip_distance``.

    Returns:
        A list of dicts, one per row, in row order.
    """
    location_cols = ['PULocationID', 'DOLocationID']
    df[location_cols] = df[location_cols].astype(str)

    # Combine pickup and drop-off IDs into a single feature.
    pickup, dropoff = (df[col] for col in location_cols)
    df['PU_DO'] = pickup + '_' + dropoff

    feature_cols = ['PU_DO', 'trip_distance']
    return df[feature_cols].to_dict(orient='records')

In [4]:
# Read the trip file, turn its rows into feature dicts, and score them with
# the loaded model.
INPUT_FILE = '../../data/green_tripdata_2021-01.parquet'

df = read_dataframe(INPUT_FILE)
dicts = prepare_dictionaries(df)
y_pred = model.predict(dicts)

In [5]:
# Show the model output (the repr abbreviates long arrays).
y_pred

array([ 6.86271117, 13.36872083,  6.3608707 , ..., 14.43650924,
       37.09262214, 11.10083955])

In [6]:
import uuid

In [7]:
# One random (version-4) UUID string per row of df.
n = len(df)
ride_ids = [str(uuid.uuid4()) for _ in range(n)]

In [8]:
# Attach the generated IDs as a new column; this modifies df in place.
df["ride_id"] = ride_ids
# Preview the first rows, which now include ride_id.
df.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,duration,PU_DO,ride_id
0,2,2021-01-01 00:15:56,2021-01-01 00:19:52,N,1.0,43,151,1.0,1.01,5.5,...,0.0,,0.3,6.8,2.0,1.0,0.0,3.933333,43_151,1e7361ca-4ad3-4073-86b7-acf0d637df22
1,2,2021-01-01 00:25:59,2021-01-01 00:34:44,N,1.0,166,239,1.0,2.53,10.0,...,0.0,,0.3,16.86,1.0,1.0,2.75,8.75,166_239,1255ace2-8353-4ceb-93f5-0f2d16dd874a
2,2,2021-01-01 00:45:57,2021-01-01 00:51:55,N,1.0,41,42,1.0,1.12,6.0,...,0.0,,0.3,8.3,1.0,1.0,0.0,5.966667,41_42,d766694e-0f84-4a3f-9a0c-55075c8a9192
3,2,2020-12-31 23:57:51,2021-01-01 00:04:56,N,1.0,168,75,1.0,1.99,8.0,...,0.0,,0.3,9.3,2.0,1.0,0.0,7.083333,168_75,298e7339-dcee-478e-a667-1ce612c4c1a4
7,2,2021-01-01 00:26:31,2021-01-01 00:28:50,N,1.0,75,75,6.0,0.45,3.5,...,0.0,,0.3,5.76,1.0,1.0,0.0,2.316667,75_75,0546a037-a094-4808-8ff5-ad0f3f399b35


In [9]:
# Start from an empty frame; result columns are added to it afterwards.
df_result = pd.DataFrame()

In [11]:
# Carry the ride identifiers from df over into the result frame.
df_result["ride_id"] = df["ride_id"]