In [None]:
import os
import urllib.request

from dotenv import load_dotenv

import pandas as pd

import mlflow

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline,make_pipeline

In [None]:
# Dataset selection: taxi colour plus the year and month of trip records to use.
color = "yellow"
year = 2021
month = 1

In [None]:
# Download the monthly trip file once; later runs reuse the local copy.
data_file = f"./data/{color}_tripdata_{year}-{month:02d}.parquet"
if not os.path.exists(data_file):
    os.makedirs("./data", exist_ok=True)
    data_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{color}_tripdata_{year}-{month:02d}.parquet"
    # Fetch under a temporary name and rename afterwards, so an interrupted
    # download is never mistaken for a complete file by the existence check.
    partial_file = data_file + ".part"
    # urlretrieve raises on HTTP errors and truncated transfers instead of
    # failing silently, and needs no external `wget` binary or shell.
    urllib.request.urlretrieve(data_url, partial_file)
    os.replace(partial_file, data_file)

In [None]:
# Read the downloaded month of trip records into a DataFrame.
parquet_path = f"./data/{color}_tripdata_{year}-{month:02d}.parquet"
df = pd.read_parquet(parquet_path)

In [None]:
# Quick size check: (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Load variables from the .env file into the process environment.
load_dotenv()

# Address of the MLflow tracking server; this is None when the variable is unset.
MLFLOW_TRACKING_URI = os.environ.get("MLFLOW_TRACKING_URI")
print(MLFLOW_TRACKING_URI)


In [None]:
# Point the MLflow client at the tracking server read from the environment above.
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

# Select the experiment that all runs started in this notebook are recorded under.
mlflow.set_experiment("yellow-taxi-trip-duration")

If everything went well, the experiment is now visible in the MLflow UI at `http://<external-ip>:5000`.

Now let's take a first look at the data:

In [None]:
# Preview the first rows of the raw trip records.
df.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

In [None]:
# Count the missing values in each column of the raw frame.
df.isnull().sum()

In [None]:
# Names of the model inputs and of the regression target. Both are logged as
# MLflow run tags, so they must match the columns the model is trained on:
# `trip_route` is the pickup/drop-off location pair built during preprocessing,
# and the target is the trip duration in minutes.
features = ["trip_route", "trip_distance"]
target = "trip_duration_minutes"

In [None]:
# Calculate the trip duration in minutes and keep only trips lasting between
# 1 minute and 60 minutes (both bounds inclusive).
def calculate_trip_duration_in_minutes(df, min_minutes=1, max_minutes=60):
    """Add a ``trip_duration_minutes`` column and drop implausible trips.

    The duration is the difference between ``tpep_dropoff_datetime`` and
    ``tpep_pickup_datetime``, expressed in minutes.

    Args:
        df: DataFrame with ``tpep_pickup_datetime`` and
            ``tpep_dropoff_datetime`` datetime columns.
        min_minutes: Shortest duration to keep (inclusive).
        max_minutes: Longest duration to keep (inclusive).

    Returns:
        A new DataFrame holding only the rows whose duration lies within
        ``[min_minutes, max_minutes]``, with ``trip_duration_minutes`` added.
        The input frame is not modified.
    """
    duration = (df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]).dt.total_seconds() / 60
    # Rows with a missing timestamp produce NaN, which `between` treats as out of range.
    in_range = duration.between(min_minutes, max_minutes)
    # Filter first, then `assign`: the result is a fresh frame, so the caller's
    # data stays untouched and later column assignments on the result do not
    # raise SettingWithCopyWarning.
    return df[in_range].assign(trip_duration_minutes=duration[in_range])
    

In [None]:
def preprocess(df):
    """Turn raw trip records into the modelling table.

    Steps: compute and filter the trip duration, then combine the pickup and
    drop-off location IDs into a single string-valued ``trip_route`` feature.

    Args:
        df: Raw trip DataFrame with ``PULocationID``, ``DOLocationID``,
            ``trip_distance`` and the pickup/drop-off timestamp columns.

    Returns:
        DataFrame with exactly the columns ``trip_route``, ``trip_distance``
        and ``trip_duration_minutes``.
    """
    # Work on a copy so the caller's raw frame is never modified.
    trips = calculate_trip_duration_in_minutes(df.copy())
    # Location IDs are categories, not quantities: cast them to strings so the
    # combined route is treated as a categorical value downstream.
    pickup = trips["PULocationID"].astype(str)
    dropoff = trips["DOLocationID"].astype(str)
    # `assign` builds a new frame rather than writing into the filtered slice,
    # which avoids pandas' chained-assignment (SettingWithCopy) warning.
    trips = trips.assign(trip_route=pickup + "_" + dropoff)
    return trips[["trip_route", "trip_distance", "trip_duration_minutes"]]

In [None]:
# Build the modelling table from the raw trips; `df` itself is left unchanged
# because `preprocess` works on a copy.
df_processed = preprocess(df)

In [None]:
# Feature frame: every column except the target. Target: trip duration in minutes.
X = df_processed.drop(columns=["trip_duration_minutes"])
y = df_processed["trip_duration_minutes"]

# Hold out 20% of the trips for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Stand-alone vectorization of the feature records (outside the pipeline).
# The results get their own names instead of overwriting X_train / X_test,
# so the cell can be re-run safely and the split frames stay DataFrames.
dv = DictVectorizer()

# fit_transform learns the vocabulary and encodes the training records in one
# pass, so the training frame is converted to dicts only once.
X_train_vectorized = dv.fit_transform(X_train.to_dict(orient="records"))
X_test_vectorized = dv.transform(X_test.to_dict(orient="records"))

In [None]:
# Expose the Google service-account key to the Google Cloud client libraries.
# NOTE(review): presumably needed so MLflow can write artifacts to a GCS
# bucket; confirm against the tracking-server setup.
load_dotenv()

# GOOGLE_SA_KEY is forwarded unchanged as GOOGLE_APPLICATION_CREDENTIALS.
SA_KEY = os.getenv("GOOGLE_SA_KEY")
if SA_KEY:
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = SA_KEY
elif not os.getenv("GOOGLE_APPLICATION_CREDENTIALS"):
    # Assigning None to os.environ raises an opaque TypeError, so fail with
    # an actionable message instead.
    raise RuntimeError(
        "GOOGLE_SA_KEY is not set: add it to your .env file "
        "or export GOOGLE_APPLICATION_CREDENTIALS before running this cell."
    )


In [None]:
# Re-create the split from the feature frame X so that the pipeline cells below
# start from plain DataFrames; the same seed and test size give the same
# partition as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [None]:
# The pipeline's DictVectorizer expects one dict per row, so convert both
# feature frames into lists of records.
X_train, X_test = (
    frame.to_dict(orient="records") for frame in (X_train, X_test)
)

In [None]:

# Train the vectorizer + random-forest pipeline inside a single MLflow run, so
# the tags, the metric and the model artifact are all recorded together.
with mlflow.start_run():

    # Descriptive metadata attached to the run.
    tags = {
        "model": "Random Forest pipeline",
        "developer": "user1",
        "dataset": f"{color}-taxi",
        "year": year,
        "month": month,
        "features": features,
        "target": target
    }
    mlflow.set_tags(tags)

    # The vectorizer is part of the pipeline, so the logged model accepts the
    # same list-of-dicts input at prediction time.
    pipeline = make_pipeline(
        DictVectorizer(),
        RandomForestRegressor(random_state=42, criterion='squared_error', max_depth=16)
    )
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    # RMSE as the square root of the MSE. The `squared=False` shortcut was
    # deprecated in scikit-learn 1.4 and removed in 1.6; this form works on
    # every version.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    mlflow.log_metric("rmse", rmse)

    # Store the fitted pipeline as a run artifact named "model".
    mlflow.sklearn.log_model(pipeline, "model")

You should now see a new run, with its own run ID, under the experiment in the MLflow UI. The logged pipeline, including the model, is listed under `Artifacts`.

In [None]:
# Root mean squared error on the held-out test set, in minutes of trip duration.
print(rmse)