In [3]:
!pip install --user mlflow

Collecting mlflow
  Using cached mlflow-2.3.2-py3-none-any.whl (17.7 MB)
Collecting gitpython<4,>=2.1.0
  Using cached GitPython-3.1.31-py3-none-any.whl (184 kB)
Collecting docker<7,>=4.0.0
  Using cached docker-6.1.2-py3-none-any.whl (148 kB)
Collecting databricks-cli<1,>=0.8.7
  Using cached databricks_cli-0.17.7-py3-none-any.whl
Collecting pyarrow<12,>=4.0.0
  Using cached pyarrow-11.0.0-cp39-cp39-win_amd64.whl (20.6 MB)
Collecting alembic!=1.10.0,<2
  Using cached alembic-1.11.1-py3-none-any.whl (224 kB)
Collecting waitress<3
  Using cached waitress-2.1.2-py3-none-any.whl (57 kB)
Collecting Jinja2<4,>=3.0
  Using cached Jinja2-3.1.2-py3-none-any.whl (133 kB)
Collecting sqlparse<1,>=0.4.0
  Using cached sqlparse-0.4.4-py3-none-any.whl (41 kB)
Collecting querystring-parser<2
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl (7.9 kB)
Collecting protobuf<5,>=3.12.0
  Using cached protobuf-4.23.2-cp39-cp39-win_amd64.whl (422 kB)
Collecting Mako
  Using cached Mako-1.2.4-py3-n

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
anaconda-project 0.11.1 requires ruamel-yaml, which is not installed.


In [1]:
!python -V

Python 3.9.13


In [4]:
import pickle
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


ModuleNotFoundError: No module named 'mlflow'

In [None]:
# Tracking backend and experiment under which the runs below are recorded.
TRACKING_URI = "sqlite:///mlflow.db"
EXPERIMENT_NAME = "nyc-taxi-experiment"

mlflow.set_tracking_uri(TRACKING_URI)
mlflow.set_experiment(EXPERIMENT_NAME)

In [None]:
def read_dataframe(filename):
    """Load a trip-record parquet file and prepare it for modelling.

    Steps:
      1. Read the parquet file with ``pd.read_parquet``.
      2. Parse the pickup and drop-off timestamp columns.
      3. Add a ``duration`` column holding the trip length in minutes.
      4. Keep only trips lasting between 1 and 60 minutes (inclusive).
      5. Cast ``PULocationID`` and ``DOLocationID`` to strings.

    Parameters
    ----------
    filename : str or path-like
        Anything accepted by ``pd.read_parquet`` (local path or URL).

    Returns
    -------
    pandas.DataFrame
        An independent (copied) frame containing the filtered trips.
    """
    df = pd.read_parquet(filename)

    df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
    df["tpep_dropoff_datetime"] = pd.to_datetime(df["tpep_dropoff_datetime"])

    # Vectorised timedelta -> minutes conversion; avoids a Python-level
    # call per row, which is slow on multi-million-row files.
    trip_time = df["tpep_dropoff_datetime"] - df["tpep_pickup_datetime"]
    df["duration"] = trip_time.dt.total_seconds() / 60

    # .copy() detaches the filtered rows from the original frame so the
    # column assignment below does not raise SettingWithCopyWarning.
    in_range = (df["duration"] >= 1) & (df["duration"] <= 60)
    df = df[in_range].copy()

    categorical = ["PULocationID", "DOLocationID"]
    df[categorical] = df[categorical].astype(str)

    return df

In [None]:
# Both months come from the same bucket; only the month suffix differs.
TRIP_DATA_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_{month}.parquet"

df_train = read_dataframe(TRIP_DATA_URL.format(month="2022-01"))
df_valid = read_dataframe(TRIP_DATA_URL.format(month="2022-02"))


In [None]:
# Number of trips kept in the training and validation frames.
df_train.shape[0], df_valid.shape[0]

In [None]:
categorical = ["PU_DO"]
numerical = ["trip_distance"]


def build_feature_dicts(df):
    """Return one feature dict per trip with the model's input columns.

    ``read_dataframe`` does not create a ``PU_DO`` column, so the
    pickup/drop-off pair feature is derived here from ``PULocationID`` and
    ``DOLocationID``. The work is done on a small copy so the caller's frame
    is left unmodified and the cell stays idempotent.
    """
    features = df[numerical].copy()
    features["PU_DO"] = (
        df["PULocationID"].astype(str) + "_" + df["DOLocationID"].astype(str)
    )
    return features[categorical + numerical].to_dict(orient="records")


dv = DictVectorizer()

train_dicts = build_feature_dicts(df_train)
X_train = dv.fit_transform(train_dicts)

# Only transform the validation set: refitting the vectorizer here would
# rebuild the vocabulary from validation data, so X_valid would no longer
# share X_train's column layout and the model could not score it correctly.
valid_dicts = build_feature_dicts(df_valid)
X_valid = dv.transform(valid_dicts)

In [None]:
# Regression target: trip duration in minutes, as computed by read_dataframe.
target = "duration"

y_train = df_train[target].values  # was `df_tarin`, which raised NameError
y_valid = df_valid[target].values

In [None]:
# Baseline: ordinary least squares on the vectorized features.
lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_valid)

# RMSE on the validation set. Taking the square root of the MSE avoids the
# `squared=False` argument, which newer scikit-learn releases deprecate and
# then remove, so this works on any version.
mean_squared_error(y_valid, y_pred) ** 0.5

In [None]:
# Persist the fitted vectorizer together with the model so that inference can
# reproduce exactly the same feature encoding.
lin_reg_path = Path("models/lin_reg.v2.0.bin")

# Create the output directory on first run; open() would otherwise fail with
# FileNotFoundError on a fresh checkout.
lin_reg_path.parent.mkdir(parents=True, exist_ok=True)

with lin_reg_path.open("wb") as f_out:
    pickle.dump((dv, lr), f_out)

In [None]:
# L1-regularised linear model with a small penalty.
ls = Lasso(alpha=0.01)
ls.fit(X_train, y_train)

# The validation matrix is named X_valid; the original referenced an
# undefined `X_val`.
y_pred = ls.predict(X_valid)

# RMSE on the validation set, computed as sqrt(MSE) so it does not depend on
# the `squared=False` argument that newer scikit-learn releases remove.
mean_squared_error(y_valid, y_pred) ** 0.5

In [None]:
# Persist the fitted vectorizer together with the Lasso model so that
# inference can reproduce exactly the same feature encoding.
lasso_path = Path("models/lasso.v2.0.bin")

# Create the output directory on first run; open() would otherwise fail with
# FileNotFoundError on a fresh checkout.
lasso_path.parent.mkdir(parents=True, exist_ok=True)

with lasso_path.open("wb") as f_out:
    pickle.dump((dv, ls), f_out)

In [None]:
# Train a Lasso model inside an MLflow run so that its parameters and
# validation RMSE are recorded in the tracking store.
with mlflow.start_run():

    # Tag key changed from "develop" to "developer" (assumed typo, since the
    # value is a person's name); param keys fixed from "tarin-data-path" and
    # "valid_data-path" to one consistent spelling.
    mlflow.set_tag("developer", "Sina")
    mlflow.log_param("train-data-path", "trip-data/yellow_tripdata_2022-01.parquet")
    mlflow.log_param("valid-data-path", "trip-data/yellow_tripdata_2022-02.parquet")

    alpha = 0.01
    mlflow.log_param("alpha", alpha)

    las = Lasso(alpha=alpha)
    las.fit(X_train, y_train)

    # Score the model that was just fitted in this run (`las`) on X_valid.
    # The original predicted with the earlier `ls` model and an undefined
    # `X_val`, so the logged metric would not have described this run.
    y_pred = las.predict(X_valid)

    # sqrt(MSE) instead of `squared=False`, which newer scikit-learn removes;
    # cast to a plain float for logging.
    rmse = float(mean_squared_error(y_valid, y_pred) ** 0.5)
    mlflow.log_metric("rmse", rmse)