In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [2]:
from sklearn.pipeline import make_pipeline

In [None]:
import mlflow
# Configure MLflow before any run is started in the cells below.
# NOTE(review): both arguments are empty strings here — presumably the
# tracking URI and experiment name were blanked out; confirm the real values.
mlflow.set_tracking_uri("")
mlflow.set_experiment("")

In [4]:
def read_dataframe(filename: str):
    """Load a trip parquet file and prepare it for modelling.

    Adds a ``duration`` column (drop-off minus pick-up, in minutes), keeps
    only trips lasting between 1 and 60 minutes inclusive, and casts the
    pickup/drop-off location IDs to strings.

    Parameters
    ----------
    filename : str
        Path or URL handed to ``pd.read_parquet``. The file must contain
        ``lpep_pickup_datetime``, ``lpep_dropoff_datetime``,
        ``PULocationID`` and ``DOLocationID`` columns.

    Returns
    -------
    pd.DataFrame
        The filtered frame; it is an independent copy, so callers may add
        columns to it freely.
    """
    df=pd.read_parquet(filename)

    # Vectorised conversion of the timedelta column to minutes.
    elapsed=df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration']=elapsed.dt.total_seconds() / 60

    # Take an explicit copy of the filtered rows: assigning columns on a
    # boolean-mask slice is chained assignment and raises pandas'
    # SettingWithCopyWarning.
    in_range=(df.duration >= 1) & (df.duration <= 60)
    df=df[in_range].copy()

    categorical=['PULocationID','DOLocationID']
    df[categorical]=df[categorical].astype(str)
    return df
    

In [5]:
def prepare_dictionaries(df:pd.DataFrame):
    """Build the per-trip feature records fed to ``DictVectorizer``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with string ``PULocationID`` / ``DOLocationID`` columns and a
        ``trip_distance`` column.

    Returns
    -------
    list of dict
        One ``{'PU_DO': ..., 'trip_distance': ...}`` record per row.

    Notes
    -----
    A combined ``PU_DO`` column (``"<pickup>_<dropoff>"``) is added to ``df``
    in place.
    """
    df['PU_DO']=df['PULocationID']+'_'+df['DOLocationID']
    categorical=['PU_DO']
    numerical=['trip_distance']
    # Named `records` rather than `dict` so the builtin is not shadowed, and
    # returned so callers actually receive the features.
    records=df[categorical + numerical].to_dict(orient='records')
    return records

In [None]:
# Training set: the green_tripdata 2021-01 file; validation set: the 2021-02
# file. Both are read directly from their URLs through read_dataframe.
df_train=read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val=read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')
# Regression target is the `duration` column that read_dataframe adds.
target='duration'
y_train=df_train[target].values
y_val=df_val[target].values

# Feature inputs handed to the pipeline's fit/predict calls in the training cell.
dict_train=prepare_dictionaries(df_train)
dict_val=prepare_dictionaries(df_val)

In [None]:
# Train a DictVectorizer + random-forest pipeline inside one MLflow run and
# log its parameters, validation RMSE and the fitted pipeline.
with mlflow.start_run():
    # Keep the hyper-parameters in one dict so the values logged to MLflow
    # are exactly the ones passed to the estimator.
    params=dict(max_depth=20,n_estimators=100,min_samples_leaf=10,random_state=0)
    mlflow.log_params(params)

    pipeline=make_pipeline(
        DictVectorizer(),
        RandomForestRegressor(**params,n_jobs=-1)
    )
    pipeline.fit(dict_train,y_train)
    y_pred=pipeline.predict(dict_val)

    # RMSE as sqrt(MSE). The `squared=False` keyword was deprecated in
    # scikit-learn 1.4 and later removed, so it raises TypeError on current
    # releases; taking the root manually works on every version. Arguments
    # follow the documented (y_true, y_pred) order.
    rmse=mean_squared_error(y_val,y_pred) ** 0.5
    print(params,rmse)
    mlflow.log_metric('rmse',rmse)

    # Log the whole fitted pipeline (vectorizer + model) under "model".
    mlflow.sklearn.log_model(pipeline,artifact_path="model")
    
    

In [6]:
from mlflow.tracking import MlflowClient

ModuleNotFoundError: No module named 'mlflow'

In [None]:
# Settings for the MlflowClient used by the artifact-download cell below.
# NOTE(review): both values are empty strings — presumably placeholders for
# the tracking URI and the ID of the run to read from; confirm before running.
MLFLOW_TRACKING_URI = ''
RUN_ID = ''
client=MlflowClient(tracking_uri= MLFLOW_TRACKING_URI)

In [None]:
# Download the `dict_vectorizer.bin` artifact of run RUN_ID; the result is
# kept in `path`, which the next cell opens.
# NOTE(review): the training cell in this notebook only logs a pipeline under
# "model"; no visible cell logs `dict_vectorizer.bin` — verify that RUN_ID
# refers to a run that actually contains this artifact.
path=client.download_artifacts(run_id=RUN_ID,path='dict_vectorizer.bin')

In [None]:
# Deserialize the downloaded artifact into `dv`.
# Caution: unpickling executes arbitrary code from the file, so only load
# artifacts that come from a trusted tracking server.
with open(path, mode='rb') as f_in:
    dv = pickle.load(f_in)
          