In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [2]:
from sklearn.pipeline import make_pipeline

In [8]:
pip install mlflow

Collecting mlflow
  Using cached mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting importlib_metadata!=4.7.0,<9,>=3.7.0 (from mlflow-skinny==2.21.3->mlflow)
  Using cached importlib_metadata-8.6.1-py3-none-any.whl.metadata (4.7 kB)


INFO: pip is looking at multiple versions of google-auth to determine which version is compatible with other requirements. This could take a while.
Collecting google-auth~=2.0 (from databricks-sdk<1,>=0.20.0->mlflow-skinny==2.21.3->mlflow)
  Using cached google_auth-2.38.0-py2.py3-none-any.whl.metadata (4.8 kB)
Using cached mlflow-2.21.3-py3-none-any.whl (28.2 MB)
Using cached importlib_metadata-8.6.1-py3-none-any.whl (26 kB)
Using cached google_auth-2.38.0-py2.py3-none-any.whl (210 kB)
Installing collected packages: importlib_metadata, google-auth, mlflow
  Attempting uninstall: importlib_metadata
    Found existing installation: importlib-metadata 5.2.0
    Uninstalling importlib-metadata-5.2.0:
      Successfully uninstalled importlib-metadata-5.2.0
  Attempting uninstall: google-auth
    Found existing installation: google-auth 2.0.0
    Uninstalling google-auth-2.0.0:
      Successfully uninstalled google-auth-2.0.0
[31mERROR: pip's dependency resolver does not currently take int

In [3]:
pip install boto3

Note: you may need to restart the kernel to use updated packages.


In [4]:
import boto3
print(boto3.__version__)

1.21.32


In [5]:
import os

# Set the AWS profile to use
os.environ['AWS_PROFILE'] = 'ml_user'

In [6]:
import boto3

# Create a session and specify the region if necessary
session = boto3.Session(profile_name='ml_user', region_name='us-east-1')

# Create the S3 client and list buckets
s3_client = session.client('s3')
response = s3_client.list_buckets()
print(response)


{'ResponseMetadata': {'RequestId': 'J5Y1FVXMW1VFJHQM', 'HostId': 'gZyZ/JNazenMF2W2C34Q3WNMKkqvP8IaWyh64KsxQq8Lb9IwDZUndVQuQ2z6uF84vmZS1J8Fjbg=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'gZyZ/JNazenMF2W2C34Q3WNMKkqvP8IaWyh64KsxQq8Lb9IwDZUndVQuQ2z6uF84vmZS1J8Fjbg=', 'x-amz-request-id': 'J5Y1FVXMW1VFJHQM', 'date': 'Wed, 09 Apr 2025 00:35:37 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'Buckets': [{'Name': 'ml-sagemaker-practise', 'CreationDate': datetime.datetime(2025, 3, 5, 16, 9, 31, tzinfo=tzlocal())}, {'Name': 'mlflow-models-slv', 'CreationDate': datetime.datetime(2025, 3, 21, 15, 25, 50, tzinfo=tzlocal())}, {'Name': 'sagemaker-studio-585768144809-iwglcnciicp', 'CreationDate': datetime.datetime(2025, 1, 21, 20, 41, 34, tzinfo=tzlocal())}, {'Name': 'sagemaker-studio-585768144809-q9v9klv6dxe', 'CreationDate': datetime.datetime(2025, 1, 21, 20, 37, 14, tzinfo=tzlocal())}, {'Name': 'sagemaker-studio-58576

In [9]:
import mlflow
print(mlflow.__version__)


2.21.3


In [11]:
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("green taxi duration")

2025/04/09 00:39:12 INFO mlflow.tracking.fluent: Experiment with name 'green taxi duration' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://mlflow-models-slv/1', creation_time=1744159152449, experiment_id='1', last_update_time=1744159152449, lifecycle_stage='active', name='green taxi duration', tags={}>

In [12]:
def read_dataframe(filename: str):
    df=pd.read_parquet(filename)
    df['duration']=df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration=df.duration.apply(lambda m : m.total_seconds() / 60)
    
    df=df[(df.duration >= 1) & (df.duration <= 60)]
    categorical=['PULocationID','DOLocationID']
    df[categorical]=df[categorical].astype(str)
    return df    
    

In [13]:
def prepare_dictionaries(df:pd.DataFrame):
    df['PU_DO']=df['PULocationID'] + '_' + df['DOLocationID']
    categorical=['PU_DO']
    numerical=['trip_distance']
    dicts=df[categorical + numerical].to_dict(orient='records')
    return dicts

In [14]:
df_train=read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val=read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')
target='duration'
y_train=df_train[target].values
y_val=df_val[target].values

dict_train=prepare_dictionaries(df_train)
dict_val=prepare_dictionaries(df_val)

In [15]:
len(df_train),len(df_val)

(73908, 61921)

In [22]:
with mlflow.start_run():
    params=dict(max_depth=20,n_estimators=100,min_samples_leaf=10,random_state=0)
    mlflow.log_params(params)

    dv = DictVectorizer()
    model = RandomForestRegressor(**params,n_jobs=-1)

    X_train = dv.fit_transform(dict_train)
    model.fit(X_train,y_train)

    X_val = dv.transform(dict_val)
    y_pred = model.predict(X_val)

   
    mse = mean_squared_error(y_val, y_pred)
    print(params,mse)
    mlflow.log_metric('mse',mse)
    
    mlflow.sklearn.log_model(model,artifact_path="model")

    with open('dict_vectorizer.bin','wb') as f_out:
        pickle.dump(dv,f_out)
    mlflow.log_artifact('dict_vectorizer.bin')

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 45.64114429815588




🏃 View run burly-seal-331 at: http://127.0.0.1:5000/#/experiments/1/runs/0b524fa448444456aa0ead18def76fcd
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


### using sklearn pipeline

In [16]:
with mlflow.start_run():
    params=dict(max_depth=20,n_estimators=100,min_samples_leaf=10,random_state=0)
    mlflow.log_params(params)
    pipeline=make_pipeline(
        DictVectorizer(),
        RandomForestRegressor(**params,n_jobs=-1)
    )
    pipeline.fit(dict_train,y_train)
    y_pred=pipeline.predict(dict_val)
    
    mse=mean_squared_error(y_pred,y_val)
    print(params,mse)
    mlflow.log_metric('mse',mse)
    mlflow.sklearn.log_model(pipeline,artifact_path="model")
    
    

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 45.64114429815588




🏃 View run popular-bird-15 at: http://127.0.0.1:5000/#/experiments/1/runs/fad1893cd9bb408babd99b113e83e900
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [19]:
from mlflow.tracking import MlflowClient

In [23]:
MLFLOW_TRACKING_URI='http://127.0.0.1:5000'
RUN_ID='0b524fa448444456aa0ead18def76fcd'
client=MlflowClient(tracking_uri=MLFLOW_TRACKING_URI)

In [24]:
path=client.download_artifacts(run_id=RUN_ID,path='dict_vectorizer.bin')

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [25]:
with open(path,'rb')as f_out:
    dv=pickle.load(f_out)
          

In [26]:
dv

DictVectorizer()