In [22]:
import pickle
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [23]:
from sklearn.pipeline import make_pipeline

In [32]:
pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Downloading python_dotenv-1.0.1-py3-none-any.whl (19 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install mlflow

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install boto3

Collecting boto3
  Downloading boto3-1.37.18-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.38.0,>=1.37.18 (from boto3)
  Downloading botocore-1.37.18-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.12.0,>=0.11.0 (from boto3)
  Using cached s3transfer-0.11.4-py3-none-any.whl.metadata (1.7 kB)
Collecting urllib3<1.27,>=1.25.4 (from botocore<1.38.0,>=1.37.18->boto3)
  Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)
Downloading boto3-1.37.18-py3-none-any.whl (139 kB)
Downloading botocore-1.37.18-py3-none-any.whl (13.4 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.4/13.4 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m0m eta [36m0:00:01[0m
[?25hUsing cached jmespath-1.0.1-py3-none-any.whl (20 kB)
Using cached s3transfer-0.11.4-py3-none-any.whl (84 kB)
Downloading urllib3-1.26.20-py2.py3-none-any.whl (144 

In [5]:
import boto3
# Show which boto3 version the kernel actually imported.
print(boto3.__version__)

1.37.18


In [35]:
import os
from dotenv import load_dotenv

# Load settings from a .env file (python-dotenv defaults) before reading them
# back from the process environment.
load_dotenv()

# Credentials and region come from the environment, never from literals in the
# notebook; a variable that is not set yields None rather than raising.
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID')
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY')
DEFAULT_REGION_NAME = os.environ.get('DEFAULT_REGION_NAME')

# Export the named AWS profile for anything that reads AWS_PROFILE later on.
os.environ["AWS_PROFILE"] = "ml_user"

In [36]:
import boto3

# Smoke test: build a session from the named 'ml_user' AWS profile.
session = boto3.Session(profile_name='ml_user')

# S3 client that uses the credentials resolved by that session.
s3_client = session.client('s3')

# List the objects in the 'mlflow-models-slv' bucket to confirm it is reachable.
response = s3_client.list_objects_v2(Bucket='mlflow-models-slv')

# Dump the raw response dict (response metadata plus the 'Contents' listing).
print(response)

{'ResponseMetadata': {'RequestId': 'ARHF2CJVACDEXSZ7', 'HostId': 'bFEJEYSeHrY2Jh+H9Oc6Z6K9OYLc9jiELdEvUeizRz2LDOKjM/kag03VG8Q+BR63MfVTz81XGZQ=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': 'bFEJEYSeHrY2Jh+H9Oc6Z6K9OYLc9jiELdEvUeizRz2LDOKjM/kag03VG8Q+BR63MfVTz81XGZQ=', 'x-amz-request-id': 'ARHF2CJVACDEXSZ7', 'date': 'Fri, 21 Mar 2025 22:34:47 GMT', 'x-amz-bucket-region': 'us-east-1', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'IsTruncated': False, 'Contents': [{'Key': 'test_file/', 'LastModified': datetime.datetime(2025, 3, 21, 22, 23, 31, tzinfo=tzlocal()), 'ETag': '"d41d8cd98f00b204e9800998ecf8427e"', 'ChecksumAlgorithm': ['CRC64NVME'], 'ChecksumType': 'FULL_OBJECT', 'Size': 0, 'StorageClass': 'STANDARD'}], 'Name': 'mlflow-models-slv', 'Prefix': '', 'MaxKeys': 1000, 'EncodingType': 'url', 'KeyCount': 1}


In [37]:
import mlflow
# Record the runs below against the tracking server at this address.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Select the experiment by name; kept as the last expression so the returned
# Experiment object is displayed as the cell output.
mlflow.set_experiment("green taxi duration")

<Experiment: artifact_location='s3://mlflow-models-slv/1', creation_time=1742594167031, experiment_id='1', last_update_time=1742594167031, lifecycle_stage='active', name='green taxi duration', tags={}>

In [38]:
def read_dataframe(filename: str):
    """Load a green-taxi trip parquet file and prepare it for modelling.

    Parameters
    ----------
    filename : str
        Path or URL handed straight to ``pd.read_parquet``.

    Returns
    -------
    pd.DataFrame
        The trips whose duration is between 1 and 60 minutes (both inclusive),
        with an added ``duration`` column (minutes, float) and with
        ``PULocationID`` / ``DOLocationID`` cast to ``str``.
    """
    df = pd.read_parquet(filename)

    # Trip length in minutes, computed with the vectorised timedelta accessor
    # instead of a per-row Python lambda.
    trip_time = df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df['duration'] = trip_time.dt.total_seconds() / 60

    # .copy() gives an independent frame, so the column assignment below (and
    # any later one by callers) is not a chained assignment on a slice.
    df = df[(df.duration >= 1) & (df.duration <= 60)].copy()

    # Location IDs are categories, not quantities: store them as strings.
    categorical = ['PULocationID', 'DOLocationID']
    df[categorical] = df[categorical].astype(str)
    return df
    

In [39]:
def prepare_dictionaries(df: pd.DataFrame):
    """Turn a trips frame into a list of per-row feature dicts.

    Side effect: a ``PU_DO`` column (pickup and drop-off location IDs joined
    with ``_``) is added to ``df`` itself. The ID columns are expected to hold
    strings already, as produced by ``read_dataframe``.

    Returns a list with one dict per row, each holding ``PU_DO`` and
    ``trip_distance``.
    """
    pickup, dropoff = df['PULocationID'], df['DOLocationID']
    df['PU_DO'] = pickup + '_' + dropoff

    feature_columns = ['PU_DO', 'trip_distance']
    return df[feature_columns].to_dict(orient='records')

In [40]:
target = 'duration'

# January 2021 trips are used for training, February 2021 trips for validation.
df_train = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val = read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')

# Regression targets as plain NumPy arrays.
y_train = df_train[target].to_numpy()
y_val = df_val[target].to_numpy()

# Per-row feature dicts, ready for a DictVectorizer.
dict_train = prepare_dictionaries(df_train)
dict_val = prepare_dictionaries(df_val)

In [41]:
# Row counts of the training and validation frames.
len(df_train),len(df_val)

(73908, 61921)

In [42]:
# Train a random forest on vectorised trip features and record the parameters,
# validation MSE, model and fitted vectorizer in a single MLflow run.
with mlflow.start_run():
    params = dict(max_depth=20, n_estimators=100, min_samples_leaf=10, random_state=0)
    mlflow.log_params(params)

    dv = DictVectorizer()
    # n_jobs is a runtime setting, not a hyperparameter, so it is kept out of `params`.
    model = RandomForestRegressor(**params, n_jobs=-1)

    X_train = dv.fit_transform(dict_train)
    model.fit(X_train, y_train)

    # Validation data is only transformed: the vocabulary comes from the training set.
    X_val = dv.transform(dict_val)
    y_pred = model.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    print(params, mse)
    mlflow.log_metric('mse', mse)

    mlflow.sklearn.log_model(model, artifact_path="model")

    # The logged model only accepts already-vectorised input, so the fitted
    # DictVectorizer is written to disk and attached to the same run; without
    # it the stored model cannot be used on raw trip dicts.
    with open('dict_vectorizer.bin', 'wb') as f_out:
        pickle.dump(dv, f_out)
    mlflow.log_artifact('dict_vectorizer.bin')
    

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 45.64114429815588




🏃 View run gaudy-carp-451 at: http://127.0.0.1:5000/#/experiments/1/runs/da8ff84b1e2e43e4912e184470d7aec2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


### Using an sklearn pipeline

In [43]:
# Same model as the previous run, but the DictVectorizer and the forest are
# bundled in one sklearn Pipeline so a single logged artifact accepts raw
# feature dicts.
with mlflow.start_run():
    params = dict(max_depth=20, n_estimators=100, min_samples_leaf=10, random_state=0)
    mlflow.log_params(params)

    pipeline = make_pipeline(
        DictVectorizer(),
        # n_jobs is a runtime setting, not a hyperparameter, so it is kept out of `params`.
        RandomForestRegressor(**params, n_jobs=-1),
    )
    pipeline.fit(dict_train, y_train)
    y_pred = pipeline.predict(dict_val)

    # Arguments in (y_true, y_pred) order, matching the function signature and
    # the previous run; the MSE value itself is unchanged.
    mse = mean_squared_error(y_val, y_pred)
    print(params, mse)
    mlflow.log_metric('mse', mse)
    mlflow.sklearn.log_model(pipeline, artifact_path="model")
    
    

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 45.64114429815588




🏃 View run smiling-loon-660 at: http://127.0.0.1:5000/#/experiments/1/runs/51ac55521a414c17869dc2eccb873ee2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [None]:
# Reload the DictVectorizer pickled by the first training run in this notebook.
# NOTE(review): `path` was undefined in this cell; it is assumed to be the local
# 'dict_vectorizer.bin' written earlier -- confirm, or point it at a downloaded copy.
# pickle.load can execute arbitrary code: only open files you produced yourself.
path = 'dict_vectorizer.bin'
with open(path, 'rb') as f_in:
    dv = pickle.load(f_in)
          