In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error


In [2]:
from sklearn.pipeline import make_pipeline

In [3]:
pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install boto3

Note: you may need to restart the kernel to use updated packages.


In [5]:
import boto3
print(boto3.__version__)

1.21.32


In [6]:
import os

# Set the AWS profile to use
os.environ['AWS_PROFILE'] = 'ml_user'

In [7]:
import boto3

# Create a session and specify the region if necessary
session = boto3.Session(profile_name='ml_user', region_name='us-east-1')

# Create the S3 client and list buckets
s3_client = session.client('s3')
response = s3_client.list_buckets()
print(response)


{'ResponseMetadata': {'RequestId': 'YNY6XC3AVQJF797F', 'HostId': '601lNtkqsgoZkt5Eb0uHd/18wjmWvSzuTVqM6hVdaZ2kejW+0W0kU6YXnFLnqdciu3OeuXah+z4=', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amz-id-2': '601lNtkqsgoZkt5Eb0uHd/18wjmWvSzuTVqM6hVdaZ2kejW+0W0kU6YXnFLnqdciu3OeuXah+z4=', 'x-amz-request-id': 'YNY6XC3AVQJF797F', 'date': 'Tue, 08 Apr 2025 21:43:13 GMT', 'content-type': 'application/xml', 'transfer-encoding': 'chunked', 'server': 'AmazonS3'}, 'RetryAttempts': 0}, 'Buckets': [{'Name': 'ml-sagemaker-practise', 'CreationDate': datetime.datetime(2025, 3, 5, 16, 9, 31, tzinfo=tzlocal())}, {'Name': 'mlflow-models-slv', 'CreationDate': datetime.datetime(2025, 3, 21, 15, 25, 50, tzinfo=tzlocal())}, {'Name': 'sagemaker-studio-585768144809-iwglcnciicp', 'CreationDate': datetime.datetime(2025, 1, 21, 20, 41, 34, tzinfo=tzlocal())}, {'Name': 'sagemaker-studio-585768144809-q9v9klv6dxe', 'CreationDate': datetime.datetime(2025, 1, 21, 20, 37, 14, tzinfo=tzlocal())}, {'Name': 'sagemaker-studio-58576

In [12]:
import mlflow
print(mlflow.__version__)


AttributeError: partially initialized module 'mlflow' has no attribute 'version' (most likely due to a circular import)

In [8]:
import mlflow
mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("green taxi duration")

ImportError: cannot import name 'tarfile' from 'backports' (/home/ubuntu/anaconda3/lib/python3.9/site-packages/backports/__init__.py)

In [38]:
def read_dataframe(filename: str):
    df=pd.read_parquet(filename)
    df['duration']=df.lpep_dropoff_datetime - df.lpep_pickup_datetime
    df.duration=df.duration.apply(lambda m : m.total_seconds() / 60)
    
    df=df[(df.duration >= 1) & (df.duration <= 60)]
    categorical=['PULocationID','DOLocationID']
    df[categorical]=df[categorical].astype(str)
    return df    
    

In [39]:
def prepare_dictionaries(df:pd.DataFrame):
    df['PU_DO']=df['PULocationID'] + '_' + df['DOLocationID']
    categorical=['PU_DO']
    numerical=['trip_distance']
    dicts=df[categorical + numerical].to_dict(orient='records')
    return dicts

In [40]:
df_train=read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-01.parquet')
df_val=read_dataframe('https://d37ci6vzurychx.cloudfront.net/trip-data/green_tripdata_2021-02.parquet')
target='duration'
y_train=df_train[target].values
y_val=df_val[target].values

dict_train=prepare_dictionaries(df_train)
dict_val=prepare_dictionaries(df_val)

In [41]:
len(df_train),len(df_val)

(73908, 61921)

In [42]:
with mlflow.start_run():
    params=dict(max_depth=20,n_estimators=100,min_samples_leaf=10,random_state=0)
    mlflow.log_params(params)

    dv = DictVectorizer()
    model = RandomForestRegressor(**params,n_jobs=-1)

    X_train = dv.fit_transform(dict_train)
    model.fit(X_train,y_train)

    X_val = dv.transform(dict_val)
    y_pred = model.predict(X_val)

   
    mse = mean_squared_error(y_val, y_pred)
    print(params,mse)
    mlflow.log_metric('mse',mse)
    
    mlflow.sklearn.log_model(model,artifact_path="model")

    with open('dict_vectorizer.bin','wb') as f_out:
        pickle.dump(dv,f_out)
    

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 45.64114429815588




🏃 View run gaudy-carp-451 at: http://127.0.0.1:5000/#/experiments/1/runs/da8ff84b1e2e43e4912e184470d7aec2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


### using sklearn pipeline

In [43]:
with mlflow.start_run():
    params=dict(max_depth=20,n_estimators=100,min_samples_leaf=10,random_state=0)
    mlflow.log_params(params)
    pipeline=make_pipeline(
        DictVectorizer(),
        RandomForestRegressor(**params,n_jobs=-1)
    )
    pipeline.fit(dict_train,y_train)
    y_pred=pipeline.predict(dict_val)
    
    mse=mean_squared_error(y_pred,y_val)
    print(params,mse)
    mlflow.log_metric('mse',mse)
    mlflow.sklearn.log_model(pipeline,artifact_path="model")
    
    

{'max_depth': 20, 'n_estimators': 100, 'min_samples_leaf': 10, 'random_state': 0} 45.64114429815588




🏃 View run smiling-loon-660 at: http://127.0.0.1:5000/#/experiments/1/runs/51ac55521a414c17869dc2eccb873ee2
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


In [None]:
with open(path,'rb')as f_out:
    dv=pickle.load(f_out)
          