In [1]:
import pandas as pd
import numpy as np
import pyarrow

# Models
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error

In [2]:
# Get current working directory
!pwd

/home/ubuntu/mlops_zoomcamp_sam/02-mlflow


In [3]:
import mlflow

mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment('nyc-taxi-experiment')

<Experiment: artifact_location='/home/ubuntu/mlops_zoomcamp_sam/02-mlflow/mlruns/1', creation_time=1684758245252, experiment_id='1', last_update_time=1684758245252, lifecycle_stage='active', name='nyc-taxi-experiment', tags={}>

## Question 1: Download the data
- Read the data, how many columns are there?

In [4]:
# Get current working directory
!pwd

/home/ubuntu/mlops_zoomcamp_sam/02-mlflow


In [5]:
import pathlib
data_path = pathlib.Path('../data')

def import_parquet_data(data_path, file):
    return pd.read_parquet(pathlib.Path(data_path, file), engine = 'pyarrow')

df = import_parquet_data(data_path, 'yellow_tripdata_2022-01.parquet')

The data has 19 columns

## Question 2. Computing Duration
- What's the standard deviation fo trips duration in January?

In [6]:
df

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,1,2022-01-01 00:35:40,2022-01-01 00:53:29,2.0,3.80,1.0,N,142,236,1,14.50,3.0,0.5,3.65,0.0,0.3,21.95,2.5,0.0
1,1,2022-01-01 00:33:43,2022-01-01 00:42:07,1.0,2.10,1.0,N,236,42,1,8.00,0.5,0.5,4.00,0.0,0.3,13.30,0.0,0.0
2,2,2022-01-01 00:53:21,2022-01-01 01:02:19,1.0,0.97,1.0,N,166,166,1,7.50,0.5,0.5,1.76,0.0,0.3,10.56,0.0,0.0
3,2,2022-01-01 00:25:21,2022-01-01 00:35:23,1.0,1.09,1.0,N,114,68,2,8.00,0.5,0.5,0.00,0.0,0.3,11.80,2.5,0.0
4,2,2022-01-01 00:36:48,2022-01-01 01:14:20,1.0,4.30,1.0,N,68,163,1,23.50,0.5,0.5,3.00,0.0,0.3,30.30,2.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2463926,2,2022-01-31 23:36:53,2022-01-31 23:42:51,,1.32,,,90,170,0,8.00,0.0,0.5,2.39,0.0,0.3,13.69,,
2463927,2,2022-01-31 23:44:22,2022-01-31 23:55:01,,4.19,,,107,75,0,16.80,0.0,0.5,4.35,0.0,0.3,24.45,,
2463928,2,2022-01-31 23:39:00,2022-01-31 23:50:00,,2.10,,,113,246,0,11.22,0.0,0.5,2.00,0.0,0.3,16.52,,
2463929,2,2022-01-31 23:36:42,2022-01-31 23:48:45,,2.92,,,148,164,0,12.40,0.0,0.5,0.00,0.0,0.3,15.70,,


In [7]:
df['Duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df.Duration = df.Duration.apply(lambda td: td.total_seconds() / 60)
df['Duration'].std() # numpy standard deviation
# 46.45

46.44530513776802

## Question 3. Dropping Outliers
- Keep only between 1 and 60 minute rides (inclusive)

In [8]:
 # Use DataFrame.loc[] to filter by multiple conditions
df2 = df.loc[(df['Duration']>=1) & (df['Duration'] <= 60), :]

In [9]:
assert df2['Duration'].min() > .98, 'Check on Min Value Filtering'
assert df2['Duration'].max() < 60.2, 'Check on Max Value Filtering'

In [10]:
df2.shape[0] / df.shape[0] # 98.27%

0.9827547930522406

## Q4. One Hot Encoding
- Turn DF into list of dicts
- Fit DicVectorizor
- Get a feature matrix

- What's the dimensionality of the matrix?

In [11]:
# Only keep pickup and dropoff location columns
categorical = ['PULocationID', 'DOLocationID']
df2[categorical] = df2[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2[categorical] = df2[categorical].astype(str)


In [12]:
# Turn DF into list of dicts
train_dicts = df2[categorical].to_dict(orient='records')

In [13]:
dv = DictVectorizer()
X_train = dv.fit_transform(train_dicts)

In [14]:
target = 'Duration'
y_train = df2[target].values

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_train)

mean_squared_error(y_train, y_pred, squared=False)

6.986191065500608

## Import Feb Data to Validate

In [15]:
df = import_parquet_data(data_path, 'yellow_tripdata_2022-02.parquet')

In [16]:
# Create Target
df['Duration'] = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
df.Duration = df.Duration.apply(lambda td: td.total_seconds() / 60)

# Remove Outliers
df_feb = df.loc[(df['Duration']>=1) & (df['Duration'] <= 60), :]

In [17]:
# Only keep pickup and dropoff location columns
categorical = ['PULocationID', 'DOLocationID']
df_feb[categorical] = df_feb[categorical].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_feb[categorical] = df_feb[categorical].astype(str)


In [18]:
# Turn DF into list of dicts
valid_dicts = df_feb[categorical].to_dict(orient='records')

In [19]:
x_valid = dv.transform(valid_dicts)
y_valid = df_feb[target].values

In [20]:
y_pred = lr.predict(x_valid)
mean_squared_error(y_valid, y_pred, squared=False)

7.786408015215065

In [21]:
import pickle
with open('../models/lin_reg.bin', 'wb') as f_out:
    pickle.dump((dv, lr), f_out)

In [22]:
import pathlib
with mlflow.start_run():
    # Set Model Run Info
    mlflow.set_tag('developer', 'sam')
    mlflow.log_param('train-data-path', pathlib.Path(data_path, 'yellow_tripdata_2022-01.parquet'))
    mlflow.log_param('valid-data-path', pathlib.Path(data_path, 'yellow_tripdata_2022-02.parquet'))
    
    alpha = 0.01
    mlflow.log_param('alpha', alpha)
    lr = Lasso(alpha)
    lr.fit(X_train, y_train)
    
    y_pred = lr.predict(x_valid)
    rmse = mean_squared_error(y_valid, y_pred, squared = False)
    mlflow.log_metric('rmse', rmse)

## Section 2.3 -- Experiment Tracking

In [23]:
import xgboost as xgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll import scope

In [27]:
from typing import Dict, Any
import xgboost as xgb
import mlflow
from sklearn.metrics import mean_squared_error
from numpy import ndarray
from xgboost import DMatrix

def objective(params: Dict[str, Any], train: DMatrix, valid: DMatrix, y_val: ndarray) -> Dict[str, Any]:
    """
    Train an XGBoost model with given parameters and datasets, log the training 
    process with MLFlow, make predictions on the validation set, calculate RMSE 
    and log it with MLFlow. Return the RMSE and the status.

    Parameters:
    params (dict): A dictionary of parameters to use for the XGBoost model.
    train (DMatrix): A DMatrix object representing the training dataset.
    valid (DMatrix): A DMatrix object representing the validation dataset.
    y_val (ndarray): A numpy array representing the actual target values for the validation dataset.

    Returns:
    dict: A dictionary with 'loss' key indicating the root mean squared error (RMSE) 
    on validation set and 'status' key indicating the status of the function.
    """
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}


In [36]:
def objective(params: Dict[str, Any]) -> Dict[str, Any]:
    with mlflow.start_run():
        mlflow.set_tag("model", "xgboost")
        mlflow.log_params(params)
        booster = xgb.train(
            params=params,
            dtrain=train,
            num_boost_round=1000,
            evals=[(valid, 'validation')],
            early_stopping_rounds=50
        )
        y_pred = booster.predict(valid)
        rmse = mean_squared_error(y_val, y_pred, squared=False)
        mlflow.log_metric("rmse", rmse)

    return {'loss': rmse, 'status': STATUS_OK}

In [37]:
# This dictionary defines the search space for hyperparameter optimization. The keys represent 
# different parameters of the XGBoost model, while the values specify distributions from which 
# values for these parameters will be sampled. Here:
# - 'max_depth' is sampled uniformly from integers between 4 and 100
# - 'learning_rate' is sampled log-uniformly in the range from 0.05 to 1
# - 'reg_alpha' is sampled log-uniformly in the range from 0.00001 to 0.1
# - 'reg_lambda' is sampled log-uniformly in the range from 0.000001 to 0.1
# - 'min_child_weight' is sampled log-uniformly in the range from 0.1 to 1000
# - 'objective' is fixed to 'reg:linear'
# - 'seed' is fixed to 42 for reproducibility
search_space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 100, 1)),
    'learning_rate': hp.loguniform('learning_rate', -3, 0),
    'reg_alpha': hp.loguniform('reg_alpha', -5, -1),
    'reg_lambda': hp.loguniform('reg_lambda', -6, -1),
    'min_child_weight': hp.loguniform('min_child_weight', -1, 3),
    'objective': 'reg:linear',
    'seed': 42
}


train = xgb.DMatrix(X_train, label = y_train)
valid = xgb.DMatrix(x_valid, label = y_valid)

best_result = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=50,
    trials=Trials()
)

[0]	validation-rmse:15.35985                          
[1]	validation-rmse:14.41090                          
[2]	validation-rmse:13.57295                          
[3]	validation-rmse:12.83231                          
[4]	validation-rmse:12.17382                          
[5]	validation-rmse:11.59770                          
[6]	validation-rmse:11.08326                          
[7]	validation-rmse:10.64031                          
[8]	validation-rmse:10.25453                          
[9]	validation-rmse:9.91548                           
[10]	validation-rmse:9.58507                          
[11]	validation-rmse:9.32857                          
[12]	validation-rmse:9.10308                          
[13]	validation-rmse:8.90627                          
[14]	validation-rmse:8.69451                          
[15]	validation-rmse:8.54580                          
[16]	validation-rmse:8.41701                          
[17]	validation-rmse:8.30162                          
[18]	valid

KeyboardInterrupt: 

In [None]:
# Model Training takes too long, going to C&P from course
params = {
   'learning_rate': .204721,
   'max_depth': 17,
   'objective': reg:linear,
   'reg_alpha': .2856789,
   'reg_lambda': .004264404,
   'seed':42
}

# We can autolog
# https://mlflow.org/docs/latest/tracking.html
mlflow.xgboost.autolog()
booster = xgb.train(
    params = params,
    dtrain=train,
    num_boost_round = 1000,
    evals = [(valid, 'validation')],
    early_stopping_rounds = 50
)