In [1]:
from mltrace import get_db_uri, set_db_uri, create_component, tag_component, register
from utils import io, models

import os
import pandas as pd

# Develop a model offline

This notebook trains a dev model on January 2020 data and evaluates it on February 2020 data. It then trains a production model on the February 2020 data.

In [2]:
# Params
#
# Everything a reader might want to tune for this notebook lives in this cell.

# Host of the mltrace server. It can be overridden through the MLTRACE_SERVER
# environment variable, so the address does not have to be edited in the notebook;
# the fallback is the address this notebook has always used.
mltrace_server = os.environ.get('MLTRACE_SERVER', '54.177.215.161')

# Feature months ('<year>_<month>') used for the dev train and test sets
train_month = '2020_01'
test_month = '2020_02'

# Columns handed to the model wrapper as features
feature_columns = [
    'pickup_weekday', 'pickup_hour', 'pickup_minute', 'work_hours',
    'passenger_count', 'trip_distance', 'trip_time', 'trip_speed',
    'PULocationID', 'DOLocationID', 'RatecodeID'
]
# Column handed to the model wrapper as the label
label_column = 'high_tip_indicator'

# Passed as model_params to models.RandomForestModelWrapper.
# NOTE(review): no random seed is set here, so the recorded scores may not be
# exactly reproducible — confirm how the wrapper forwards these params before adding one.
model_params = {
    'max_depth': 10
}

In [3]:
# Register this notebook's components with mltrace

# Point mltrace at the remote server by substituting it for the 'database'
# part of the currently configured URI.
set_db_uri(get_db_uri().replace('database', mltrace_server))

# (name, description, owner, tags) — each component is created, then tagged, in this order
component_specs = [
    ('split_dev', 'Splitting features into train and test sets. Dev/offline version', 'shreya', ['training', 'dev']),
    ('train_dev', 'Training a model. Dev/offline version.', 'shreya', ['training', 'dev']),
    ('train', 'Training a model.', 'shreya', ['training']),
]
for component_name, description, owner, tags in component_specs:
    create_component(component_name, description, owner)
    tag_component(component_name, tags)

## Split into train and test sets for dev model

In [4]:
@register('split_dev', input_vars=['input_files'], output_vars=['train_output_path', 'test_output_path'])
def split():
    """Copy the train-month and test-month features into the dev train/test sets.

    Reads the months from the notebook-level train_month and test_month.
    The local names input_files, train_output_path and test_output_path are
    referenced by name in the register decorator, so they must keep these names.

    Returns:
        Tuple (train_output_path, test_output_path) as returned by io.save_output_df.
    """
    train_key = os.path.join('features', train_month)
    test_key = os.path.join('features', test_month)

    # Load the features for each month
    train_df = io.load_output_df(train_key)
    test_df = io.load_output_df(test_key)

    # Paths of the feature files, recorded as this run's inputs
    input_files = [io.get_output_path(key) for key in (train_key, test_key)]

    # Persist both sets as dev outputs so the training step can read them back
    train_output_path = io.save_output_df(train_df, 'training/files/train', dev=True)
    test_output_path = io.save_output_df(test_df, 'training/files/test', dev=True)

    return train_output_path, test_output_path

In [5]:
# Run the split step and keep both saved paths; the dev training step takes them as arguments

train_output_path, test_output_path = split()

## Train dev model

In [6]:
@register('train_dev', inputs=[train_output_path, test_output_path], output_vars=['dev_output_path'])
def train_dev(train_output_path, test_output_path):
    """Fit the dev model, score it on the train and test sets, and save it.

    Args:
        train_output_path: path recorded on the model wrapper under 'train_df'.
        test_output_path: path recorded on the model wrapper under 'test_df'.

    Returns:
        The value returned by the wrapper's save call. It is bound to the local
        dev_output_path because the register decorator references that name.
    """
    # The frames are read back from the dev 'training/files' locations;
    # the path arguments are only recorded on the wrapper, not used for loading.
    train_df = io.load_output_df('training/files/train', dev=True)
    test_df = io.load_output_df('training/files/test', dev=True)

    # Build the wrapper from the notebook-level feature list and params, then fit it
    rf_wrapper = models.RandomForestModelWrapper(
        feature_columns=feature_columns, model_params=model_params)
    rf_wrapper.train(train_df, label_column)

    # Attach data paths and scores to the wrapper. TODO: include MLFlow here
    rf_wrapper.add_data_path('train_df', train_output_path)
    rf_wrapper.add_data_path('test_df', test_output_path)
    rf_wrapper.add_metric('train_f1', rf_wrapper.score(train_df, label_column))
    rf_wrapper.add_metric('test_f1', rf_wrapper.score(test_df, label_column))

    # Report what was recorded, followed by the feature importances
    print('Paths:')
    print(rf_wrapper.get_data_paths())
    print('Metrics:')
    print(rf_wrapper.get_metrics())
    print(rf_wrapper.get_feature_importances())

    # Save as a dev model
    dev_output_path = rf_wrapper.save('training/models', dev=True)
    return dev_output_path

In [7]:
# Train and save the dev model; it prints the recorded paths, metrics and feature importances

dev_output_path = train_dev(train_output_path, test_output_path)

Paths:
{'train_df': 's3://toy-applied-ml-pipeline/dev/training/files/train/20210501-165847.pq', 'test_df': 's3://toy-applied-ml-pipeline/dev/training/files/test/20210501-170136.pq'}
Metrics:
{'train_f1': 0.7310504153094398, 'test_f1': 0.735223195868965}
            feature  importance
0     trip_distance    0.214608
1        RatecodeID    0.208746
2   passenger_count    0.161279
3         trip_time    0.131596
4      PULocationID    0.112600
5      DOLocationID    0.091000
6       pickup_hour    0.057419
7    pickup_weekday    0.016199
8        work_hours    0.004890
9     pickup_minute    0.001364
10       trip_speed    0.000297


## Train production model

In [8]:
@register('train', inputs=[dev_output_path], input_vars=['train_file_path'], output_vars=['output_path'])
def train_prod(month='2020_02'):
    """Train the production model on one month of features and save it.

    Args:
        month: feature month to train on, as '<year>_<month>'. Defaults to
            '2020_02', the month this function previously had hard-coded.

    Returns:
        The value returned by the wrapper's save call. It is bound to the local
        output_path because the register decorator references that name
        (as it does train_file_path).
    """
    # Build the features key once so the loaded data and the recorded path
    # can never refer to different months.
    features_key = os.path.join('features', month)
    train_df = io.load_output_df(features_key)
    train_file_path = io.get_output_path(features_key)

    # Train model
    prod_mw = models.RandomForestModelWrapper(
        feature_columns=feature_columns, model_params=model_params)
    prod_mw.train(train_df, label_column)

    # Score on the training data only and record path and metric on the wrapper.
    # NOTE(review): the output saved with this notebook also lists 'test_df' and
    # 'test_f1' for the production model, which this function never sets —
    # presumably state shared between wrapper instances in utils.models; verify.
    train_score = prod_mw.score(train_df, label_column)
    prod_mw.add_data_path('train_df', train_file_path)
    prod_mw.add_metric('train_f1', train_score)

    # Print paths and metrics
    print('Paths:')
    print(prod_mw.get_data_paths())
    print('Metrics:')
    print(prod_mw.get_metrics())

    # Print feature importances
    print(prod_mw.get_feature_importances())

    # Save as a production (non-dev) model
    output_path = prod_mw.save('training/models', dev=False)
    return output_path

In [9]:
# Train and save the production model; it prints the recorded paths, metrics and feature importances

output_path = train_prod()

Paths:
{'train_df': 's3://toy-applied-ml-pipeline/dev/features/2020_02/20210429-194419.pq', 'test_df': 's3://toy-applied-ml-pipeline/dev/training/files/test/20210501-170136.pq'}
Metrics:
{'train_f1': 0.7370630497255707, 'test_f1': 0.735223195868965}
            feature  importance
0     trip_distance    0.218205
1        RatecodeID    0.191648
2   passenger_count    0.142927
3         trip_time    0.129717
4      PULocationID    0.116504
5      DOLocationID    0.109360
6       pickup_hour    0.062277
7    pickup_weekday    0.020527
8        work_hours    0.006613
9     pickup_minute    0.001911
10       trip_speed    0.000310
