In [9]:
from mltrace import get_db_uri, set_db_uri, create_component, tag_component, register
from utils import io, models

import os
import pandas as pd

# Develop a model offline

This notebook trains a model on January 2020 data and evaluates it on February 2020 data.

In [10]:
# Notebook parameters: every value a reader might tune lives in this cell.

# Months (YYYY_MM) of feature data used for training and for evaluation.
train_month = '2020_01'
test_month = '2020_02'

# Address substituted into the mltrace database URI by the metadata cell.
# NOTE(review): hard-coded address; consider sourcing it from configuration.
mltrace_server = '54.177.215.161'

# Label column, followed by the columns handed to the model as features.
label_column = 'high_tip_indicator'
feature_columns = [
    'pickup_weekday',
    'pickup_hour',
    'pickup_minute',
    'work_hours',
    'passenger_count',
    'trip_distance',
    'trip_time',
    'trip_speed',
    'PULocationID',
    'DOLocationID',
    'RatecodeID',
]

# Passed to the model wrapper as its `model_params` argument.
model_params = dict(max_depth=10)

In [3]:
# Point mltrace at the configured server, then declare the dev components.

# Replace the 'database' substring of the current URI with the server address.
db_uri = get_db_uri().replace('database', mltrace_server)
set_db_uri(db_uri)

# (name, description) pairs; each component is created and then tagged, and
# both receive the same third argument ('shreya') and the same tag list.
dev_components = (
    ('split_dev', 'Splitting features into train and test sets. Dev/offline version'),
    ('train_dev', 'Training a model. Dev/offline version.'),
)
for component_name, component_description in dev_components:
    create_component(component_name, component_description, 'shreya')
    tag_component(component_name, ['training', 'dev'])

## Split

In [5]:
# Read the feature set for each month.
# NOTE(review): presumably `load_output_df` returns the most recent file
# under the given key -- confirm in utils.io.
train_features_key = os.path.join('features', train_month)
test_features_key = os.path.join('features', test_month)
train_df = io.load_output_df(train_features_key)
test_df = io.load_output_df(test_features_key)

# Persist both sets as intermediate outputs and keep the returned paths,
# which later cells print and attach to the model wrapper.
train_output_path = io.save_output_df(train_df, 'training/files/train', dev=True)
test_output_path = io.save_output_df(test_df, 'training/files/test', dev=True)

In [6]:
# Show where the train and test sets were written, one path per line.
print(train_output_path, test_output_path, sep='\n')

s3://toy-applied-ml-pipeline/dev/training/files/train/20210501-150521.pq
s3://toy-applied-ml-pipeline/dev/training/files/test/20210501-150811.pq


## Train dev model

In [7]:
# Reload the train and test sets from their saved dev locations rather than
# reusing the in-memory frames produced in the Split section.
train_split_key = 'training/files/train'
test_split_key = 'training/files/test'

train_df = io.load_output_df(train_split_key, dev=True)
test_df = io.load_output_df(test_split_key, dev=True)

In [11]:
# Build the random-forest wrapper from the notebook parameters, then train it
# on the training set against the label column.
wrapper_kwargs = dict(feature_columns=feature_columns, model_params=model_params)
mw = models.RandomForestModelWrapper(**wrapper_kwargs)
mw.train(train_df, label_column)

In [13]:
# Score the model on both sets, then record the data paths and metrics on the
# model wrapper. TODO: include MLFlow here
# NOTE(review): the metrics are stored under '*_f1' keys; presumably
# `mw.score` returns an F1 score -- confirm in utils.models.
# NOTE(review): the *_output_path values come from the Split section, so this
# cell fails if that section was skipped after a kernel restart.
train_score = mw.score(train_df, label_column)
test_score = mw.score(test_df, label_column)

for path_key, path_value in (
    ('train_df', train_output_path),
    ('test_df', test_output_path),
):
    mw.add_data_path(path_key, path_value)

for metric_key, metric_value in (
    ('train_f1', train_score),
    ('test_f1', test_score),
):
    mw.add_metric(metric_key, metric_value)

In [14]:
# Report what the wrapper has recorded: data paths and metrics under their
# headings, followed by the feature importances.
for heading, fetch in (
    ('Paths:', mw.get_data_paths),
    ('Metrics:', mw.get_metrics),
):
    print(heading)
    print(fetch())
print(mw.get_feature_importances())

Paths:
{'train_df': 's3://toy-applied-ml-pipeline/dev/training/files/train/20210501-150521.pq', 'test_df': 's3://toy-applied-ml-pipeline/dev/training/files/test/20210501-150811.pq'}
Metrics:
{'train_f1': 0.7370630497255707, 'test_f1': 0.7190508623877429}
            feature  importance
0     trip_distance    0.218205
1        RatecodeID    0.191648
2   passenger_count    0.142927
3         trip_time    0.129717
4      PULocationID    0.116504
5      DOLocationID    0.109360
6       pickup_hour    0.062277
7    pickup_weekday    0.020527
8        work_hours    0.006613
9     pickup_minute    0.001911
10       trip_speed    0.000310


In [15]:
# Persist the trained model wrapper under the dev models location and echo
# the path that `save` returns.
output_path = mw.save(
    'training/models',
    dev=True,
)
print(output_path)

s3://toy-applied-ml-pipeline/dev/training/models/20210501-151804.pkl
