In [8]:
import datetime

import disdat.api as api
from disdat.api import Bundle
import pandas as pd
import pickle
import time

from pipelines.return_targets import ReturnTargets

from sklearn import datasets, linear_model, tree
from sklearn.metrics import  mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

# Create example context

In [9]:
# Name of the Disdat data context shared by the cells below.
data_context = 'example-context'

# Create the data context, then call rm with rm_all=True so the example starts
# from a clean slate (presumably this deletes bundles left over from an earlier
# run of this notebook -- confirm against the disdat API docs).
api.context(data_context)
api.rm(data_context, rm_all=True)

# Run a pipeline to generate training data

In [10]:
# Run the ReturnTargets pipeline inside the example context. The call is the
# last expression in the cell, so its result dict is shown as the cell output.
api.apply(data_context, ReturnTargets)

INFO: Informed scheduler that task   DriverTask_False______9f45fa3e13   has status   PENDING
INFO: Informed scheduler that task   ReturnTargets__99914b932b   has status   PENDING
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
INFO: [pid 74644] Worker Worker(salt=440402511, workers=1, host=INTUL173d00074, username=kyocum, pid=74644) running   ReturnTargets()
INFO: [pid 74644] Worker Worker(salt=440402511, workers=1, host=INTUL173d00074, username=kyocum, pid=74644) done      ReturnTargets()
INFO: Informed scheduler that task   ReturnTargets__99914b932b   has status   DONE
INFO: [pid 74644] Worker Worker(salt=440402511, workers=1, host=INTUL173d00074, username=kyocum, pid=74644) running   DriverTask(output_bundle=-, pipe_params={}, pipe_cls=<class 'pipelines.return_targets.ReturnTargets'>, input_tags={}, output_tags={}, force=False)
INFO: [pid 74644] Worker Worker(salt=440402511, workers=1, host=INTUL173d00074, username=kyocum, pid=74644) done      DriverTask(output_bun

-------------------
return_targets is Running!
-------------------



{'success': True, 'did_work': True}

# Retrieve training data and create splits

In [4]:
# Fetch the bundle named 'return_targets' from the example context and show
# its metadata.
train_bundle = api.get(data_context, 'return_targets')

# creation_date is a numeric timestamp. Build a timezone-aware UTC datetime:
# datetime.utcfromtimestamp() is deprecated since Python 3.12 and returns a
# naive object that is easy to misread as local time.
creation_time = datetime.datetime.fromtimestamp(
    train_bundle.creation_date, tz=datetime.timezone.utc
)

print('name:', train_bundle.name)
print('data:', train_bundle.data)
print('creation date:', creation_time)
print('uuid:', train_bundle.uuid)
print('gitstuff:', train_bundle.git_info)

# The bundle's data is the path of a CSV file; its first column is the index.
df = pd.read_csv(train_bundle.data, index_col=0)
display(df)

# Bracket access avoids clashes between column names and DataFrame attributes.
# Both arrays are reshaped to column vectors of shape (n_samples, 1).
X_train = df['a'].to_numpy().reshape(-1, 1)
Y_train = df['b'].to_numpy().reshape(-1, 1)

name: return_targets
data: /Users/kyocum/.disdat/context/example-context/objects/b25c9031-bbad-489f-801c-4d2c94ed80f9/df.csv
creation date: 2020-08-28 21:20:04.539730
uuid: b25c9031-bbad-489f-801c-4d2c94ed80f9
gitstuff: ('git@github.com:seanr15/disdat-examples.git', '63cb9dc6bd92aaae418e4ac637f6b93cf3435c4c', 'feature/sparkjob')


Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


# Train 3 different models, store each in a bundle

In [11]:
# Fixed seed for the stochastic estimators so that Restart & Run All
# reproduces the same fitted models and training errors.
RANDOM_SEED = 0

# Candidate regressors, keyed by the name used for the bundle and its
# 'model_type' tag. The tree relies on the default split criterion (squared
# error): the explicit 'mse' spelling was deprecated in scikit-learn 1.0 and
# removed in 1.2, so omitting it works on both old and new releases.
model_types = {
    'linear': linear_model.LinearRegression(),
    'tree': tree.DecisionTreeRegressor(min_samples_leaf=10, random_state=RANDOM_SEED),
    'random_forest': RandomForestRegressor(
        n_estimators=1000, n_jobs=-1, max_leaf_nodes=16, random_state=RANDOM_SEED
    ),
}

# Fitted estimators by name, kept in memory alongside the pickled copies.
models = {}

for model_name, estimator in model_types.items():
    # Time only the fit itself; the timestamps are recorded on the bundle.
    start = time.time()
    # ravel(): a single-target y is expected as a 1-D array; passing a column
    # vector makes RandomForestRegressor emit a DataConversionWarning.
    clf = estimator.fit(X_train, Y_train.ravel())
    stop = time.time()
    models[model_name] = clf

    # One bundle per model: the pickled estimator is the bundle's data, tags
    # make it searchable, and the training bundle is recorded as a dependency.
    with Bundle(data_context) as model_bundle:
        model_bundle.name = model_name
        model_bundle.add_timing(start, stop)
        fn = model_bundle.get_file("model.pkl")
        with open(fn, mode='wb') as f:
            pickle.dump(clf, f)
        model_bundle.add_data(fn)
        model_bundle.add_tags({'model': 'True', 'model_type': model_name})
        model_bundle.add_dependencies(train_bundle)

# Retrieve bundles by tag, use them to estimate training error

In [7]:
# Look up every bundle tagged as a model, reload its pickled estimator and
# report the training time and the mean absolute error on the training data.
for model_bundle in api.search(data_context, tags={'model': 'True'}):
    train_seconds = model_bundle.timing[1] - model_bundle.timing[0]
    print("Found {} bundle, took [{}] seconds to train".format(model_bundle.name, train_seconds))

    # NOTE: pickle.load can execute arbitrary code. It is acceptable here only
    # because these bundles were written by this notebook's own training cell;
    # do not point this loop at bundles from an untrusted source.
    with open(model_bundle.data, mode='rb') as f:
        clf = pickle.load(f)

    Y_pred = clf.predict(X_train)
    # The documented signature is mean_absolute_error(y_true, y_pred).
    print("Train Error MAE [{}]".format(mean_absolute_error(Y_train, Y_pred)))
    print()
    

Found random_forest bundle, took [1.1025018692016602] seconds to train
Train Error MAE [0.2926666666666667]

Found tree bundle, took [0.0003991127014160156] seconds to train
Train Error MAE [0.6666666666666666]

Found linear bundle, took [0.0005710124969482422] seconds to train
Train Error MAE [0.0]

