In [99]:
import datetime

import disdat.api as api
from disdat.api import Bundle
import pandas as pd
import pickle
import time

from pipelines.return_targets import ReturnTargets

from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model, tree
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score, cross_val_predict, cross_validate
from sklearn.ensemble import BaggingClassifier, BaggingRegressor, RandomForestRegressor, RandomForestClassifier

In [101]:
data_context = 'example-context'

# Create data context
api.context(data_context)
api.remote(data_context, data_context, "s3://cdo-disdat-prod", force=True)   

# Remove all bundles for clean execution
found_bundles = api.search(data_context)
if len(found_bundles) > 0:
    print("Found {} bundles in context {}, removing ...".format(len(found_bundles), data_context))
    api.rm(data_context, rm_all=True)

assert api.search(data_context) == []

Context already bound to remote at s3://cdo-disdat-prod/


In [102]:
# Execute pipeline
api.apply(data_context, ReturnTargets)

INFO: Informed scheduler that task   DriverTask_False______9f45fa3e13   has status   PENDING
INFO: Informed scheduler that task   ReturnTargets__99914b932b   has status   PENDING
INFO: Done scheduling tasks
INFO: Running Worker with 1 processes
INFO: [pid 18535] Worker Worker(salt=654229521, workers=1, host=INTUL173d00074, username=kyocum, pid=18535) running   ReturnTargets()
INFO: [pid 18535] Worker Worker(salt=654229521, workers=1, host=INTUL173d00074, username=kyocum, pid=18535) done      ReturnTargets()
INFO: Informed scheduler that task   ReturnTargets__99914b932b   has status   DONE
INFO: [pid 18535] Worker Worker(salt=654229521, workers=1, host=INTUL173d00074, username=kyocum, pid=18535) running   DriverTask(output_bundle=-, pipe_params={}, pipe_cls=<class 'pipelines.return_targets.ReturnTargets'>, input_tags={}, output_tags={}, force=False)
INFO: [pid 18535] Worker Worker(salt=654229521, workers=1, host=INTUL173d00074, username=kyocum, pid=18535) done      DriverTask(output_bun

-------------------
return_targets is Running!
-------------------



{'success': True, 'did_work': True}

In [104]:
bundle = api.get(data_context, 'return_targets')
print('name:', bundle.name)
print('data:', bundle.data)
print('creation date:', datetime.datetime.utcfromtimestamp(bundle.creation_date))
print('uuid:', bundle.uuid)
print('gitstuff:', bundle.git_info)
display(pd.read_csv(bundle.data,index_col=0))

name: return_targets
data: /Users/kyocum/.disdat/context/example-context/objects/cb46cc99-e4f5-416b-9df6-2d2e31f5b449/df.csv
creation date: 2020-05-01 21:57:03.762671
uuid: cb46cc99-e4f5-416b-9df6-2d2e31f5b449
gitstuff: ('git@github.com:seanr15/disdat-examples.git', '0ab06a25f36d833c5b0df5b0af10f6b8a9407226', 'kyocum/newapi')


Unnamed: 0,a,b
0,1,4
1,2,5
2,3,6


In [105]:
path = bundle.data
df=pd.read_csv(path, index_col=0)
X_train = df.a.values.reshape(-1,1)
Y_train = df.b.values.reshape(-1,1)

In [107]:
model_types = {'lin':linear_model.LinearRegression(),
               'tree':tree.DecisionTreeRegressor(min_samples_leaf=10, criterion='mse'),
               'rf': RandomForestRegressor(n_estimators=1000, n_jobs=-1, max_leaf_nodes=16)
              }
models = {}

for k,v in model_types.items():
    start = time.time()
    clf = v.fit(X_train, Y_train)
    stop = time.time()
    with Bundle(data_context) as b:
        b.name=k
        b.add_timing(start,stop)
        b.add_data(str(pickle.dumps(m)))
        b.add_params({'type':'k'})        

In [108]:
for b in api.search(data_context):
    print(b.name)
    print(b.processing_name)
    print(b.timing[1] - b.timing[0])
    b.commit().push()

rf
_8ce4b16b22_d41d8cd98f
1.228410005569458
Pushed committed bundle None uuid 81a20fe6-dd4d-49c1-872e-03291ab81675 to remote s3://cdo-disdat-prod/context
tree
_8ce4b16b22_d41d8cd98f
0.0007390975952148438
Pushed committed bundle None uuid c5e05da4-cae2-4f85-bbf1-ba7cafab2d4d to remote s3://cdo-disdat-prod/context
lin
_8ce4b16b22_d41d8cd98f
0.0004241466522216797
Pushed committed bundle None uuid 22ba4d31-8656-45bd-b2b0-0c3db734bf51 to remote s3://cdo-disdat-prod/context
lin
_d41d8cd98f_d41d8cd98f
0.002988100051879883
Pushed committed bundle None uuid 60105073-cd5b-4474-8e38-7e5ce0271619 to remote s3://cdo-disdat-prod/context
return_targets
ReturnTargets__99914b932b_d41d8cd98f
0.010350227355957031
Pushed committed bundle None uuid cb46cc99-e4f5-416b-9df6-2d2e31f5b449 to remote s3://cdo-disdat-prod/context
