In [None]:

import pandas as pd
from daggerml import Dml

from dml_util import S3Store, funkify

In [None]:
dml = Dml(repo="tutorial", branch="main")
dag = dml.new("ml-example-2")
s3 = S3Store()

In [None]:
@funkify
def load_data(dag):
    from tempfile import NamedTemporaryFile

    import pandas as pd
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split

    from dml_util import S3Store

    s3 = S3Store()
    params = dag.argv[1].value()
    X, y = load_iris(as_frame=True, return_X_y=True)
    splits = train_test_split(X, y, random_state=params["random_state"])
    out = {}
    for name, spl in zip(["X_train", "X_test", "y_train", "y_test"], splits):
        with NamedTemporaryFile() as temp:
            if isinstance(spl,pd.Series):
                spl = spl.to_frame("class")
            spl.to_parquet(temp.name)
            temp.seek(0)
            out[name] = s3.put(filepath=temp.name, suffix=".parquet")

    return out

dag.load_data = load_data
params = {"random_state": 2}
iris_data = dag.load_data(params, name="iris_data")
iris_data

In [18]:
@funkify
def fit_model(dag):
    import pickle
    from time import time

    import pandas as pd
    from sklearn.cluster import KMeans

    from dml_util import S3Store

    t_0 = time()
    train = dag.argv[1].value()
    params = dag.argv[2].value()
    clusterer = KMeans(**params)
    iris_train = pd.read_parquet(train.uri,engine="fastparquet")
    fitted = clusterer.fit(iris_train)
    s3 = S3Store()
    t_n = time()
    dag.elapsed = t_n - t_0

    return s3.put(pickle.dumps(fitted), suffix=".pkl")

print(type(fit_model))
print(fit_model.uri)
print(fit_model.data["script"])

<class 'daggerml.core.Resource'>
script
#!/usr/bin/env python3
from dml_util import aws_fndag

def fit_model(dag):
    import pickle
    from time import time

    import pandas as pd
    from sklearn.cluster import KMeans

    from dml_util import S3Store

    t_0 = time()
    train = dag.argv[1].value()
    params = dag.argv[2].value()
    clusterer = KMeans(**params)
    iris_train = pd.read_parquet(train.uri,engine="fastparquet")
    fitted = clusterer.fit(iris_train)
    s3 = S3Store()
    t_n = time()
    dag.elapsed = t_n - t_0

    return s3.put(pickle.dumps(fitted), suffix=".pkl")

if __name__ == "__main__":
    with aws_fndag() as dag:
        res = fit_model(dag)
        if dag._ref is None:
            dag.result = res


In [None]:
dag.fit_model = fit_model
fitted = dag.fit_model(iris_data["X_train"], {"n_clusters": 3})

In [None]:
@funkify
def predict(dag):
    import pickle
    from tempfile import NamedTemporaryFile

    import pandas as pd

    from dml_util import S3Store
    s3 = S3Store()

    model = pickle.loads(s3.get(dag.argv[1]))
    X_test = pd.read_parquet(dag.argv[2].value().uri, engine="fastparquet")
    predictions = model.transform(X_test)
    preds_df = pd.DataFrame(predictions,index=X_test.index,columns=[f"c{i}" for i in range(predictions.shape[1])])

    with NamedTemporaryFile() as temp:
        preds_df.to_parquet(temp.name)
        temp.seek(0)
        return s3.put(filepath=temp.name, suffix=".parquet")

dag.predict = predict

In [None]:
predictions = dag.predict(fitted, iris_data["X_test"])

In [None]:
pd.read_parquet(predictions.value().uri)

In [None]:
dag.result = predictions