In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

In [2]:
from splicemachine.spark import PySpliceContext
splice = PySpliceContext(spark)

In [7]:
from splicemachine.mlflow_support import *
from splicemachine.mlflow_support.utilities import get_user
mlflow.register_splice_context(splice)
schema = get_user()

  and should_run_async(code)


## One output node - Regression

In [8]:
# first neural network with keras tutorial
from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pandas as pd
import numpy as np
from random import random

with mlflow.start_run():
    # load the dataset
    dataset = loadtxt('data.csv', delimiter=',')
    # split into input (X) and output (y) variables
    X = dataset[:,0:8]
    y = dataset[:,8]
    # Make it a regression problem
    y = np.array([random() for _ in y])
    # define the keras model
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    # compile the keras model
    model.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
    # fit the keras model on the dataset
    model.fit(X, y, epochs=50, batch_size=10)

#     model.save('simple_model_multiclass.h5')
    mlflow.log_model(model, 'keras_model')
    print(mlflow.current_run_id())
    splice.dropTableIfExists(f'{schema}.keras_regression')
    
    run_id = mlflow.current_run_id()
    
    df = pd.DataFrame(X,columns=[f'C{i}' for i in range(len(X[0]))])
    jid = mlflow.deploy_db(schema, 'keras_regression', mlflow.current_run_id(), primary_key=[('MOMENT', 'INT')], df=df, create_model_table=True)
    mlflow.watch_job(jid)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50


Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Saving artifact of size: 5.376 KB to Splice Machine DB
acccfafdb9bf
Deploying model to database...


Passing in primary keys as a list of tuples is deprecated. Use dictionary {column name: type}


Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list
---Job Logs---
INFO     2020-09-14 18:40:12.351 - A service worker has found your request
INFO     2020-09-14 18:40:12.371 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2020-09-14 18:40:12.405 - Handler is available
INFO     2020-09-14 18:40:12.419 - Retrieving Run from MLFlow Tracking Server...INFO     2020-09-14 18:40:12.419 - Retrieving Run from MLFlow Tracking Server...
INFO     2020-09-14 18:40:12.495 - Retrieved MLFlow Run
INFO     2020-09-14 18:40:12.509 - Updating MLFlow Run for the UI
INFO     2020-09-14 18:40:12.543 - Reading Model Artifact Stream from Splice Machine
INFO     2020-09-14 18:40:12.555 - Extracting Model from DB with Name: keras_model
INFO     2020-09-14 18:40:12.578 - Decoding Model Artifact B

In [9]:
%%time
%%sql

insert into keras_regression (c0,c1,c2,c3,c4,c5,c6,c7,moment) values(2.4,2.2,1.5,2.6,3.6,7.5,77.8,2.4,34512);
insert into keras_regression (c0,c1,c2,c3,c4,c5,c6,c7,moment) values(66.2,3.78,33.5,-23.4,22.3,0.1,19.3,-339.2,345);

select * from keras_regression;
select * into ${data_and_preds} from keras_regression;

CPU times: user 13 ms, sys: 5.53 ms, total: 18.5 ms
Wall time: 4.66 s


In [10]:
from beakerx.object import beakerx

data = beakerx.get('data_and_preds')

features = data[['C0','C1','C2','C3','C4','C5','C6','C7']]
db_preds = data[['PREDICTION']]


raw_preds = model.predict(features)


for db_pred, raw_pred in zip(db_preds.iterrows(), raw_preds):
    # Check reconstruction values
    for db, raw in zip(db_pred[1], raw_pred):
        l = min(len(str(raw)),len(str(db))) # Digit truncation because keras python only returns 7 decimal places
        assert round(float(db),l) == round(float(raw),l), f'Something is wrong. Model Table gives {float(db)} but raw model gives {float(raw)}'

    
print('test passed!')

test passed!


## Use pred_threshold to make it a binary classification

In [17]:
splice.dropTableIfExists(f'{schema}.keras_binary')
jid = mlflow.deploy_db(schema, 'keras_binary', run_id, primary_key=[('MOMENT', 'INT')], df=df, create_model_table=True, classes=['Out1'], library_specific={'pred_threshold':0.5})
mlflow.watch_job(jid)

Deploying model to database...


Passing in primary keys as a list of tuples is deprecated. Use dictionary {column name: type}


Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list
---Job Logs---
INFO     2020-09-14 18:42:23.059 - A service worker has found your request
INFO     2020-09-14 18:42:23.080 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2020-09-14 18:42:23.110 - Handler is available
INFO     2020-09-14 18:42:23.126 - Retrieving Run from MLFlow Tracking Server...INFO     2020-09-14 18:42:23.126 - Retrieving Run from MLFlow Tracking Server...
INFO     2020-09-14 18:42:23.216 - Retrieved MLFlow Run
INFO     2020-09-14 18:42:23.229 - Updating MLFlow Run for the UI
INFO     2020-09-14 18:42:23.262 - Reading Model Artifact Stream from Splice Machine
INFO     2020-09-14 18:42:23.274 - Extracting Model from DB with Name: keras_model
INFO     2020-09-14 18:42:23.295 - Decoding Model Artifact B

In [20]:
%%time
%%sql
truncate table keras_binary;
insert into keras_binary (c0,c1,c2,c3,c4,c5,c6,c7,moment) values(2.4,2.2,1.5,2.6,3.6,7.5,77.8,2.4,         34512);
insert into keras_binary (c0,c1,c2,c3,c4,c5,c6,c7,moment) values(66.2,3.78,33.5,-23.4,22.3,0.1,19.3,-339.2,345);

select * from keras_binary;
select * into ${data_and_preds} from keras_binary;

CPU times: user 14.3 ms, sys: 2.42 ms, total: 16.7 ms
Wall time: 1.26 s


In [21]:
from beakerx.object import beakerx

data = beakerx.get('data_and_preds')

features = data[['C0','C1','C2','C3','C4','C5','C6','C7']]
db_preds = data[['Out1']]
db_classpred = data[['prediction']]

#set in the deploy call
threshold = 0.5

raw_preds = model.predict(features)


for db_c, db_pred, raw_pred in zip(db_classpred.iterrows(),db_preds.iterrows(), raw_preds):
    # Check reconstruction values
    for c,db, raw in zip(db_c[1],db_pred[1], raw_pred):
        l = min(len(str(raw)),len(str(db))) # Digit truncation
        assert round(float(db),l) == round(float(raw),l), f'Something is wrong. Model Table gives {float(db)} but raw model gives {float(raw)}'
        
        raw_classpred = 1.0 if raw > threshold else 0.0
        assert raw_classpred==c, f'Something is wrong. Model Table gives {float(c)} but raw model gives {float(raw_classpred)}'

    
print('test passed!')

test passed!


## Multiclass prediction

In [23]:
# first neural network with keras tutorial
from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pandas as pd
import numpy as np
from random import random

with mlflow.start_run():
    # load the dataset
    dataset = loadtxt('data.csv', delimiter=',')
    # split into input (X) and output (y) variables
    X = dataset[:,0:8]
    y = dataset[:,8]
    # Make it a regression problem
    y = np.array([random() for _ in y])
    # define the keras model
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation='relu'))
    model.add(Dense(8, activation='relu'))
    model.add(Dense(3, activation='sigmoid'))
    # compile the keras model
    model.compile(loss='mean_absolute_error', optimizer='adam', metrics=['accuracy'])
    # fit the keras model on the dataset
    model.fit(X, y, epochs=50, batch_size=10)

#     model.save('simple_model_multiclass.h5')
    mlflow.log_model(model, 'keras_model')
    print(mlflow.current_run_id())
    splice.dropTableIfExists(f'{schema}.keras_multiclass')
    
    run_id = mlflow.current_run_id()
    
    df = pd.DataFrame(X,columns=[f'C{i}' for i in range(len(X[0]))])
    jid = mlflow.deploy_db(schema, 'keras_multiclass', mlflow.current_run_id(), primary_key=[('MOMENT', 'INT')], classes=['class1','class2','class3'],df=df, create_model_table=True)
    mlflow.watch_job(jid)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Saving artifact of size: 8.058 KB to Splice Machine DB
bb3c158690b1
Deploying model to database...


Passing in primary keys as a list of tuples is deprecated. Use dictionary {column name: type}


Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list
---Job Logs---
INFO     2020-09-14 18:43:42.062 - A service worker has found your request
INFO     2020-09-14 18:43:42.086 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2020-09-14 18:43:42.117 - Handler is available
INFO     2020-09-14 18:43:42.132 - Retrieving Run from MLFlow Tracking Server...INFO     2020-09-14 18:43:42.132 - Retrieving Run from MLFlow Tracking Server...
INFO     2020-09-14 18:43:42.209 - Retrieved MLFlow Run
INFO     2020-09-14 18:43:42.221 - Updating MLFlow Run for the UI
INFO     2020-09-14 18:43:42.251 - Reading Model Artifact Stream from Splice Machine
INFO     2020-09-14 18:43:42.262 - Extracting Model from DB with Name: keras_model
INFO     2020-09-14 18:43:42.284 - Decoding Model Artifact B

In [24]:
%%time
%%sql
truncate table keras_multiclass;
insert into keras_multiclass (c0,c1,c2,c3,c4,c5,c6,c7,moment) values(2.4,2.2,1.5,2.6,3.6,7.5,77.8,2.4,         34512);
insert into keras_multiclass (c0,c1,c2,c3,c4,c5,c6,c7,moment) values(66.2,3.78,33.5,-23.4,22.3,0.1,19.3,-339.2,345);

select * from keras_multiclass;
select * into ${data_and_preds} from keras_multiclass;

CPU times: user 10 ms, sys: 6.2 ms, total: 16.2 ms
Wall time: 2.43 s


In [39]:
from beakerx.object import beakerx

data = beakerx.get('data_and_preds')

features = data[['C0','C1','C2','C3','C4','C5','C6','C7']]
db_preds = data[['prediction', 'class1', 'class2', 'class3']]

#set in the deploy call
threshold = 0.5

raw_preds = list(model.predict(features))

for db_pred, raw_pred in zip(db_preds.iterrows(), raw_preds):

    # Add prediction
    raw_pred = [list(raw_pred).index(max(raw_pred))] + list(raw_pred)
    for db, raw in zip(db_pred[1], raw_pred):
        l = min(len(str(raw)),len(str(db)),14) # Digit truncation
        try:
            assert round(float(db),l) == round(float(raw),l), f'Something is wrong. Model Table gives {round(float(db),l)} but raw model gives {round(float(raw),l)}'
        except:
            print('Likely rounding issue. Testing max precision')
            for l in range(l,5,-1):
                if round(float(db),l) == round(float(raw),l):
                    print(f'Matched at precision {l}')
                    break
                if l == 6:
                    raise Exception(f'Fields did not match. Model Table gives {round(float(db),l)} but raw model gives {round(float(raw),l)}')
    
print('test passed!')

  and should_run_async(code)


Likely rounding issue. Testing max precision
Matched at precision 8
Likely rounding issue. Testing max precision
Matched at precision 7
Likely rounding issue. Testing max precision
Matched at precision 6
Likely rounding issue. Testing max precision
Matched at precision 6
Likely rounding issue. Testing max precision
Matched at precision 6
Likely rounding issue. Testing max precision
Matched at precision 6
test passed!


In [40]:
spark.stop()

  and should_run_async(code)
