In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook-wide Spark session through the builder's getOrCreate().
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [None]:
from splicemachine.spark import PySpliceContext
# Build the Splice context from the Spark session. Later cells register it with
# mlflow and call its dropTableIfExists() before each deployment.
splice = PySpliceContext(spark)

In [None]:
# NOTE: the star import is what brings `mlflow` into this notebook's namespace;
# no other cell imports it.
from splicemachine.mlflow_support import *
from splicemachine.mlflow_support.utilities import get_user
# Hand the Splice context to mlflow before any run is started or model deployed.
mlflow.register_splice_context(splice)
# Value returned by get_user(); later cells use it as the schema prefix for the
# tables they drop and deploy (presumably the current user's schema - confirm).
schema = get_user()

## One output node - Regression

In [None]:
# Single-output Keras network: train it, log it to mlflow, deploy it as a model table
from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pandas as pd
import numpy as np
from random import random

with mlflow.start_run() as run:
    # Columns 0-7 of data.csv are the inputs; column 8 only fixes the row count,
    # because its values are replaced below
    dataset = loadtxt('data.csv', delimiter=',')
    X = dataset[:, :8]
    y = dataset[:, 8]
    # One random float in [0, 1) per row turns the target into a continuous value
    y = np.array([random() for _ in range(len(y))])

    # 8 inputs -> 12 -> 8 -> 1 sigmoid output
    model = Sequential([
        Dense(12, input_dim=8, activation='relu'),
        Dense(8, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    model.compile(
        loss='mean_squared_error',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.fit(X, y, epochs=50, batch_size=10)

    mlflow.log_model(model, 'keras_model', model_lib='keras')

    # The id of the active run is reused by the deployment below and by later cells
    run_id = mlflow.current_run_id()
    print(run_id)

    # Start from a clean slate so the deployment can create the table itself
    splice.dropTableIfExists(f'{schema}.keras_regression')

    # Feature frame with columns C0..C7, passed to the deployment as `df`
    feature_names = [f'C{i}' for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)
    jid = mlflow.deploy_db(
        schema,
        'keras_regression',
        run_id,
        primary_key=[('MOMENT', 'INT')],
        df=df,
        create_model_table=True,
    )
    mlflow.watch_job(jid)

In [None]:
# Load the model logged under the run started in the training cell; the returned
# object is this cell's displayed output.
# RunInfo.run_uuid is deprecated in MLflow, so the supported run_id is used.
mlflow.load_model(run.info.run_id)

In [None]:
%%time
%%sql
truncate table keras_regression;
insert into keras_regression (c0,c1,c2,c3,c4,c5,c6,c7,moment) values(2.4,2.2,1.5,2.6,3.6,7.5,77.8,2.4,34512);
insert into keras_regression (c0,c1,c2,c3,c4,c5,c6,c7,moment) values(66.2,3.78,33.5,-23.4,22.3,0.1,19.3,-339.2,345);

select * from keras_regression;
select * into ${data_and_preds} from keras_regression;

In [None]:
from beakerx.object import beakerx

# Fetch the rows that the %%sql cell selected into ${data_and_preds}
data = beakerx.get('data_and_preds')

feature_cols = [f'C{i}' for i in range(8)]
features = data[feature_cols]
db_preds = data[['PREDICTION']]

# Score the same feature rows with the in-memory Keras model
raw_preds = model.predict(features)

for (row_idx, db_row), raw_row in zip(db_preds.iterrows(), raw_preds):
    # Compare every model-table value with the matching raw model output
    for db_val, raw_val in zip(db_row, raw_row):
        # Digit truncation because keras python only returns 7 decimal places:
        # both sides are rounded using the length of the shorter printed value
        ndigits = min(len(str(raw_val)), len(str(db_val)))
        db_rounded = round(float(db_val), ndigits)
        raw_rounded = round(float(raw_val), ndigits)
        assert db_rounded == raw_rounded, f'Something is wrong. Model Table gives {float(db_val)} but raw model gives {float(raw_val)}'

print('test passed!')

## Use pred_threshold to make it a binary classification

In [None]:
# Redeploy the run stored in `run_id` as a thresholded (binary) model table.
# `run_id` and `df` are whatever the most recently executed training cell left behind.
splice.dropTableIfExists(f'{schema}.keras_binary')
jid = mlflow.deploy_db(
    schema,
    'keras_binary',
    run_id,
    primary_key=[('MOMENT', 'INT')],
    df=df,
    create_model_table=True,
    classes=['Out1'],
    library_specific={'pred_threshold': 0.5},
)
mlflow.watch_job(jid)

In [None]:
%%time
%%sql
truncate table keras_binary;
insert into keras_binary (c0, c1, c2, c3, c4, c5, c6, c7, moment) values (2.4, 2.2, 1.5, 2.6, 3.6, 7.5, 77.8, 2.4, 34512);
insert into keras_binary (c0, c1, c2, c3, c4, c5, c6, c7, moment) values (66.2, 3.78, 33.5, -23.4, 22.3, 0.1, 19.3, -339.2, 345);

select * from keras_binary;
select * into ${data_and_preds} from keras_binary;

In [None]:
from beakerx.object import beakerx

# Fetch the rows that the %%sql cell selected into ${data_and_preds}
data = beakerx.get('data_and_preds')

features = data[['C0', 'C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7']]
db_preds = data[['Out1']]
db_classpred = data[['prediction']]

# Same value as the pred_threshold passed in the deploy call
threshold = 0.5

raw_preds = model.predict(features)

paired_rows = zip(db_classpred.iterrows(), db_preds.iterrows(), raw_preds)
for (class_idx, class_row), (prob_idx, prob_row), raw_row in paired_rows:
    for db_class, db_val, raw_val in zip(class_row, prob_row, raw_row):
        # 1) The stored output must equal the raw model output after digit truncation
        ndigits = min(len(str(raw_val)), len(str(db_val)))
        db_rounded = round(float(db_val), ndigits)
        raw_rounded = round(float(raw_val), ndigits)
        assert db_rounded == raw_rounded, f'Something is wrong. Model Table gives {float(db_val)} but raw model gives {float(raw_val)}'

        # 2) The stored class must equal the raw output thresholded locally
        raw_classpred = float(raw_val > threshold)
        assert raw_classpred == db_class, f'Something is wrong. Model Table gives {float(db_class)} but raw model gives {float(raw_classpred)}'

print('test passed!')

## Multiclass prediction

In [None]:
# Three-output Keras network: train it, log it to mlflow, deploy it with three class columns
from numpy import loadtxt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pandas as pd
import numpy as np
from random import random

with mlflow.start_run() as run:
    # Columns 0-7 of data.csv are the inputs; column 8 only fixes the row count,
    # because its values are replaced below
    dataset = loadtxt('data.csv', delimiter=',')
    X = dataset[:, :8]
    y = dataset[:, 8]
    # Targets are one random float in [0, 1) per row
    y = np.array([random() for _ in range(len(y))])

    # 8 inputs -> 12 -> 8 -> 3 sigmoid outputs (one per deployed class column)
    model = Sequential([
        Dense(12, input_dim=8, activation='relu'),
        Dense(8, activation='relu'),
        Dense(3, activation='sigmoid'),
    ])
    model.compile(
        loss='mean_absolute_error',
        optimizer='adam',
        metrics=['accuracy'],
    )
    model.fit(X, y, epochs=50, batch_size=10)

    mlflow.keras.log_model(model, 'keras_model')

    # The id of the active run is reused by the deployment below
    run_id = mlflow.current_run_id()
    print(run_id)

    # Start from a clean slate so the deployment can create the table itself
    splice.dropTableIfExists(f'{schema}.keras_multiclass')

    # Feature frame with columns C0..C7, passed to the deployment as `df`
    feature_names = [f'C{i}' for i in range(X.shape[1])]
    df = pd.DataFrame(X, columns=feature_names)
    jid = mlflow.deploy_db(
        schema,
        'keras_multiclass',
        run_id,
        primary_key=[('MOMENT', 'INT')],
        classes=['class1', 'class2', 'class3'],
        df=df,
        create_model_table=True,
    )
    mlflow.watch_job(jid)

In [None]:
# Load the model logged under the run started in the multiclass training cell;
# the returned object is this cell's displayed output.
# RunInfo.run_uuid is deprecated in MLflow, so the supported run_id is used.
mlflow.load_model(run.info.run_id)

In [None]:
%%time
%%sql
truncate table keras_multiclass;
insert into keras_multiclass (c0, c1, c2, c3, c4, c5, c6, c7, moment) values (2.4, 2.2, 1.5, 2.6, 3.6, 7.5, 77.8, 2.4, 34512);
insert into keras_multiclass (c0, c1, c2, c3, c4, c5, c6, c7, moment) values (66.2, 3.78, 33.5, -23.4, 22.3, 0.1, 19.3, -339.2, 345);

select * from keras_multiclass;
select * into ${data_and_preds} from keras_multiclass;

In [None]:
from beakerx.object import beakerx

# Fetch the rows that the %%sql cell selected into ${data_and_preds}
data = beakerx.get('data_and_preds')

features = data[['C0','C1','C2','C3','C4','C5','C6','C7']]
db_preds = data[['prediction', 'class1', 'class2', 'class3']]

raw_preds = list(model.predict(features))

MIN_PRECISION = 6   # agreement at fewer decimal places than this is a failure
MAX_PRECISION = 14  # never compare beyond this many decimal places

for db_pred, raw_pred in zip(db_preds.iterrows(), raw_preds):

    # Prepend the index of the largest raw output so the list lines up with the
    # ['prediction', 'class1', 'class2', 'class3'] columns of the table row
    raw_pred = [list(raw_pred).index(max(raw_pred))] + list(raw_pred)
    for db, raw in zip(db_pred[1], raw_pred):
        # Conversion errors (e.g. a non-numeric value) propagate instead of
        # being swallowed by a bare `except:`
        db_value, raw_value = float(db), float(raw)

        # Digit truncation: start from the shorter printed length, capped
        ndigits = min(len(str(raw)), len(str(db)), MAX_PRECISION)
        if round(db_value, ndigits) == round(raw_value, ndigits):
            continue

        print('Likely rounding issue. Testing max precision')
        for precision in range(ndigits - 1, MIN_PRECISION - 1, -1):
            if round(db_value, precision) == round(raw_value, precision):
                print(f'Matched at precision {precision}')
                break
        else:
            # No precision >= MIN_PRECISION agrees. This branch also covers a
            # starting precision already below MIN_PRECISION (for example the
            # integer class index), where there is nothing lower to retry.
            raise Exception(f'Fields did not match. Model Table gives {round(db_value, MIN_PRECISION)} but raw model gives {round(raw_value, MIN_PRECISION)}')

print('test passed!')

In [None]:
# Final cell: stop the Spark session obtained in the first cell.
spark.stop()