## For now, need to be running sklearn==0.21.3

In [1]:
# Display the installed scikit-learn version; the heading of this notebook
# states that 0.21.3 is required for now.
import sklearn
sklearn.__version__

'0.21.3'

In [14]:
from pyspark.sql import SparkSession
import math
# Obtain the Spark session that every later cell uses (via `spark`/`splice`).
spark = SparkSession.builder.getOrCreate()

In [3]:
from splicemachine.spark import PySpliceContext
from splicemachine.mlflow_support.utilities import get_user
# NOTE(review): `mlflow` is never imported by name in the visible cells, so it
# presumably comes from this star import — confirm before replacing it.
from splicemachine.mlflow_support import *

# Wrap the Spark session; `splice` runs all SQL in the cells below.
splice = PySpliceContext(spark)
mlflow.set_experiment('sklearn model deployment')
mlflow.register_splice_context(splice)
# Schema passed to every deploy_db / drop-table call in this notebook.
schema = get_user()

INFO: 'sklearn model deployment' does not exist. Creating a new experiment


# Simple Regression

In [15]:
# Simple Regression
# Fit a two-feature linear model, deploy it to a freshly created model table,
# insert rows, and verify the table's predictions against the local model.

import pandas as pd
from sklearn import linear_model

# Data
d = {'col1': [1, 2], 'col2': [3, 4], 'y': [0.2, 5.3]}
df = pd.DataFrame(data=d)
feature_cols = ['col1', 'col2']

# Model
reg = linear_model.LinearRegression()
reg.fit(df[feature_cols], df['y'])

# Deploy
with mlflow.start_run(run_name='regression') as run:
    mlflow.log_model(reg, 'regression_model')
    splice.execute(f'drop table if exists {schema}.sk_regression')
    print('Deploying regression')
    jid = mlflow.deploy_db(
        schema,
        'sk_regression',
        mlflow.current_run_id(),
        primary_key={'MOMENT_KEY': 'INT'},
        df=df[feature_cols],
        create_model_table=True,
    )
    mlflow.watch_job(jid)

print(f'Loading model {run.info.run_uuid}')
mlflow.load_model(run.info.run_uuid)

print('Running model')
insert_statements = [
    "insert into sk_regression (col1, col2, moment_key) values(1,5,4347861)",
    "insert into sk_regression (col1, col2, moment_key) values(2,7,4908084)",
]
for statement in insert_statements:
    splice.execute(statement)

print('Getting results')
data = splice.df('select col1, col2, prediction from sk_regression').toPandas()

print('Comparing DB results to model results')
# Each row unpacks to (col1, col2, prediction) in the order selected above.
for _, (c1, c2, p) in data.iterrows():
    features = [[float(c1), float(c2)]]
    raw_p = reg.predict(features)[0]
    assert math.isclose(raw_p,p), f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
print('test passed!')

Uploading file... Done.
Deploying regression
Deploying model to database...
Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list

---Job Logs---
INFO     2021-06-04 21:33:51.640 - A service worker has found your request
INFO     2021-06-04 21:33:51.717 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2021-06-04 21:33:51.797 - Handler is available
INFO     2021-06-04 21:33:51.810 - Retrieving Run from MLFlow Tracking Server...
INFO     2021-06-04 21:33:51.889 - Retrieved MLFlow Run
INFO     2021-06-04 21:33:51.907 - Updating MLFlow Run for the UI
INFO     2021-06-04 21:33:51.997 - Reading Model Artifact Stream from Splice Machine
INFO     2021-06-04 21:33:52.015 - Extracting Model from DB with Name: regression_model
INFO     2021-06-04 21:33:52.041 - Decoding Model Artifact

# Bayesian Model

In [47]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import BayesianRidge, LinearRegression
import pandas as pd

# Bayesian ridge deployment check: the model is deployed with
# predict(return_std=True), so the model table exposes both a PREDICTION and a
# STD column, and both are verified against the local model.

# Data
d = {'col1': [1, 2], 'col2': [3, 4], 'y': [0.2, 5.3]}
df = pd.DataFrame(data=d)

# Model
model = BayesianRidge(compute_score=True,normalize = True)
model.fit(df[['col1', 'col2']], df['y'])

# Deploy
with mlflow.start_run(run_name='bayesian with std') as run:
    mlflow.log_model(model, 'bayesian_model')
    splice.execute(f'drop table if exists {schema}.sk_bayesian')
    print('Deploying sk_bayesian')
    jid = mlflow.deploy_db(
        schema,
        'sk_bayesian',
        mlflow.current_run_id(),
        primary_key={'MOMENT_KEY': 'INT'},
        df=df[['col1', 'col2']],
        create_model_table=True,
        library_specific={'predict_call':'predict', 'predict_args':'return_std'},
    )
    mlflow.watch_job(jid)

print(f'Loading model {run.info.run_uuid}')
mlflow.load_model(run.info.run_uuid)
print('Running model')
splice.execute("insert into sk_bayesian (col1, col2, moment_key) values(-4,-5,1)")
splice.execute("insert into sk_bayesian (col1, col2, moment_key) values(100,22,2)")
splice.execute("insert into sk_bayesian (col1, col2, moment_key) values(7,8,3)")
print('Getting results')
data = splice.df('select col1, col2, prediction, std from sk_bayesian').toPandas()
print('Comparing DB results to model results')
for index, row in data.iterrows():
    c1, c2, p, std = row
    # predict(..., return_std=True) returns two 1-element arrays for a single
    # input row; index them so scalars are compared and reported.
    raw_p, raw_std = model.predict([[float(c1),float(c2)]], return_std=True)
    # DB values are wrapped in float() because they may be returned as strings.
    assert math.isclose(raw_p[0],float(p)), f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
    assert math.isclose(raw_std[0],float(std)), f'Something is wrong. Model Table gives {std} but raw model gives {raw_std}'
print('test passed!')

Uploading file... Done.
Deploying sk_bayesian
Deploying model to database...
Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list

---Job Logs---
INFO     2021-06-04 22:08:38.693 - A service worker has found your request
INFO     2021-06-04 22:08:38.773 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2021-06-04 22:08:38.811 - Handler is available
INFO     2021-06-04 22:08:38.825 - Retrieving Run from MLFlow Tracking Server...
INFO     2021-06-04 22:08:38.904 - Retrieved MLFlow Run
INFO     2021-06-04 22:08:38.923 - Updating MLFlow Run for the UI
INFO     2021-06-04 22:08:39.014 - Reading Model Artifact Stream from Splice Machine
INFO     2021-06-04 22:08:39.033 - Extracting Model from DB with Name: bayesian_model
INFO     2021-06-04 22:08:39.058 - Decoding Model Artifact 

## Deploy Bayesian model into existing table
###  First 2 rows should have <b>NO</b> values for prediction and std (we inserted those rows before model deployment)

In [48]:
# Deploy the Bayesian model (the `run` and `model` from the previous cell) onto
# a table that already exists and already holds rows. Rows inserted before the
# deployment must have no prediction/std; rows inserted afterwards must match
# the local model.

# Primary keys of the rows inserted BEFORE the model is deployed.
PRE_DEPLOY_KEYS = {1, 7}


def _is_missing(value):
    """Return True when a value read from the DB is NULL, empty, or NaN."""
    if value is None or value == '':
        return True
    try:
        return math.isnan(float(value))
    except (TypeError, ValueError):
        # Not numeric at all, so it is a real (unexpected) value.
        return False


setup_sql = [
    'drop table if exists bayesian_table',
    'create table bayesian_table(col1 int, col2 int, moment_key int primary key)',
    'insert into bayesian_table values(4,2,1)',
    'insert into bayesian_table values(9,8,7)',
]
for sql in setup_sql:
    splice.execute(sql)

print('Deploying bayesian to existing table')
jid = mlflow.deploy_db(
    schema,
    'bayesian_table',
    run.info.run_uuid,
    model_cols=['COL1','COL2'],
    library_specific={'predict_call':'predict', 'predict_args':'return_std'},
    create_model_table=False,
)
mlflow.watch_job(jid)

print('Running model')
splice.execute('insert into bayesian_table (col1, col2, moment_key) values(4,5,20)')
splice.execute('insert into bayesian_table (col1, col2, moment_key) values(-4,-5,21)')
splice.execute('insert into bayesian_table (col1, col2, moment_key) values(7,8,22)')

print('Getting results')
# moment_key is selected so pre-deployment rows are identified by key; the
# query has no ORDER BY, so the position of a row in the result is not reliable.
data = splice.df('select col1, col2, prediction, std, moment_key from bayesian_table').toPandas()

print('Comparing DB results to model results')
for index, row in data.iterrows():
    c1, c2, p, std, key = row
    if int(key) in PRE_DEPLOY_KEYS:
        assert _is_missing(p), f"Something is wrong. prediction should be NaN for the first 2 rows but has value {p}"
        assert _is_missing(std), f"Something is wrong. std should be NaN for the first 2 rows but has value {std}"
    else:
        raw_p, raw_std = model.predict([[float(c1),float(c2)]], return_std=True)
        assert math.isclose(raw_p[0], float(p)), f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
        assert math.isclose(raw_std[0], float(std)), f'Something is wrong. Model Table gives {std} but raw model gives {raw_std}'
print('test passed!')

Deploying bayesian to existing table
Deploying model to database...
Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list

---Job Logs---
INFO     2021-06-04 22:08:45.311 - A service worker has found your request
INFO     2021-06-04 22:08:45.388 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2021-06-04 22:08:45.468 - Handler is available
INFO     2021-06-04 22:08:45.482 - Retrieving Run from MLFlow Tracking Server...
INFO     2021-06-04 22:08:45.558 - Retrieved MLFlow Run
INFO     2021-06-04 22:08:45.578 - Updating MLFlow Run for the UI
INFO     2021-06-04 22:08:45.745 - Reading Model Artifact Stream from Splice Machine
INFO     2021-06-04 22:08:45.764 - Extracting Model from DB with Name: bayesian_model
INFO     2021-06-04 22:08:45.788 - Decoding Model Artifact Binary St

In [46]:
# Leftover debugging cell: shows the last `p` assigned by a comparison loop.
# NOTE(review): the recorded output is a string, which is presumably why the
# comparison loops wrap DB values in float() — confirm, then delete this cell.
p

'0.49311640498059234'

# GMM Model (Gaussian Process Regressor)
## GMM with Std

In [39]:
import pandas as pd
# Import from the public package path; the private `gpr` submodule is not part
# of the supported API.
from sklearn.gaussian_process import GaussianProcessRegressor

# Gaussian process regressor deployed with predict(return_std=True): the model
# table exposes PREDICTION and STD, and both are checked against the local model.

# Data
d = {'col1': [1, 2], 'col2': [3, 4], 'y': [0.2, 5.3]}
df = pd.DataFrame(data=d)

# Model
g = GaussianProcessRegressor()
g.fit(df[['col1','col2']],df['y'])

# Deploy
with mlflow.start_run(run_name='gmm with std') as run:
    mlflow.log_model(g, 'gmm_model')
    splice.execute(f'drop table if exists {schema}.sk_gmm')
    jid = mlflow.deploy_db(
        schema,
        'sk_gmm',
        mlflow.current_run_id(),
        df=df[['col1', 'col2']],
        primary_key={'MOMENT_KEY': 'INT'},
        create_model_table=True,
        library_specific={'predict_call':'predict', 'predict_args':'return_std'},
    )
    mlflow.watch_job(jid)

print(f'Loading model {run.info.run_uuid}')
mlflow.load_model(run.info.run_uuid)

print('Running model')
splice.execute('insert into sk_gmm (col1,col2,moment_key) values(4,5,1)')
splice.execute('insert into sk_gmm (col1,col2,moment_key) values(-4,-5,2)')
splice.execute('insert into sk_gmm (col1,col2,moment_key) values(7,8,3)')

print('Getting results')
data = splice.df('select col1, col2, prediction, std from sk_gmm').toPandas()

print('Comparing DB results to model results')
for index, row in data.iterrows():
    c1, c2, p, std = row
    # Both return values are 1-element arrays for a single input row.
    raw_p, raw_std = g.predict([[float(c1),float(c2)]], return_std=True)
    assert math.isclose(raw_p[0], float(p)), f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
    assert math.isclose(raw_std[0], float(std)), f'Something is wrong. Model Table gives {std} but raw model gives {raw_std}'
print('test passed!')

Uploading file... Done.
Deploying model to database...
Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list

---Job Logs---
INFO     2021-06-04 22:04:40.061 - A service worker has found your request
INFO     2021-06-04 22:04:40.139 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2021-06-04 22:04:40.219 - Handler is available
INFO     2021-06-04 22:04:40.235 - Retrieving Run from MLFlow Tracking Server...
INFO     2021-06-04 22:04:40.321 - Retrieved MLFlow Run
INFO     2021-06-04 22:04:40.342 - Updating MLFlow Run for the UI
INFO     2021-06-04 22:04:40.434 - Reading Model Artifact Stream from Splice Machine
INFO     2021-06-04 22:04:40.452 - Extracting Model from DB with Name: gmm_model
INFO     2021-06-04 22:04:40.482 - Decoding Model Artifact Binary Stream for Deploymen

## GMM with Covariance

In [43]:
import pandas as pd
# Import from the public package path; the private `gpr` submodule is not part
# of the supported API.
from sklearn.gaussian_process import GaussianProcessRegressor

# Gaussian process regressor deployed with predict(return_cov=True): the model
# table exposes PREDICTION and COV, and both are checked against the local model.

# Data
d = {'col1': [1, 2], 'col2': [3, 4], 'y': [0.2, 5.3]}
df = pd.DataFrame(data=d)

# Model
g = GaussianProcessRegressor()
g.fit(df[['col1','col2']],df['y'])

# Deploy
with mlflow.start_run(run_name='gmm with cov') as run:
    mlflow.log_model(g, 'gmm')
    splice.execute(f'drop table if exists {schema}.sk_gmm_cov')
    jid = mlflow.deploy_db(
        schema,
        'sk_gmm_cov',
        mlflow.current_run_id(),
        df=df[['col1', 'col2']],
        primary_key={'MOMENT_KEY': 'INT'},
        create_model_table=True,
        library_specific={'predict_call':'predict', 'predict_args':'return_cov'},
    )
    mlflow.watch_job(jid)

print(f'Loading model {run.info.run_uuid}')
mlflow.load_model(run.info.run_uuid)

print('Running model')
splice.execute('insert into sk_gmm_cov (col1,col2,moment_key) values(4,5,1)')
splice.execute('insert into sk_gmm_cov (col1,col2,moment_key) values(-4,-5,2)')
splice.execute('insert into sk_gmm_cov (col1,col2,moment_key) values(7,8,3)')

print('Getting results')
data = splice.df('select col1, col2, prediction, cov from sk_gmm_cov').toPandas()

print('Comparing DB results to model results')
for index, row in data.iterrows():
    c1, c2, p, cov = row
    # With return_cov=True the second value is an (n_rows, n_rows) covariance
    # matrix; for one input row the single entry is at [0][0].
    raw_p, raw_cov = g.predict([[float(c1),float(c2)]], return_cov=True)
    raw_cov_value = float(raw_cov[0][0])

    assert math.isclose(float(raw_p[0]), float(p)), f'Something is wrong. Model Table gives {p} but raw model gives {raw_p[0]}'
    assert math.isclose(raw_cov_value, float(cov)), f'Something is wrong. Model Table gives {cov} but raw model gives {raw_cov_value}'
print('test passed!')

Uploading file... Done.
Deploying model to database...
Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list

---Job Logs---
INFO     2021-06-04 22:07:07.879 - A service worker has found your request
INFO     2021-06-04 22:07:07.966 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2021-06-04 22:07:08.046 - Handler is available
INFO     2021-06-04 22:07:08.060 - Retrieving Run from MLFlow Tracking Server...
INFO     2021-06-04 22:07:08.139 - Retrieved MLFlow Run
INFO     2021-06-04 22:07:08.159 - Updating MLFlow Run for the UI
INFO     2021-06-04 22:07:08.250 - Reading Model Artifact Stream from Splice Machine
INFO     2021-06-04 22:07:08.267 - Extracting Model from DB with Name: gmm
INFO     2021-06-04 22:07:08.295 - Decoding Model Artifact Binary Stream for Deployment
INFO

# PCA

In [49]:
import pandas as pd
from sklearn.decomposition import PCA, KernelPCA
from random import random, seed

# KernelPCA deployment check: the model is deployed with `transform` as the
# predict call, so the model table exposes one column per component (C0..C99).
# Every component stored in the DB is compared with the local model's output.

N_COMPONENTS = 100
# DB and local components must agree to at least this many decimal places.
MIN_MATCHING_DECIMALS = 9
component_cols = [f'C{i}' for i in range(N_COMPONENTS)]

# Seed the generator so a restart-and-run-all trains on the same data.
seed(42)
d = [[random()*i for _ in range(3)] for i in range(500)]
df = pd.DataFrame(data=d, columns=['col1', 'col2', 'col3'])

kpca = KernelPCA(kernel="rbf", gamma=10, n_components=N_COMPONENTS)
kpca.fit(df[['col1','col2', 'col3']])

# Deploy
with mlflow.start_run(run_name='pca') as run:
    mlflow.log_model(kpca, 'kpca')
    splice.execute(f'drop table if exists {schema}.pca')
    jid = mlflow.deploy_db(
        schema,
        'pca',
        mlflow.current_run_id(),
        create_model_table=True,
        df=df[['col1', 'col2', 'col3']],
        primary_key={'MOMENT_KEY': 'INT'},
        library_specific={'predict_call':'transform'},
    )
    mlflow.watch_job(jid)

print(f'Loading model {run.info.run_uuid}')
mlflow.load_model(run.info.run_uuid)

print('Running model')
splice.execute('insert into pca (col1,col2,col3,moment_key) values(1,-2,3,4)')
splice.execute('insert into pca (col1,col2,col3,moment_key) values(-2,-3,33,5)')
splice.execute('insert into pca (col1,col2,col3,moment_key) values(3,4,-23,6)')
splice.execute('insert into pca (col1,col2,col3,moment_key) values(66,234,-2,1)')

print('Getting results')
select_sql = 'select col1, col2, col3, ' + ','.join(component_cols) + ' from pca'
data = splice.df(select_sql).toPandas()

print('Comparing DB results to model results')
tolerance = 10 ** -MIN_MATCHING_DECIMALS
for index, row in data.iterrows():
    pcs = row[component_cols].values
    c1, c2, c3 = row['COL1'],row['COL2'],row['COL3']
    raw_pcs = kpca.transform([[float(c1),float(c2), float(c3)]])[0]
    # Compare component by component with an absolute tolerance so harmless
    # floating-point round-off passes but a genuinely different value fails,
    # and the failing component is named in the error.
    for comp, (i, j) in enumerate(zip(raw_pcs, pcs)):
        if not math.isclose(float(i), float(j), rel_tol=0.0, abs_tol=tolerance):
            raise Exception(f'Values are incorrect. Database returned {j} but model returned {i} for component C{comp}')
print(f'All values match to at least {MIN_MATCHING_DECIMALS} decimal places')

print('test passed!')

Uploading file... Done.
Deploying model to database...


Deprecated Parameter 'verbose'. Use mlflow.watch_job(<job id>) or mlflow.fetch_logs(<job id>) to get verbose output. Ignoring...
Passing in primary keys as a list of tuples is deprecated. Use dictionary {column name: type}


Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list

---Job Logs---
INFO     2021-06-04 22:17:02.917 - A service worker has found your request
INFO     2021-06-04 22:17:03.009 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2021-06-04 22:17:03.062 - Handler is available
INFO     2021-06-04 22:17:03.077 - Retrieving Run from MLFlow Tracking Server...
INFO     2021-06-04 22:17:03.167 - Retrieved MLFlow Run
INFO     2021-06-04 22:17:03.188 - Updating MLFlow Run for the UI
INFO     2021-06-04 22:17:03.286 - Reading Model Artifact Stream from Splice Machine
INFO     2021-06-04 22:17:03.306 - Extracting Model from DB with Name: kpca
INFO     2021-06-04 22:17:03.340 - Decoding Model Artifact Binary Stream for Deployment
INFO     2021-06-04 22:17:03.370 - Decompressing Model Art

# Pipeline test

In [50]:
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# Pipeline deployment check: a feature-selection + SVC pipeline is deployed to a
# model table and its predictions are compared with the local pipeline.

# generate some data to play with
X, y = make_classification(
    n_informative=5, n_redundant=0, random_state=42)
# ANOVA SVM-C
anova_filter = SelectKBest(f_regression, k=5)
clf = svm.SVC(kernel='linear', probability=True)
anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])
# set_params overrides the k=5 given above, so the fitted filter keeps 10 features.
anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y)

df = pd.DataFrame(X, columns=[f'C{i}' for i in range(20)])
df['label'] = pd.DataFrame(y)

# Deploy
with mlflow.start_run(run_name='pipeline') as run:
    mlflow.log_model(anova_svm, 'pipeline')
    splice.execute(f'drop table if exists {schema}.skpipe')
    jid = mlflow.deploy_db(
        schema,
        'skpipe',
        mlflow.current_run_id(),
        primary_key={'MOMENT_KEY': 'INT'},
        create_model_table=True,
        df=df.drop('label', axis=1),
    )
    mlflow.watch_job(jid)

print(f'Loading model {run.info.run_uuid}')
mlflow.load_model(run.info.run_uuid)

print('Running model')
splice.execute('insert into skpipe (c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,moment_key) values(1.1520228167464979,2.1605391309255357,2.1567734709026443,2.574915424636612,1.286824753247393,0.5889507491535976,1.5086880856146783,2.269695379745943,2.6089968335162177,0.14727247523071063,1.8485328196592175,2.097532276755521,0.27822480992530085,2.666733308369378,2.143893385597627,1.3465081988790233,0.11041583917168307,0.9246752875764862,2.500911791131701,1.1195218819285857,5)')
splice.execute('insert into skpipe (c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,moment_key) values(3.611,6.133,9.065,9.347,4.649,11.534,1.436,3.625,0.617,5.556,1.191,11.871,6.27,13.615,4.942,16.17,1.858,13.121,9.192,16.729,9)')

print('Getting results')
data = splice.df('select c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19, prediction from skpipe').toPandas()

print('Comparing DB results to model results')
for index, row in data.iterrows():
    # The query selects the 20 feature columns followed by the prediction.
    *feature_values, p = row
    # predict returns a 1-element array for a single row; take the scalar so
    # the comparison does not rely on implicit array-to-float conversion.
    raw_p = anova_svm.predict([[float(value) for value in feature_values]])[0]
    assert math.isclose(raw_p,float(p)), f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
print('test passed!')

Uploading file... Done.
Deploying model to database...
Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list

---Job Logs---
INFO     2021-06-04 22:20:37.214 - A service worker has found your request
INFO     2021-06-04 22:20:37.269 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2021-06-04 22:20:37.314 - Handler is available
INFO     2021-06-04 22:20:37.329 - Retrieving Run from MLFlow Tracking Server...
INFO     2021-06-04 22:20:37.418 - Retrieved MLFlow Run
INFO     2021-06-04 22:20:37.437 - Updating MLFlow Run for the UI
INFO     2021-06-04 22:20:37.498 - Reading Model Artifact Stream from Splice Machine
INFO     2021-06-04 22:20:37.517 - Extracting Model from DB with Name: pipeline
INFO     2021-06-04 22:20:37.548 - Decoding Model Artifact Binary Stream for Deployment

# Predict Proba Example

In [53]:
from sklearn.ensemble import RandomForestClassifier

# Random forest deployed with `predict_proba` as the predict call: the model
# table exposes the predicted class label plus one probability column per class
# (C1 and C4 for the labels 1 and 4 used here).

d = {'col1': [1, 2], 'col2': [3, 4], 'y': [4, 1]}
df = pd.DataFrame(data=d)
feature_frame = df[['col1','col2']]

clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(feature_frame, df['y'])
# Smoke check that predict_proba works locally before deploying.
clf.predict_proba([[0,1]])

# Deploy
with mlflow.start_run(run_name='predict_proba') as run:
    mlflow.log_model(clf, 'rf')
    splice.execute(f'drop table if exists {schema}.predict_proba')
    jid = mlflow.deploy_db(
        schema,
        'predict_proba',
        mlflow.current_run_id(),
        create_model_table=True,
        df=df[['col1', 'col2']],
        primary_key={'MOMENT_KEY': 'INT'},
        library_specific={'predict_call':'predict_proba'},
    )
    mlflow.watch_job(jid)

print(f'Loading model {run.info.run_uuid}')
mlflow.load_model(run.info.run_uuid)

print('Running model')
insert_statements = (
    'insert into predict_proba (col1,col2,moment_key) values(3.611,6.133,9)',
    'insert into predict_proba (col1,col2,moment_key) values(1.1520228167464979,2.1605,5)',
)
for statement in insert_statements:
    splice.execute(statement)

print('Getting results')
data = splice.df('select col1,col2, prediction, C1, C4 from predict_proba').toPandas()

print('Comparing DB results to model results')
# Each row unpacks to (col1, col2, prediction, C1, C4) in the order selected.
for _, (col1, col2, p, c1, c4) in data.iterrows():
    sample = [[float(col1),float(col2)]]
    raw_c1, raw_c4 = clf.predict_proba(sample)[0]
    raw_p = clf.predict(sample)[0]
    # The table stores the predicted class as the label prefixed with "C".
    assert f'C{raw_p}' == p, f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
    assert math.isclose(raw_c1,c1), f'Something is wrong. Model Table gives {c1} but raw model gives {raw_c1}'
    assert math.isclose(raw_c4,c4), f'Something is wrong. Model Table gives {c4} but raw model gives {raw_c4}'
print('test passed!')

Uploading file... Done.
Deploying model to database...
Your Job has been submitted. The returned value of this function is the job id, which you can use to monitor the your task in real-time. Run mlflow.watch_job(<job id>) tostream them to stdout, or mlflow.fetch_logs(<job id>) to read them one time to a list

---Job Logs---
INFO     2021-06-04 22:22:28.734 - A service worker has found your request
INFO     2021-06-04 22:22:28.813 - Checking whether handler DEPLOY_DATABASE is enabled
INFO     2021-06-04 22:22:28.853 - Handler is available
INFO     2021-06-04 22:22:28.866 - Retrieving Run from MLFlow Tracking Server...
INFO     2021-06-04 22:22:28.963 - Retrieved MLFlow Run
INFO     2021-06-04 22:22:28.981 - Updating MLFlow Run for the UI
INFO     2021-06-04 22:22:29.070 - Reading Model Artifact Stream from Splice Machine
INFO     2021-06-04 22:22:29.088 - Extracting Model from DB with Name: rf
INFO     2021-06-04 22:22:29.117 - Decoding Model Artifact Binary Stream for Deployment
INFO 

In [54]:
# Stop the Spark session created at the top of the notebook; cells that use
# `spark` or `splice` cannot run after this without re-running the setup cells.
spark.stop()