In [None]:
# Display the scikit-learn version available to this kernel.
import sklearn
sklearn.__version__

In [None]:
from pyspark.sql import SparkSession
# Reuse the already-running Spark session if there is one; otherwise build a new one.
spark = SparkSession.builder.getOrCreate()

In [None]:
from splicemachine.spark import PySpliceContext
from splicemachine.mlflow_support.utilities import get_user
# NOTE(review): `mlflow` is never imported by name in the visible cells; presumably
# this star import is what brings it into scope — confirm.
from splicemachine.mlflow_support import *

# Wrap the Spark session in a Splice context and hand it to mlflow before any
# run is started; every later cell uses `splice`, `mlflow` and `schema`.
splice = PySpliceContext(spark)
mlflow.set_experiment('sklearn model deployment')
mlflow.register_splice_context(splice)
# `schema` is the schema prefix used for every table the deploy cells drop/create.
schema = get_user()

# Simple Regression

In [None]:
# NOTE(review): `get_mlflow_ui` is presumably provided by this star import — confirm.
from splicemachine.notebook import *
get_mlflow_ui()

In [None]:
import pandas as pd
from sklearn import linear_model

# Data: a two-row toy frame with two feature columns and a numeric target `y`.
d = {'col1': [1, 2], 'col2': [3, 4], 'y': [0.2, 5.3]}
df = pd.DataFrame(data=d)
features = df[['col1', 'col2']]

# Model: plain linear regression fitted on the two feature columns.
reg = linear_model.LinearRegression()
reg.fit(features, df['y'])

# Deploy: close every run that is still open (runs may be nested), then log the
# model under a fresh run and deploy it to a newly created model table.
while mlflow.active_run():
    mlflow.end_run()
mlflow.start_run(run_name='regression')
mlflow.log_model(reg, 'regression_model')
splice.execute(f'drop table if exists {schema}.sk_regression')
jid = mlflow.deploy_db(
    schema,
    'sk_regression',
    mlflow.current_run_id(),
    primary_key=[('MOMENT_KEY', 'INT')],
    df=features,
    create_model_table=True,
    verbose=True,
)
mlflow.watch_job(jid)

In [None]:
%%sql
insert into sk_regression (col1, col2, moment_key) values(1,5,4347861);
insert into sk_regression (col1, col2, moment_key) values(2,7,4908084);
select * from sk_regression;
select col1, col2, prediction into ${data_and_preds} from sk_regression;

In [None]:
# Load the model from the run started in the deploy cell above (it is still the
# active run) instead of a literal run id copied from an earlier session.
mlflow.load_model(mlflow.current_run_id())

In [None]:
from beakerx.object import beakerx

# Fetch the frame that the %%sql cell stored under `data_and_preds`.
data = beakerx.get('data_and_preds')
for index, (c1, c2, p) in data.iterrows():
    # Score the same inputs with the in-memory model; the table's value must equal it.
    raw_p = reg.predict([[float(c1), float(c2)]])[0]
    assert raw_p == p, f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
print('test passed!')

# Bayesian Model

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import BayesianRidge, LinearRegression
import pandas as pd

# Data: the same two-row toy frame used by the regression example.
d = {'col1': [1, 2], 'col2': [3, 4], 'y': [0.2, 5.3]}
df = pd.DataFrame(data=d)
features = df[['col1', 'col2']]

# Model: Bayesian ridge; predicting with return_std=True yields (prediction, std).
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned scikit-learn version still accepts it.
model = BayesianRidge(compute_score=True, normalize=True)
model.fit(features, df['y'])
x = model.predict([[1, 2]], return_std=True)

# Deploy: close any open (possibly nested) runs, then log and deploy under a new run.
# `predict_args` asks the deployed model to call predict with return_std.
while mlflow.active_run():
    mlflow.end_run()
mlflow.start_run(run_name='bayesian with std')
mlflow.log_model(model, 'bayesian_model')
splice.execute(f'drop table if exists {schema}.sk_bayesian')
predict_options = {'predict_call': 'predict', 'predict_args': 'return_std'}
jid = mlflow.deploy_db(
    schema,
    'sk_bayesian',
    mlflow.current_run_id(),
    primary_key=[('MOMENT_KEY', 'INT')],
    df=features,
    create_model_table=True,
    verbose=True,
    library_specific=predict_options,
)
mlflow.watch_job(jid)

In [None]:
# Load the model from the run started in the deploy cell above (it is still the
# active run) instead of a literal run id copied from an earlier session.
mlflow.load_model(mlflow.current_run_id())

In [None]:
%%time
%%sql

insert into sk_bayesian (col1, col2, moment_key) values(-4,-5,1);
insert into sk_bayesian (col1, col2, moment_key) values(100,22,2);
insert into sk_bayesian (col1, col2, moment_key) values(7,8,3);

select * from sk_bayesian;
select col1, col2, "prediction", "std" into ${data_and_preds} from sk_bayesian;


In [None]:
from beakerx.object import beakerx

# Fetch the frame that the %%sql cell stored under `data_and_preds`.
data = beakerx.get('data_and_preds')

for index, (c1, c2, p, std) in data.iterrows():
    # The in-memory model returns (prediction, std); both must equal the table's columns.
    sample = [[float(c1), float(c2)]]
    raw_p, raw_std = model.predict(sample, return_std=True)
    assert raw_p == p, f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
    assert raw_std == std, f'Something is wrong. Model Table gives {std} but raw model gives {raw_std}'
print('test passed!')

## Deploy Bayesian model into existing table

In [None]:
%%sql
drop table if exists bayesian_table;
create table bayesian_table(col1 int, col2 int, moment_key int primary key);
insert into bayesian_table values(4,2,1);
insert into bayesian_table values(9,8,7);
select * from bayesian_table;

In [None]:
# Look up the run logged by the Bayesian example and deploy its model onto the
# table created by the %%sql cell above (create_model_table=False: the table
# already exists; only COL1/COL2 are passed to the model).
run_id = mlflow.get_run_ids_by_name('bayesian with std')[0]
predict_options = {'predict_call': 'predict', 'predict_args': 'return_std'}
jid = mlflow.deploy_db(
    schema,
    'bayesian_table',
    run_id,
    verbose=True,
    model_cols=['COL1', 'COL2'],
    library_specific=predict_options,
    create_model_table=False,
)
mlflow.watch_job(jid)

In [None]:
# Load the model for the run that was just deployed onto bayesian_table
# (`run_id` is set in the deploy cell above) instead of a literal id copied
# from an earlier session.
mlflow.load_model(run_id)

## First 2 rows should have <b>NO</b> values for prediction and std (we inserted those rows before model deployment)

In [None]:
%%time
%%sql

insert into bayesian_table (col1, col2, moment_key) values(4,5,20);
insert into bayesian_table (col1, col2, moment_key) values(-4,-5,21);
insert into bayesian_table (col1, col2, moment_key) values(7,8,22);

select * from bayesian_table;
select col1, col2, "prediction", "std" into ${data_and_preds} from bayesian_table;

In [None]:
import math

# Feature pairs inserted into bayesian_table BEFORE the model was deployed.
# They are recognised by value rather than by position in the result, because
# the SELECT that fills `data_and_preds` has no ORDER BY and so row order is
# not guaranteed.
pre_deployment_rows = {(4, 2), (9, 8)}

data = beakerx.get('data_and_preds')
for index, row in data.iterrows():
    c1, c2, p, std = row
    if (int(float(c1)), int(float(c2))) in pre_deployment_rows:
        # Rows that existed before deployment were never scored.
        assert math.isnan(p), f"Something is wrong. prediction should be NaN for the first 2 rows but has value {p}"
        assert math.isnan(std), f"Something is wrong. std should be NaN for the first 2 rows but has value {std}"
    else:
        # Rows inserted after deployment must match the in-memory model exactly.
        raw_p, raw_std = model.predict([[float(c1),float(c2)]], return_std=True)
        assert raw_p == p, f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
        assert raw_std == std, f'Something is wrong. Model Table gives {std} but raw model gives {raw_std}'
print('test passed!')

# GMM Model
## GMM with Std

In [None]:
# Import from the public package: the private `sklearn.gaussian_process.gpr`
# module path was deprecated in scikit-learn 0.22 and later removed.
from sklearn.gaussian_process import GaussianProcessRegressor

# Data: the same two-row toy frame used by the earlier examples.
d = {'col1': [1, 2], 'col2': [3, 4], 'y': [0.2, 5.3]}
df = pd.DataFrame(data=d)

# Model: Gaussian process regressor with default settings (the run/table names
# below call it "gmm"; those strings are kept as-is).
g = GaussianProcessRegressor()
g.fit(df[['col1','col2']],df['y'])

# Deploy: close any open (possibly nested) runs, then log and deploy under a new run.
# `predict_args` asks the deployed model to call predict with return_std.
while mlflow.active_run():
    mlflow.end_run()
mlflow.start_run(run_name='gmm with std')
mlflow.log_model(g, 'gmm_model')
splice.execute(f'drop table if exists {schema}.sk_gmm')
jid = mlflow.deploy_db(schema, 'sk_gmm', mlflow.current_run_id(), df=df[['col1', 'col2']], primary_key=[('MOMENT_KEY', 'INT')], create_model_table=True, verbose=True, library_specific={'predict_call':'predict', 'predict_args':'return_std'})
mlflow.watch_job(jid)

In [None]:
# Load the model from the run started in the deploy cell above (it is still the
# active run) instead of a literal run id copied from an earlier session.
mlflow.load_model(mlflow.current_run_id())

In [None]:
%%time
%%sql

insert into sk_gmm (col1,col2,moment_key) values(4,5,1);
insert into sk_gmm (col1,col2,moment_key) values(-4,-5,2);
insert into sk_gmm (col1,col2,moment_key) values(7,8,3);

select * from sk_gmm;
select col1, col2, "prediction", "std" into ${data_and_preds} from sk_gmm;

In [None]:
import math

# Fetch the frame that the %%sql cell stored under `data_and_preds`.
data = beakerx.get('data_and_preds')
for index, (c1, c2, p, std) in data.iterrows():
    # The in-memory model returns (prediction, std); both must equal the table's columns.
    sample = [[float(c1), float(c2)]]
    raw_p, raw_std = g.predict(sample, return_std=True)
    assert raw_p == p, f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
    assert raw_std == std, f'Something is wrong. Model Table gives {std} but raw model gives {raw_std}'
print('test passed!')

## GMM with Covariance

In [None]:
# Import from the public package: the private `sklearn.gaussian_process.gpr`
# module path was deprecated in scikit-learn 0.22 and later removed.
from sklearn.gaussian_process import GaussianProcessRegressor

# Data: the same two-row toy frame used by the earlier examples.
d = {'col1': [1, 2], 'col2': [3, 4], 'y': [0.2, 5.3]}
df = pd.DataFrame(data=d)

# Model: Gaussian process regressor with default settings (the run/table names
# below call it "gmm"; those strings are kept as-is).
g = GaussianProcessRegressor()
g.fit(df[['col1','col2']],df['y'])

# Deploy: close any open (possibly nested) runs, then log and deploy under a new run.
# `predict_args` asks the deployed model to call predict with return_cov.
while mlflow.active_run():
    mlflow.end_run()
mlflow.start_run(run_name='gmm with cov')
mlflow.log_model(g, 'gmm')
splice.execute(f'drop table if exists {schema}.sk_gmm_cov')
jid = mlflow.deploy_db(schema, 'sk_gmm_cov', mlflow.current_run_id(), df=df[['col1', 'col2']], primary_key=[('MOMENT_KEY', 'INT')], create_model_table=True, verbose=True, library_specific={'predict_call':'predict', 'predict_args':'return_cov'})
mlflow.watch_job(jid)


In [None]:
# Load the model from the run started in the deploy cell above (it is still the
# active run) instead of a literal run id copied from an earlier session.
mlflow.load_model(mlflow.current_run_id())

In [None]:
%%time
%%sql

insert into sk_gmm_cov (col1,col2,moment_key) values(4,5,1);
insert into sk_gmm_cov (col1,col2,moment_key) values(-4,-5,2);
insert into sk_gmm_cov (col1,col2,moment_key) values(7,8,3);

select * from sk_gmm_cov;
select col1, col2, "prediction", "cov" into ${data_and_preds} from sk_gmm_cov;

In [None]:
# Fetch the frame that the %%sql cell stored under `data_and_preds`.
data = beakerx.get('data_and_preds')
for index, (c1, c2, p, std) in data.iterrows():
    # With return_cov=True the second value is the covariance; the names `std` /
    # `raw_std` are kept from the std variant of this check.
    sample = [[float(c1), float(c2)]]
    raw_p, raw_std = g.predict(sample, return_cov=True)
    assert raw_p == p, f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
    assert raw_std == std, f'Something is wrong. Model Table gives {std} but raw model gives {raw_std}'
print('test passed!')

# PCA

In [None]:
import pandas as pd
from sklearn.decomposition import PCA, KernelPCA
from random import random

kpca = KernelPCA(kernel="rbf", gamma=10, n_components=100)

# Data: 500 rows of three random values each, row i scaled by i.
d = [[random() * i for _ in range(3)] for i in range(500)]
pca_columns = ['col1', 'col2', 'col3']
df = pd.DataFrame(data=d, columns=pca_columns)

# Fit the kernel PCA; the get_params/transform results are not displayed
# because they are not the last expression of the cell.
kpca.get_params()
kpca.fit(df[pca_columns])
kpca.transform([[1, 2, 3]])

# Deploy: close any open (possibly nested) runs, then log and deploy under a new run.
# `predict_call` makes the deployed model call `transform` instead of `predict`.
while mlflow.active_run():
    mlflow.end_run()
mlflow.start_run(run_name='pca')
mlflow.log_model(kpca, 'kpca')
splice.execute(f'drop table if exists {schema}.pca')
jid = mlflow.deploy_db(
    schema,
    'pca',
    mlflow.current_run_id(),
    create_model_table=True,
    df=df[pca_columns],
    primary_key=[('MOMENT_KEY', 'INT')],
    verbose=True,
    library_specific={'predict_call': 'transform'},
)
mlflow.watch_job(jid)


In [None]:
# Load the model from the run started in the deploy cell above (it is still the
# active run) instead of a literal run id copied from an earlier session.
mlflow.load_model(mlflow.current_run_id())

In [None]:
%%sql
insert into pca (col1,col2,col3,moment_key) values(1,-2,3,4);
insert into pca (col1,col2,col3,moment_key) values(-2,-3,33,5);
insert into pca (col1,col2,col3,moment_key) values(3,4,-23,6);
insert into pca (col1,col2,col3,moment_key) values(66,234,-2,1);

select * from pca;
select col1, col2, col3, C0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C27,C28,C29,C30,C31,C32,C33,C34,C35,C36,C37,C38,C39,C40,C41,C42,C43,C44,C45,C46,C47,C48,C49,C50,C51,C52,C53,C54,C55,C56,C57,C58,C59,C60,C61,C62,C63,C64,C65,C66,C67,C68,C69,C70,C71,C72,C73,C74,C75,C76,C77,C78,C79,C80,C81,C82,C83,C84,C85,C86,C87,C88,C89,C90,C91,C92,C93,C94,C95,C96,C97,C98,C99 
into ${data_and_preds} from pca;


In [None]:
# Compare the 100 components stored in the model table with the components the
# in-memory KernelPCA produces for the same inputs.
data = beakerx.get('data_and_preds')
for index, row in data.iterrows():
    pcs = row[[f'C{i}' for i in range(100)]].values
    c1, c2, c3 = row['COL1'],row['COL2'],row['COL3']
    raw_pcs = kpca.transform([[float(c1),float(c2), float(c3)]])[0]
    if not (raw_pcs == pcs).all():
        # Exact equality failed: accept values that agree once rounded to
        # somewhere between 15 and 9 decimal places.
        print(f'Something is wrong. Checking rounding errors')
        worst_precision = 15
        for comp, (i, j) in enumerate(zip(raw_pcs, pcs)):
            # Highest precision (15 down to 9) at which this component agrees.
            # Previously an unconditional `break` meant only 15 places was ever
            # tried, and `comp` was never advanced, so errors always named C0.
            matched = next(
                (rnd for rnd in range(15, 8, -1) if round(i, rnd) == round(j, rnd)),
                None,
            )
            if matched is None:
                raise Exception(f'Values are incorrect. Database returned {j} but model returned {i} for component C{comp}')
            worst_precision = min(worst_precision, matched)

        print(f'All values match to at least {worst_precision} decimal places')

print('test passed!')


# Pipeline test

In [None]:
# Build, fit and deploy a two-step scikit-learn Pipeline (feature selection + SVC).
from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
# generate some data to play with (random_state fixes the generated sample)
X, y = make_classification(
    n_informative=5, n_redundant=0, random_state=42)
# ANOVA SVM-C
anova_filter = SelectKBest(f_regression, k=5)
clf = svm.SVC(kernel='linear', probability=True)
anova_svm = Pipeline([('anova', anova_filter), ('svc', clf)])
# set_params overrides the k=5 given above with k=10 and sets the SVC's C before fitting.
anova_svm.set_params(anova__k=10, svc__C=.1).fit(X, y)
# The expressions below are exploratory: none of them is the cell's last
# expression, so their values are not displayed.
prediction = anova_svm.predict(X)
anova_svm.score(X, y)
# getting the selected features chosen by anova_filter
anova_svm['anova'].get_support()
# Another way to get selected features chosen by anova_filter
anova_svm.named_steps.anova.get_support()
# Indexing can also be used to extract a sub-pipeline.
sub_pipeline = anova_svm[:1]
sub_pipeline
coef = anova_svm[-1].coef_
anova_svm['svc'] is anova_svm[-1]
coef.shape

# Name the feature columns C0..C19 (the 20 names must match the width of X)
# and attach the labels as an extra column.
df = pd.DataFrame(X, columns=[f'C{i}' for i in range(20)])
df['label'] = pd.DataFrame(y)

# Deploy
while mlflow.active_run(): # In case nested runs
    mlflow.end_run() 
mlflow.start_run(run_name='pipeline')
mlflow.log_model(anova_svm, 'pipeline')
splice.execute(f'drop table if exists {schema}.skpipe')
# Only the feature columns are handed to deploy_db; the label column is dropped.
jid = mlflow.deploy_db(schema, 'skpipe', mlflow.current_run_id(), primary_key=[('MOMENT_KEY', 'INT')], create_model_table=True, df=df.drop('label', axis=1), verbose=True)
mlflow.watch_job(jid)


In [None]:
# Load the model from the run started in the deploy cell above (it is still the
# active run) instead of a literal run id copied from an earlier session.
mlflow.load_model(mlflow.current_run_id())

In [None]:
%%sql
insert into skpipe (c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,moment_key) values(3.611,6.133,9.065,9.347,4.649,11.534,1.436,3.625,0.617,5.556,1.191,11.871,6.27,13.615,4.942,16.17,1.858,13.121,9.192,16.729,9);
insert into skpipe (c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19,moment_key) values(1.1520228167464979,2.1605391309255357,2.1567734709026443,2.574915424636612,1.286824753247393,0.5889507491535976,1.5086880856146783,2.269695379745943,2.6089968335162177,0.14727247523071063,1.8485328196592175,2.097532276755521,0.27822480992530085,2.666733308369378,2.143893385597627,1.3465081988790233,0.11041583917168307,0.9246752875764862,2.500911791131701,1.1195218819285857,5);
select * from skpipe;
select c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13,c14,c15,c16,c17,c18,c19, prediction into ${data_and_preds} from skpipe;


In [None]:
# Fetch the frame that the %%sql cell stored under `data_and_preds`:
# twenty feature columns followed by the table's prediction.
data = beakerx.get('data_and_preds')
for index, row in data.iterrows():
    *feature_values, p = row
    assert len(feature_values) == 20  # same arity the explicit 21-name unpacking enforced
    # Score the same twenty features with the in-memory pipeline.
    raw_p = anova_svm.predict([[float(value) for value in feature_values]])
    assert raw_p == p, f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
print('test passed!')

# Predict Proba Example

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Data: two rows, two feature columns, and class labels 4 and 1.
d = {'col1': [1, 2], 'col2': [3, 4], 'y': [4, 1]}
df = pd.DataFrame(data=d)
features = df[['col1', 'col2']]

# Model: a ten-tree random forest; `fit` returns the fitted estimator itself.
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(features, df['y'])
clf.predict_proba([[0, 1]])

# Deploy: close any open (possibly nested) runs, then log and deploy under a new run.
# `predict_call` makes the deployed model call `predict_proba`.
while mlflow.active_run():
    mlflow.end_run()
mlflow.start_run(run_name='predict_proba')
mlflow.log_model(clf, 'rf')
splice.execute(f'drop table if exists {schema}.predict_proba')
jid = mlflow.deploy_db(
    schema,
    'predict_proba',
    mlflow.current_run_id(),
    create_model_table=True,
    df=features,
    primary_key=[('MOMENT_KEY', 'INT')],
    verbose=True,
    library_specific={'predict_call': 'predict_proba'},
)
mlflow.watch_job(jid)

In [None]:
# Load the model from the run started in the deploy cell above (it is still the
# active run) instead of a literal run id copied from an earlier session.
mlflow.load_model(mlflow.current_run_id())

In [None]:
%%sql
insert into predict_proba (col1,col2,moment_key) values(3.611,6.133,9);
insert into predict_proba (col1,col2,moment_key) values(1.1520228167464979,2.1605,5);
select * from predict_proba;
select col1,col2, "prediction", C1, C4 into ${data_and_preds} from predict_proba;


In [None]:
# Fetch the frame that the %%sql cell stored under `data_and_preds`:
# two features, the predicted class, and the per-class probability columns C1 and C4.
data = beakerx.get('data_and_preds')
for index, (col1, col2, p, c1, c4) in data.iterrows():
    sample = [[float(col1), float(col2)]]
    # Probabilities first, then the class, from the in-memory forest.
    raw_c1, raw_c4 = clf.predict_proba(sample)[0]
    raw_p = clf.predict(sample)[0]
    assert raw_p == p, f'Something is wrong. Model Table gives {p} but raw model gives {raw_p}'
    assert raw_c1 == c1, f'Something is wrong. Model Table gives {c1} but raw model gives {raw_c1}'
    assert raw_c4 == c4, f'Something is wrong. Model Table gives {c4} but raw model gives {raw_c4}'
print('test passed!')

In [None]:
# Stop the Spark session once every deployment check has run.
spark.stop()