In [None]:
!wget https://splice-releases.s3.amazonaws.com/jdbc-driver/db-client-2.7.0.1815.jar

In [None]:
%%sql
%classpath add jar db-client-2.7.0.1815.jar
%defaultDatasource jdbc:splice://host.docker.internal:1527/splicedb;user=splice;password=admin

In [1]:
from pyspark.sql import SparkSession
from splicemachine.spark.context import PySpliceContext
import random
# Create our Spark Session
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
# Create out Native Database Connection
splice = PySpliceContext(spark,JDBC_URL='jdbc:splice://host.docker.internal:1527/splicedb;user=splice;password=admin',_unit_testing=True)

Class Initialized


In [2]:
from splicemachine.ml.management import MLManager
manager = MLManager(splice, _testing=True)

Tracking Model Metadata on MLFlow Server @ http://mlflow:5001


In [3]:
manager.create_experiment('fraud_demo')

Experiment fraud_demo already exists... setting to active experiment
Active experiment has id 1


In [50]:
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.ml import Pipeline,PipelineModel
from pyspark.ml.classification import RandomForestClassifier, MultilayerPerceptronClassifier, DecisionTreeClassifier
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.clustering import KMeans

models = [RandomForestClassifier,MultilayerPerceptronClassifier,DecisionTreeClassifier,DecisionTreeRegressor,KMeans]
feature_names = ['price','gender','size','name','height','location','age','weather_type','ror']
fitted_pipes = []
for i in range(5):
    #start our first MLFlow run
    tags = {
            'team': 'splice',
            'purpose': 'testing r&d',
            'attempt-date': '11/07/2019',
            'attempt-number': str(i)
           }
    manager.start_run(tags=tags)
    
    # In-place shuffle
    random.shuffle(feature_names)

    assembler = VectorAssembler(inputCols=feature_names[:5], outputCol='features')
    scaler = StandardScaler(inputCol="features", outputCol='scaledFeatures')
    model = models[i]()
    model = model.setFeaturesCol('scaledFeatures')
    
    stages = [assembler,scaler,model]
    mlpipe = Pipeline(stages=stages)
    fitted_pipes.append(mlpipe)
    manager.log_pipeline_stages(mlpipe)
    
    manager.log_feature_transformations(mlpipe)
    
    manager.log_metric('f1',random.random())
    manager.log_metric('acc',random.random())
    manager.log_metric('tpr',random.random())
    manager.log_metric('fnr',random.random())
    manager.log_metric('tnr',random.random())
    manager.log_metric('fpr',random.random())
    manager.log_metric('precision',random.random())
#     manager.log_artifact('MLManager Local testing.ipynb','MLManager Local testing')



In [63]:
from splicemachine.ml.management import _get_cols,_readable_pipeline_stage,_get_stages
from collections import defaultdict

def _find_first_input_by_output(dictionary, value):
        """
        Find the first input column for a given column
        :param dictionary: dictionary to search
        :param value: column
        :return: None if not found, otherwise first column
        """
        keys=[]
        for key in dictionary:
            if dictionary[key][1] == value:  # output column is always the last one
                keys.append(key)
        
        return keys if len(keys)>0 else None


def log_feature_transformations(unfit_pipeline):
        """
        Log the preprocessing transformation sequence
        for every feature in the UNFITTED Spark pipeline
        :param unfit_pipeline: UNFITTED spark pipeline!!
        """
        transformations = defaultdict(lambda: [[], None])  # transformations, outputColumn
        

        for stage in _get_stages(unfit_pipeline):
            input_cols, output_col = _get_cols(stage, get_input=True), _get_cols(stage,
                                                                               get_input=False)
            print('input cols:',input_cols,'output:',output_col)
            if input_cols and output_col:  # make sure it could parse transformer
                for column in input_cols:
                    print('col:',column)
                    first_column_found = _find_first_input_by_output(transformations, column)
                    print('first:',first_column_found)
                    if first_column_found:  # column is not original
                        for f in first_column_found:
                            transformations[f][1] = output_col
                            transformations[f][0].append(
                                _readable_pipeline_stage(stage))
                    else:
                        transformations[column][1] = output_col
                        transformations[column][0].append(_readable_pipeline_stage(stage))
                        print(transformations,'\n')

        for column in transformations:
            param_value = ' -> '.join([column] + transformations[column][0] +
                                      [transformations[column][1]])
            print('Column- ' + column, param_value)
            
for i in fitted_pipes:
    log_feature_transformations(i)
    print('-------------------------------------------')

input cols: ['weather_type', 'name', 'height', 'gender', 'location'] output: features
col: weather_type
first: None
defaultdict(<function log_feature_transformations.<locals>.<lambda> at 0x7fdbc204b6a8>, {'weather_type': [['VectorAssembler'], 'features']}) 

col: name
first: None
defaultdict(<function log_feature_transformations.<locals>.<lambda> at 0x7fdbc204b6a8>, {'weather_type': [['VectorAssembler'], 'features'], 'name': [['VectorAssembler'], 'features']}) 

col: height
first: None
defaultdict(<function log_feature_transformations.<locals>.<lambda> at 0x7fdbc204b6a8>, {'weather_type': [['VectorAssembler'], 'features'], 'name': [['VectorAssembler'], 'features'], 'height': [['VectorAssembler'], 'features']}) 

col: gender
first: None
defaultdict(<function log_feature_transformations.<locals>.<lambda> at 0x7fdbc204b6a8>, {'weather_type': [['VectorAssembler'], 'features'], 'name': [['VectorAssembler'], 'features'], 'height': [['VectorAssembler'], 'features'], 'gender': [['VectorAssembl

In [57]:
for i in fitted_pipes:
    print(i.getStages()[-1].getFeaturesCol())

scaledFeatures
scaledFeatures
scaledFeatures
scaledFeatures
scaledFeatures


In [64]:
manager.list_artifacts()

TypeError: list_artifacts() missing 1 required positional argument: 'run_id'