# `hops-util-py` Integration Tests

This notebook can be converted to a Python file and submitted as a Spark job to run the integration tests.

## Imports

In [1]:
from hops import experiment, hdfs, tensorboard, devices, kafka, featurestore, tls, util, serving, model, constants
from hops.experiment import Direction
from hops.model import Metric
import stat
import os
import shutil
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType, IntegerType, FloatType
import pandas as pd
import numpy as np
import datetime
import time
import json
from pyspark.sql import DataFrame
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField
from petastorm.codecs import ScalarCodec, CompressedImageCodec, NdarrayCodec
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.sql import SparkSession
import tensorflow as tf
import sys
import random
from confluent_kafka import Producer, Consumer, KafkaError

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,Current session?
18,application_1571991823790_0002,pyspark,idle,Link,Link,✔


SparkSession available as 'spark'.


## Experiment API Tests

In [2]:
def exp_asserts():
    """Check the environment a single-process experiment wrapper runs in.

    Verifies that a TensorBoard logdir is set, that no GPUs are reported, that
    ``hdfs.project_path()`` matches the path resolved from the project name, and
    that the logdir is a local path or an ``hdfs://`` path in agreement with
    ``tensorboard.local_logdir_bool``.

    Raises:
        AssertionError: if any check fails.
    """
    from hops import tensorboard
    from hops import devices
    from hops import hdfs
    import os

    logdir = tensorboard.logdir()
    assert logdir is not None, 'TensorBoard logdir is None'
    assert devices.get_num_gpus() == 0, 'expected no GPUs'
    assert hdfs.project_path() == hdfs.project_path(hdfs.project_name()), 'project path mismatch'

    if tensorboard.local_logdir_bool:
        # Local logdir: must be a plain filesystem path that exists locally.
        assert "hdfs://" not in logdir, 'local logdir points to HDFS'
        assert os.path.exists(logdir), 'local logdir does not exist'
    else:
        # Remote logdir: must be an hdfs:// path that exists in HDFS.
        assert "hdfs://" in logdir, 'logdir is not in HDFS'
        assert hdfs.exists(logdir), 'HDFS logdir does not exist'

In [3]:
def no_ret():
    """Wrapper without parameters that runs the environment checks and returns nothing."""
    exp_asserts()
    return None

In [4]:
def no_ret_params(a, b):
    """Wrapper taking hyperparameters ``a`` and ``b`` (unused) that returns nothing."""
    exp_asserts()
    return None

In [5]:
def single_ret_raw_value():
    """Wrapper that returns a bare number (not wrapped in a dict)."""
    exp_asserts()
    raw_metric = 10
    return raw_metric

In [6]:
def single_ret_raw_value_params(a, b):
    """Wrapper that returns the bare sum of hyperparameters ``a`` and ``b``."""
    exp_asserts()
    raw_metric = a + b
    return raw_metric

In [7]:
def single_ret_path():
    """Wrapper that writes ``testfile.txt`` and returns its name under the ``logfile`` key."""
    exp_asserts()
    # The context manager closes the file even if the write raises.
    with open('testfile.txt', 'w') as log_file:
        log_file.write('stuff happened')
    return {'logfile': 'testfile.txt'}

In [8]:
def single_ret_path_params(a, b):
    """Wrapper taking ``a`` and ``b`` (unused) that writes ``testfile.txt`` and returns its name.

    Returns:
        dict: ``{'logfile': 'testfile.txt'}``.
    """
    exp_asserts()
    # The context manager closes the file even if the write raises.
    with open('testfile.txt', 'w') as log_file:
        log_file.write('stuff happened')
    return {'logfile': 'testfile.txt'}

In [9]:
def single_ret_val():
    """Wrapper that returns a single metric in a dict under the ``value`` key."""
    exp_asserts()
    outputs = {'value': -10.3}
    return outputs

In [10]:
def single_ret_val_params(a, b):
    """Wrapper that returns ``a + b`` in a dict under the ``value`` key."""
    exp_asserts()
    outputs = {'value': a + b}
    return outputs

In [11]:
def multi_ret():
    """Wrapper that returns two numeric values plus the name of a file it writes.

    Returns:
        dict: ``value``, ``morevals`` and ``logfile`` (``testfile.txt``).
    """
    exp_asserts()
    # The context manager closes the file even if the write raises.
    with open('testfile.txt', 'w') as log_file:
        log_file.write('stuff happened')
    return {'value': 10, 'morevals': 0.5, 'logfile': 'testfile.txt'}

In [12]:
def multi_ret_params(a, b):
    """Wrapper that returns several values derived from hyperparameters ``a`` and ``b``.

    Returns:
        dict: ``value`` (``a + b``), ``morevals`` (``b``), ``logfile``
        (``testfile.txt``, written here) and ``diagram`` (``img.png``).
    """
    exp_asserts()
    # The context manager closes the file even if the write raises.
    with open('testfile.txt', 'w') as log_file:
        log_file.write('stuff happened')
    # NOTE(review): 'img.png' is returned but never created in this function - confirm this is intentional.
    return {'value': a+b, 'morevals': b, 'logfile': 'testfile.txt', 'diagram': 'img.png'}

In [13]:
def create_model_in_tensorboard_logdir():
    """Write a dummy ``model/model.pb`` into the TensorBoard logdir, if one is set.

    The file is written with local file APIs when the logdir exists on the local
    filesystem, and through ``hops.hdfs`` otherwise.

    Returns:
        dict: ``{'name': <random UUID string>}`` to be used as the model name.
    """
    from hops import tensorboard
    from hops import hdfs
    import os
    import uuid
    model_name = str(uuid.uuid4())

    logdir = tensorboard.logdir()
    if logdir:
        model_dir = logdir + '/model'
        if os.path.exists(logdir):
            os.mkdir(model_dir)
            # The context manager closes the file even if the write raises.
            with open(model_dir + '/model.pb', 'w') as model_file:
                model_file.write('model')
        else:
            hdfs.mkdir(model_dir)
            hdfs.dump("model", model_dir + '/model.pb')

    return {'name': model_name}

def create_model_in_tensorboard_logdir_params(a, b):
    """Write a dummy ``model/model.pb`` into the TensorBoard logdir and return ``a + b``.

    The file is written with local file APIs when the logdir exists on the local
    filesystem, and through ``hops.hdfs`` otherwise. Nothing is written when no
    logdir is set.

    Returns:
        dict: ``name`` (random UUID string) and ``optval`` (``a + b``).
    """
    from hops import tensorboard
    from hops import hdfs
    import os
    import uuid
    model_name = str(uuid.uuid4())

    # The previously computed (and unused) model_path concatenated logdir() with a
    # string before the None guard below, which raised TypeError without a logdir.
    logdir = tensorboard.logdir()

    # create a 'model'
    if logdir:
        model_dir = logdir + '/model'
        if os.path.exists(logdir):
            os.mkdir(model_dir)
            # The context manager closes the file even if the write raises.
            with open(model_dir + '/model.pb', 'w') as model_file:
                model_file.write('model')
        else:
            hdfs.mkdir(model_dir)
            hdfs.dump("model", model_dir + '/model.pb')

    return {'name': model_name, 'optval': a+b}

In [14]:
def export_model_in_wrapper():
    """Create the dummy model and export it from inside the wrapper itself.

    The export is skipped when no TensorBoard logdir is set. Nothing is returned.
    """
    from hops import model
    from hops import tensorboard

    created = create_model_in_tensorboard_logdir()
    if not tensorboard.logdir():
        return
    model.export(tensorboard.logdir() + '/model', created['name'])

In [15]:
def assert_return_values(logdir, hp_dict, should_return_hp_dict, return_dict, should_return_return_dict):
    """Assert the shape of what an experiment call returned.

    Args:
        logdir: path that must exist in HDFS.
        hp_dict: hyperparameter result; must be a ``dict`` when
            ``should_return_hp_dict`` is true, otherwise must be falsy.
        should_return_hp_dict: whether ``hp_dict`` is expected.
        return_dict: wrapper outputs; must be a ``dict`` when
            ``should_return_return_dict`` is true, otherwise must be falsy.
        should_return_return_dict: whether ``return_dict`` is expected.

    Raises:
        AssertionError: if any expectation is violated.
    """
    assert hdfs.exists(logdir)

    if not should_return_hp_dict:
        assert not hp_dict
    else:
        print('Asserting hp_dict {} is a dict'.format(hp_dict))
        assert type(hp_dict) is dict

    if not should_return_return_dict:
        assert not return_dict
    else:
        print('Asserting return_dict {} is a dict'.format(return_dict))
        assert type(return_dict) is dict

##### Test `experiment.launch`

In [16]:
# Hyperparameter values reused by the parameterised launch cells below.
params = {
    'a': [-5, 4.9],
    'b': [-8, 10.3],
}

# No return value, logdir in HDFS: return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    no_ret,
    local_logdir=False,
    name='no ret',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [17]:
# No return value, local logdir: return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    no_ret,
    local_logdir=True,
    name='no ret',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [18]:
# Parameterised wrapper without return value, local logdir.
logdir, return_dict = experiment.launch(
    no_ret_params,
    params,
    local_logdir=True,
    name='no ret params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [19]:
# Parameterised wrapper without return value, logdir in HDFS.
logdir, return_dict = experiment.launch(
    no_ret_params,
    params,
    local_logdir=False,
    name='no ret params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [20]:
# Bare return value, logdir in HDFS: a return_dict is expected.
logdir, return_dict = experiment.launch(
    single_ret_raw_value,
    local_logdir=False,
    name='single ret raw value',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'metric': '10'} is a dict

In [21]:
# Bare return value, local logdir: a return_dict is expected.
logdir, return_dict = experiment.launch(
    single_ret_raw_value,
    local_logdir=True,
    name='single ret raw value',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'metric': '10'} is a dict

In [22]:
# Bare return value with the two-value params grid, local logdir:
# return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    single_ret_raw_value_params,
    params,
    local_logdir=True,
    name='single ret raw value params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [23]:
# Bare return value with the two-value params grid, logdir in HDFS:
# return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    single_ret_raw_value_params,
    params,
    local_logdir=False,
    name='single ret raw value params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [24]:
# Wrapper returning a file name, logdir in HDFS, with a custom description.
logdir, return_dict = experiment.launch(
    single_ret_path,
    local_logdir=False,
    description='some custom desc',
    name='single ret path',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'logfile': 'Experiments/application_1571904075501_0011_9/testfile.txt'} is a dict

In [25]:
# Wrapper returning a file name, local logdir.
logdir, return_dict = experiment.launch(
    single_ret_path,
    local_logdir=True,
    name='single ret path',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'logfile': 'Experiments/application_1571904075501_0011_10/testfile.txt'} is a dict

In [26]:
# Parameterised wrapper returning a file name, local logdir:
# return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    single_ret_path_params,
    params,
    local_logdir=True,
    name='single ret path params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [27]:
# Parameterised wrapper returning a file name, logdir in HDFS:
# return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    single_ret_path_params,
    params,
    local_logdir=False,
    name='single ret path params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [28]:
# Wrapper returning a one-key dict, logdir in HDFS, with a custom description.
logdir, return_dict = experiment.launch(
    single_ret_val,
    local_logdir=False,
    name='single ret val',
    description='some custom desc',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'value': '-10.3'} is a dict

In [29]:
# Wrapper returning a one-key dict, local logdir.
logdir, return_dict = experiment.launch(
    single_ret_val,
    local_logdir=True,
    name='single ret val',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'value': '-10.3'} is a dict

In [30]:
# Parameterised wrapper returning a one-key dict, local logdir:
# return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    single_ret_val_params,
    params,
    local_logdir=True,
    name='single ret val params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [31]:
# Parameterised wrapper returning a one-key dict, logdir in HDFS:
# return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    single_ret_val_params,
    params,
    local_logdir=False,
    name='single ret val params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [32]:
# Wrapper returning several values, logdir in HDFS; 'value' is passed as the metric key.
logdir, return_dict = experiment.launch(
    multi_ret,
    local_logdir=False,
    name='multi ret',
    metric_key='value',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'value': '10', 'morevals': '0.5', 'logfile': 'Experiments/application_1571904075501_0011_17/testfile.txt'} is a dict

In [33]:
# Wrapper returning several values, local logdir; 'morevals' is passed as the metric key.
logdir, return_dict = experiment.launch(
    multi_ret,
    local_logdir=True,
    name='multi ret',
    metric_key='morevals',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'value': '10', 'morevals': '0.5', 'logfile': 'Experiments/application_1571904075501_0011_18/testfile.txt'} is a dict

In [34]:
# Parameterised multi-value wrapper, logdir in HDFS:
# return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    multi_ret_params,
    params,
    local_logdir=False,
    name='multi ret params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [35]:
# Parameterised multi-value wrapper, local logdir:
# return_dict is asserted to be empty.
logdir, return_dict = experiment.launch(
    multi_ret_params,
    params,
    local_logdir=True,
    name='multi ret params',
)
assert_return_values(logdir, None, False, return_dict, False)

Finished Experiment

In [36]:
# A single value per hyperparameter: with this one combination a return_dict is expected.
params = {
    'a': [-5],
    'b': [-8],
}

logdir, return_dict = experiment.launch(
    single_ret_raw_value_params,
    params,
    local_logdir=True,
    name='multi ret params single comb',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'metric': '-13'} is a dict

In [37]:
# Multi-value wrapper with the single-combination params; 'value' is passed as the metric key.
logdir, return_dict = experiment.launch(
    multi_ret_params,
    params,
    local_logdir=True,
    name='multi ret params single comb',
    metric_key='value',
)
assert_return_values(logdir, None, False, return_dict, True)

Finished Experiment 

Asserting return_dict {'value': '-13', 'morevals': '-8', 'logfile': 'Experiments/application_1571904075501_0011_22/a=-5&b=-8/testfile.txt', 'diagram': 'img.png'} is a dict

In [38]:
# Export the model from inside the wrapper, once with an HDFS logdir and once with a local one.
experiment.launch(
    export_model_in_wrapper,
    name='model exported in wrapper',
    local_logdir=False,
)

experiment.launch(
    export_model_in_wrapper,
    name='model exported in wrapper',
    local_logdir=True,
)

# Create the model in the wrapper, then export it from the notebook session
# using the logdir returned by launch.
logdir, return_dict = experiment.launch(
    create_model_in_tensorboard_logdir,
    local_logdir=True,
    name='model exported from local logdir',
)
model.export(logdir + '/model', return_dict['name'])

logdir, return_dict = experiment.launch(
    create_model_in_tensorboard_logdir,
    local_logdir=False,
    name='model exported from hdfs logdir',
)
model.export(logdir + '/model', return_dict['name'])

Finished Experiment 

Finished Experiment 

Finished Experiment 

Exported model 5a446f03-421e-4aae-bbd7-1a4ae98e1c13 as version 1 successfully.
Polling 5a446f03-421e-4aae-bbd7-1a4ae98e1c13 version 1 for model availability.
Model now available.
Finished Experiment 

Exported model fee7334d-f5b9-4381-8139-a2338a89cf29 as version 1 successfully.
Polling fee7334d-f5b9-4381-8139-a2338a89cf29 version 1 for model availability.
Model now available.

In [39]:
def assert_best_hyperparameters(return_dict, best_hyperparameters):
    """Assert that ``return_dict`` holds the expected value for every expected key.

    Values are compared after conversion to ``float``, so numeric strings match
    their numeric counterparts. Keys of ``return_dict`` that are not listed in
    ``best_hyperparameters`` are ignored.

    Raises:
        AssertionError: if a value differs from the expected one.
    """
    print('Asserting best hyperparameters in return_dict {} are {}'.format(return_dict, best_hyperparameters))
    for name, expected in best_hyperparameters.items():
        expected_value = float(expected)
        actual_value = float(return_dict[name])
        assert expected_value == actual_value, '{} not equal to {}'.format(expected, return_dict[name])

In [40]:
def assert_return_dict(logdir, return_dict):
    """Assert that ``return_dict`` equals the JSON stored in ``<logdir>/.outputs.json``.

    Args:
        logdir: directory, read through ``hdfs.load``, that contains ``.outputs.json``.
        return_dict: dict returned by the experiment call.

    Raises:
        AssertionError: if the two dicts differ.
    """
    outputs_json = hdfs.load(logdir + '/.outputs.json')
    logdir_return_dict = json.loads(outputs_json)
    # The message now names the file that is actually read (.outputs.json).
    print('Asserting returned dict {} is equal to .outputs.json in best logdir {}'.format(return_dict, logdir_return_dict))
    assert return_dict == logdir_return_dict, 'dicts are not the same {} - {}'.format(return_dict, logdir_return_dict)

##### Test Parallel Experiments `experiment.grid_search`

In [41]:
params = {'a': [-5, 4.9], 'b': [-8, 10.3]}

# Negative tests. try/except/else is used so that the "did not fail" error is raised
# outside the except clause; previously a bare `except:` also swallowed the
# AssertionError from `assert False`, so these checks could never fail.
try:
    experiment.grid_search(no_ret_params, params, name='fail no ret val')
except Exception:
    pass
else:
    raise AssertionError('should fail due to no return value')

try:
    experiment.grid_search(multi_ret_params, params, name='fail no opt key')
except Exception:
    pass
else:
    raise AssertionError('should fail due to optimization_key not being set')

logdir, hp_dict, return_dict = experiment.grid_search(single_ret_raw_value_params, params, local_logdir=True, direction=Direction.MIN)
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.grid_search(single_ret_raw_value_params, params, local_logdir=False, direction=Direction.MAX)
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.grid_search(single_ret_val_params, params, local_logdir=True, direction=Direction.MIN, optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.grid_search(single_ret_val_params, params, local_logdir=False, direction=Direction.MAX, optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.grid_search(multi_ret_params, params, local_logdir=False, direction=Direction.MIN, optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

# The third element was previously unpacked into `metric`, so the assertion below
# re-checked the return_dict of the previous call instead of this one.
logdir, hp_dict, return_dict = experiment.grid_search(multi_ret_params, params, local_logdir=True, direction=Direction.MAX, optimization_key='morevals')
assert_return_values(logdir, hp_dict, True, return_dict, True)

params = {'a': [-1, 1.5], 'b': [-1.5, 1]}

# Make sure minimization works
logdir, hp_dict, return_dict = experiment.grid_search(single_ret_raw_value_params, params, local_logdir=True, direction=Direction.MIN)
assert_return_values(logdir, hp_dict, True, return_dict, True)
assert_best_hyperparameters(hp_dict, {'a': -1, 'b': -1.5})
assert_return_dict(logdir, return_dict)

# Make sure maximization works
logdir, hp_dict, return_dict = experiment.grid_search(single_ret_raw_value_params, params, local_logdir=True, direction=Direction.MAX)
assert_return_values(logdir, hp_dict, True, return_dict, True)
assert_best_hyperparameters(hp_dict, {'a': 1.5, 'b': 1})
assert_return_dict(logdir, return_dict)

# Export the model written by the best trial, for a local and for an HDFS logdir.
best_logdir, hp_dict, return_dict = experiment.grid_search(create_model_in_tensorboard_logdir_params, params, local_logdir=True, name='grid search model exported from local logdir', optimization_key='optval', direction=Direction.MIN)
model.export(best_logdir + '/model', return_dict['name'])

best_logdir, hp_dict, return_dict = experiment.grid_search(create_model_in_tensorboard_logdir_params, params, local_logdir=False, name='grid search model exported from hdfs logdir', optimization_key='optval', direction=Direction.MAX)
model.export(best_logdir + '/model', return_dict['name'])

An error was encountered:
Invalid status code '400' from http://10.0.2.15:8998/sessions/16/statements/41 with error payload: {"msg":"requirement failed: Session isn't active."}


##### Test Parallel Experiments `experiment.random_search`

In [42]:
params = {'a': [-5, 4.9], 'b': [-8, 10.3]}

# Negative tests. try/except/else is used so that the "did not fail" error is raised
# outside the except clause; previously a bare `except:` also swallowed the
# AssertionError from `assert False`, so these checks could never fail.
try:
    experiment.random_search(no_ret_params, params, samples=2, name='fail opt no ret')
except Exception:
    pass
else:
    raise AssertionError('should fail due to no return value')

try:
    experiment.random_search(multi_ret_params, params, samples=2, name='fail opt no key')
except Exception:
    pass
else:
    raise AssertionError('should fail due to optimization_key not being set')

logdir, hp_dict, return_dict = experiment.random_search(single_ret_raw_value_params, params, samples=2, local_logdir=True, direction=Direction.MIN)
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(single_ret_raw_value_params, params, samples=2, local_logdir=False, direction=Direction.MAX)
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(single_ret_val_params, params, samples=2, local_logdir=True, direction=Direction.MIN, optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(single_ret_val_params, params, samples=2, local_logdir=False, direction=Direction.MAX, optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(multi_ret_params, params, samples=2, local_logdir=False, direction=Direction.MAX, optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(multi_ret_params, params, samples=2, local_logdir=True, direction=Direction.MIN, optimization_key='morevals')
assert_return_values(logdir, hp_dict, True, return_dict, True)

params = {'a': [-1, 1], 'b': [-1, 1]}

# Make sure minimization works
logdir, hp_dict, return_dict = experiment.random_search(single_ret_raw_value_params, params, local_logdir=True, direction=Direction.MIN, samples=100)
assert_return_values(logdir, hp_dict, True, return_dict, True)
assert_best_hyperparameters(hp_dict, {'a': -1, 'b': -1})
assert_return_dict(logdir, return_dict)

# Make sure maximization works
logdir, hp_dict, return_dict = experiment.random_search(single_ret_raw_value_params, params, local_logdir=True, direction=Direction.MAX, samples=100)
assert_return_values(logdir, hp_dict, True, return_dict, True)
assert_best_hyperparameters(hp_dict, {'a': 1, 'b': 1})
assert_return_dict(logdir, return_dict)

# Export the model written by the best trial, for a local and for an HDFS logdir.
best_logdir, hp_dict, return_dict = experiment.random_search(create_model_in_tensorboard_logdir_params, params, local_logdir=True, name='random search model exported from local logdir', optimization_key='optval', direction=Direction.MIN)
model.export(best_logdir + '/model', return_dict['name'])

best_logdir, hp_dict, return_dict = experiment.random_search(create_model_in_tensorboard_logdir_params, params, local_logdir=False, name='random search model exported from hdfs logdir', optimization_key='optval', direction=Direction.MAX)
model.export(best_logdir + '/model', return_dict['name'])

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Parallel Experiments `experiment.differential_evolution`

In [43]:
params = {'a': [1, 4.9], 'b': [3, 10.3]}

# Negative tests. try/except/else is used so that the "did not fail" error is raised
# outside the except clause; previously a bare `except:` also swallowed the
# AssertionError from `assert False`, so these checks could never fail.
try:
    experiment.differential_evolution(no_ret_params, params, name='fail opt no ret')
except Exception:
    pass
else:
    raise AssertionError('should fail due to no return value')

try:
    experiment.differential_evolution(multi_ret_params, params, name='fail opt no key')
except Exception:
    pass
else:
    raise AssertionError('should fail due to optimization_key not being set')

logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_raw_value_params, params, local_logdir=True, direction=Direction.MIN)
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_raw_value_params, params, local_logdir=False, direction=Direction.MAX)
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_val_params, params, local_logdir=True, direction=Direction.MIN, optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_val_params, params, local_logdir=False, direction=Direction.MAX, optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(multi_ret_params, params, local_logdir=False, direction=Direction.MAX, optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(multi_ret_params, params, local_logdir=True, direction=Direction.MIN, optimization_key='morevals')
assert_return_values(logdir, hp_dict, True, return_dict, True)

params = {'a': [4, 5], 'b': [1, 2]}

# Make sure minimization works
logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_raw_value_params, params, local_logdir=True, direction=Direction.MIN)
assert_return_values(logdir, hp_dict, True, return_dict, True)
assert_best_hyperparameters(hp_dict, {'a': 4, 'b': 1})
assert_return_dict(logdir, return_dict)

# Make sure maximization works
logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_raw_value_params, params, local_logdir=True, direction=Direction.MAX)
assert_return_values(logdir, hp_dict, True, return_dict, True)
assert_best_hyperparameters(hp_dict, {'a': 5, 'b': 2})
assert_return_dict(logdir, return_dict)

# Export the model written by the best trial, for a local and for an HDFS logdir.
best_logdir, hp_dict, return_dict = experiment.differential_evolution(create_model_in_tensorboard_logdir_params, params, local_logdir=True, name='diff evo model exported from local logdir', optimization_key='optval', direction=Direction.MIN)
model.export(best_logdir + '/model', return_dict['name'])

best_logdir, hp_dict, return_dict = experiment.differential_evolution(create_model_in_tensorboard_logdir_params, params, local_logdir=False, name='diff evo model exported from hdfs logdir', optimization_key='optval', direction=Direction.MAX)
model.export(best_logdir + '/model', return_dict['name'])

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [44]:
def dist_exp_asserts():
    """Check the environment seen by one task of a distributed-training experiment.

    Reads the task role from the ``TF_CONFIG`` environment variable and asserts
    that the TensorBoard logdir is available to the ``chief`` and ``evaluator``
    roles only, and unset for ``worker`` and ``ps``.

    Raises:
        AssertionError: if any check fails.
        KeyError: if ``TF_CONFIG`` is not set or lacks ``task``/``type``.
    """
    from hops import tensorboard
    from hops import devices
    from hops import hdfs
    import json
    import os

    assert devices.get_num_gpus() == 0
    assert hdfs.project_path() == hdfs.project_path(hdfs.project_name())

    tf_config = json.loads(os.environ['TF_CONFIG'])
    role = tf_config['task']['type']

    logdir = tensorboard.logdir()
    print(logdir)

    # Only chief and evaluator role should have access to TB logdir to write checkpoints/summary/evaluation etc
    if role == 'chief':
        assert logdir is not None, 'chief TB is None'
        if tensorboard.local_logdir_bool:
            assert "hdfs://" not in logdir, 'chief TB is not local'
            assert os.path.exists(logdir), 'chief local TB path does not exists'
        else:
            assert "hdfs://" in logdir, 'chief TB is not in HDFS'
            assert hdfs.exists(logdir), 'chief hdfs TB path does not exists'
    elif role == 'worker' or role == 'ps':
        assert logdir is None, 'ps or worker TB is not None {}'.format(tf_config)
    elif role == 'evaluator':
        assert logdir is not None, 'evaluator TB is None'
        assert hdfs.exists(logdir), 'evaluator TB path does not exists'

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Distributed Training `experiment.collective_all_reduce`

In [45]:
def no_ret():
    """Distributed-training wrapper that only runs ``dist_exp_asserts`` and returns nothing.

    Rebinds the name ``no_ret`` that the Experiment API tests defined earlier in this notebook.
    """
    dist_exp_asserts()
    return None

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [46]:
def multi_return():
    """Distributed-training wrapper that writes two files and returns several values.

    Returns:
        dict: ``value``, ``morevals``, ``logfile`` (``testfile.txt``) and
        ``diagram`` (``img.png``); both files are written here.
    """
    dist_exp_asserts()
    # Context managers close each file even if the write raises.
    with open('testfile.txt', 'w') as log_file:
        log_file.write('stuff happened')
    with open('img.png', 'w') as diagram_file:
        diagram_file.write('stuff happened')
    return {'value': 10, 'morevals': 3, 'logfile': 'testfile.txt', 'diagram': 'img.png'}

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [47]:
# Collective all-reduce with a wrapper that returns nothing.
experiment.collective_all_reduce(
    no_ret,
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [48]:
# Collective all-reduce with a wrapper that returns several values and files.
experiment.collective_all_reduce(
    multi_return,
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [49]:
# Export the model from inside the wrapper, once with an HDFS logdir and once with a local one.
experiment.collective_all_reduce(
    export_model_in_wrapper,
    name='collective model exported in wrapper',
    local_logdir=False,
)

experiment.collective_all_reduce(
    export_model_in_wrapper,
    name='collective model exported in wrapper',
    local_logdir=True,
)

# Create the model in the wrapper, then export it from the notebook session.
logdir, return_dict = experiment.collective_all_reduce(
    create_model_in_tensorboard_logdir,
    local_logdir=True,
    name='collective model exported from local logdir',
)
model.export(logdir + '/model', return_dict['name'])

logdir, return_dict = experiment.collective_all_reduce(
    create_model_in_tensorboard_logdir,
    local_logdir=False,
    name='collective model exported from hdfs logdir',
)
model.export(logdir + '/model', return_dict['name'])

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [50]:
# Mirrored strategy with a wrapper that returns nothing.
experiment.mirrored(
    no_ret,
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [51]:
# Mirrored strategy with a wrapper that returns several values and files.
experiment.mirrored(
    multi_return,
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [52]:
# Export the model from inside the wrapper, once with an HDFS logdir and once with a local one.
experiment.mirrored(
    export_model_in_wrapper,
    name='mirrored model exported in wrapper',
    local_logdir=False,
)

experiment.mirrored(
    export_model_in_wrapper,
    name='mirrored model exported in wrapper',
    local_logdir=True,
)

# Create the model in the wrapper, then export it from the notebook session.
logdir, return_dict = experiment.mirrored(
    create_model_in_tensorboard_logdir,
    local_logdir=True,
    name='mirrored model exported from local logdir',
)
model.export(logdir + '/model', return_dict['name'])

logdir, return_dict = experiment.mirrored(
    create_model_in_tensorboard_logdir,
    local_logdir=False,
    name='mirrored model exported from hdfs logdir',
)
model.export(logdir + '/model', return_dict['name'])

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [53]:
#experiment.parameter_server(no_ret)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [54]:
#experiment.parameter_server(multi_return)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [55]:
#experiment.parameter_server(export_model_in_wrapper, name='mirrored model exported in wrapper', local_logdir=False)

#experiment.parameter_server(export_model_in_wrapper, name='mirrored model exported in wrapper', local_logdir=True)

#logdir, return_dict = experiment.parameter_server(create_model_in_tensorboard_logdir, local_logdir=True, name='mirrored model exported from local logdir')
#serving.export(logdir + '/model', return_dict['name'], 1)

#logdir, return_dict = experiment.parameter_server(create_model_in_tensorboard_logdir, local_logdir=False, name='mirrored model exported from hdfs logdir')
#serving.export(logdir + '/model', return_dict['name'], 1)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


## HopsFS Tests

##### Test HopsFS operations

- `hdfs.project_user()`
- `hdfs.project_name()`
- `hdfs.project_path()`
- `hdfs.exists()`
- `hdfs.load()`
- `hdfs.copy_to_hdfs()`
- `hdfs.copy_to_local()`
- `hdfs.ls()`
- `hdfs.lsl()`
- `hdfs.glob()`
- `hdfs.cp()`
- `hdfs.rmr()`
- `hdfs.rename()`
- `hdfs.stat()`
- `hdfs.isdir()`
- `hdfs.isfile()`
- `hdfs.add_module()`
- `hdfs.delete()`
- `hdfs.get_plain_path()`

In [56]:
# The project name must be contained in the project user name ...
project_user = hdfs.project_user()
project_name = hdfs.project_name()
assert project_name in project_user

# ... and in the project path.
project_path = hdfs.project_path()
assert project_name in project_path

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [57]:
# Loading a file through a path relative to the project must yield non-empty content.
logs_README = hdfs.load('Logs/README.md')
assert len(logs_README) > 0

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [58]:
# Dumping a string must create the target file.
hdfs.dump('test', 'Logs/README_dump_test.md')
assert hdfs.exists('Logs/README_dump_test.md')

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [59]:
# The dumped file must read back as the string that was written (load returns bytes).
logs_README_dumped = hdfs.load('Logs/README_dump_test.md')
assert logs_README_dumped.decode('utf-8') == 'test'

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [60]:
# copy_to_hdfs file relative path

# First upload: the file must appear in Resources with the written content.
with open('upload.txt', 'w') as f:
    f.write("first upload")
hdfs.copy_to_hdfs("upload.txt", "Resources")
assert hdfs.exists("Resources/upload.txt")
hdfs_copied_file = hdfs.load("Resources/upload.txt")
assert "first upload" == hdfs_copied_file.decode("utf-8"), "first content does not match"

# Second upload with overwrite=True: the content in HDFS must be replaced.
with open('upload.txt', 'w') as f:
    f.write("second upload")
hdfs.copy_to_hdfs("upload.txt", "Resources", overwrite=True)
assert hdfs.exists("Resources/upload.txt")
hdfs_copied_file = hdfs.load("Resources/upload.txt")
assert "second upload" == hdfs_copied_file.decode("utf-8"), "second content does not match"

# Third upload without overwrite: the target exists, so an IOError is expected.
# The AssertionError is not an IOError and therefore propagates if no error is raised.
try:
    hdfs.copy_to_hdfs("upload.txt", "Resources")
    assert False, "copy_to_hdfs without overwrite should fail when the target already exists"
except IOError:
    pass

# Clean up the HDFS copy and the local file.
hdfs.rmr("Resources/upload.txt")
os.remove("upload.txt")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [61]:
# copy_to_hdfs file absolute path

# First upload: the file must appear in Resources with the written content.
with open('upload_absolute.txt', 'w') as f:
    f.write("first upload")
hdfs.copy_to_hdfs(os.getcwd() + "/upload_absolute.txt", "Resources")
assert hdfs.exists("Resources/upload_absolute.txt")
hdfs_copied_file = hdfs.load("Resources/upload_absolute.txt")
assert "first upload" == hdfs_copied_file.decode("utf-8"), "first content does not match"

# Second upload with overwrite=True: the content in HDFS must be replaced.
with open('upload_absolute.txt', 'w') as f:
    f.write("second upload")
hdfs.copy_to_hdfs(os.getcwd() + "/upload_absolute.txt", "Resources", overwrite=True)
assert hdfs.exists("Resources/upload_absolute.txt")
hdfs_copied_file = hdfs.load("Resources/upload_absolute.txt")
assert "second upload" == hdfs_copied_file.decode("utf-8"), "second content does not match"

# Third upload without overwrite: the target exists, so an IOError is expected.
# The absolute path is used here too, so the whole cell exercises the absolute-path variant.
try:
    hdfs.copy_to_hdfs(os.getcwd() + "/upload_absolute.txt", "Resources")
    assert False, "copy_to_hdfs without overwrite should fail when the target already exists"
except IOError:
    pass

# Clean up the HDFS copy and the local file.
hdfs.rmr("Resources/upload_absolute.txt")
os.remove("upload_absolute.txt")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [62]:
# copy_to_hdfs directory relative path

# Create the local directory to upload, unless it is already there.
if not os.path.exists("upload_dir"):
    os.mkdir("upload_dir")

# Precondition: the directory must not exist in HDFS yet.
assert not hdfs.exists("Resources/upload_dir")
with open('upload_dir/upload.txt', 'w') as f:
    f.write("first upload")
# First upload: the file inside the directory must match the local file.
hdfs.copy_to_hdfs("upload_dir", "Resources")
hdfs_copied_file = hdfs.load("Resources/upload_dir/upload.txt")
assert hdfs.exists("Resources/upload_dir")
with open('upload_dir/upload.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "first content compare failed"

# Second upload with overwrite=True after changing the local content.
with open('upload_dir/upload.txt', 'w') as f:
    f.write("second upload")
hdfs.copy_to_hdfs("upload_dir", "Resources", overwrite=True)
hdfs_copied_file = hdfs.load("Resources/upload_dir/upload.txt")
assert hdfs.exists("Resources/upload_dir")
with open('upload_dir/upload.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "second content compare failed"

# Clean up the local directory and the HDFS copy.
shutil.rmtree("upload_dir")
hdfs.rmr("Resources/upload_dir")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [63]:
# copy_to_hdfs directory absolute path

# Create the local directory to upload, unless it is already there.
if not os.path.exists("upload_dir_absolute"):
    os.mkdir("upload_dir_absolute")
    
# Precondition: the directory must not exist in HDFS yet.
assert not hdfs.exists("Resources/upload_dir_absolute")
with open('upload_dir_absolute/upload.txt', 'w') as f:
    f.write("first upload")
# First upload, addressed through an absolute local path built from the working directory.
hdfs.copy_to_hdfs(os.getcwd() + "/upload_dir_absolute", "Resources")
hdfs_copied_file = hdfs.load("Resources/upload_dir_absolute/upload.txt")
assert hdfs.exists("Resources/upload_dir_absolute")
with open('upload_dir_absolute/upload.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "first content compare failed"

# Second upload with overwrite=True after changing the local content.
with open('upload_dir_absolute/upload.txt', 'w') as f:
    f.write("second upload")
hdfs.copy_to_hdfs(os.getcwd() + "/upload_dir_absolute", "Resources", overwrite=True)
hdfs_copied_file = hdfs.load("Resources/upload_dir_absolute/upload.txt")
assert hdfs.exists("Resources/upload_dir_absolute")
with open('upload_dir_absolute/upload.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "second content compare failed"

# Clean up the local directory and the HDFS copy.
shutil.rmtree("upload_dir_absolute")
hdfs.rmr("Resources/upload_dir_absolute")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [64]:
#copy_to_local file
# Exercises hdfs.copy_to_local() on a single file and uses the local file's
# modification time to tell whether a call rewrote it:
#   - a repeated download without changes must leave the local file untouched,
#   - changed HDFS content must be picked up,
#   - overwrite=True must rewrite the file,
#   - a plain download after an overwrite must again leave it untouched.
# NOTE(review): the "mtime must differ" checks assume the filesystem timestamp
# resolution is finer than the time between two downloads -- confirm.

# Download first time
hdfs.dump("initial content", "Resources/somefile.txt")
hdfs.copy_to_local("Resources/somefile.txt")
hdfs_copied_file = hdfs.load("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "first content compare failed"
first_modified = os.path.getmtime("somefile.txt")

# Download second time
hdfs.copy_to_local("Resources/somefile.txt")
hdfs_copied_file = hdfs.load("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "second content compare failed"
second_modified = os.path.getmtime("somefile.txt")
assert first_modified == second_modified, "modified time not matching"

# Content changing on disk
hdfs.dump("content changed at some point", "Resources/somefile.txt")
hdfs_new_content = hdfs.load("Resources/somefile.txt")
hdfs.copy_to_local("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_new_content.decode("utf-8") == local_copied_file, "third content compare failed"
third_modified = os.path.getmtime("somefile.txt")
# This check fails when the times are EQUAL, so the message says so.
assert second_modified != third_modified, "modified time did not change after the HDFS content changed"

# Download last time with overwrite, file should have changed on disk
hdfs.copy_to_local("Resources/somefile.txt", overwrite=True)
hdfs_copied_file = hdfs.load("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "fourth content compare failed"
fourth_modified = os.path.getmtime("somefile.txt")
# Same as above: a failure here means the overwrite left the file untouched.
assert third_modified != fourth_modified, "modified time did not change after download with overwrite=True"

# Download again to make sure overwrite did not cause problems
hdfs.copy_to_local("Resources/somefile.txt")
hdfs_copied_file = hdfs.load("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "fifth content compare failed"
fifth_modified = os.path.getmtime("somefile.txt")
assert fourth_modified == fifth_modified, "modified time not matching"

# Clean up the HDFS file and its local copy.
hdfs.rmr("Resources/somefile.txt")
os.remove("somefile.txt")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [65]:
#copy_to_local directory
# Exercises hdfs.copy_to_local() on a whole directory and uses the local
# directory's modification time to tell whether a call rewrote it.

# Precondition: there is no local "Resources" directory yet.
assert not os.path.exists("Resources")
hdfs.copy_to_local("Resources")
first_modified = os.path.getmtime("Resources")
assert os.path.exists("Resources")
assert os.path.isdir("Resources")

# A repeated copy without overwrite is expected to leave the local directory untouched.
hdfs.copy_to_local("Resources")
second_modified = os.path.getmtime("Resources")
assert first_modified == second_modified

# With overwrite=True the local directory is expected to be rewritten.
localized_dir = hdfs.copy_to_local("Resources", overwrite=True)
third_modified = os.path.getmtime("Resources")
assert not second_modified == third_modified
num_files_first = len(os.listdir(localized_dir))

# Add a new file, it should also be localized
hdfs.dump("a wild file appeared", "Resources/newfile.txt")
hdfs.copy_to_local("Resources")
fourth_modified = os.path.getmtime("Resources")
# NOTE(review): this repeats the check made after the second copy and does not
# involve fourth_modified -- confirm whether it is intentional.
assert first_modified == second_modified
num_files_second = len(os.listdir(localized_dir))
# Exactly one more entry than before the new HDFS file was added.
assert (num_files_first + 1) == num_files_second
assert not third_modified == fourth_modified

# Clean up the extra HDFS file and the local copy of the directory.
hdfs.rmr("Resources/newfile.txt")
shutil.rmtree("Resources")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [66]:
# Call hdfs.glob() and hdfs.lsl() on the Logs path; their results are only
# stored here, not asserted on.
logs_files_md = hdfs.glob("Logs/*.md")
logs_path_names = hdfs.lsl("Logs/")
# Remove Logs/test.txt if it is present so the cell ends with the file absent.
if hdfs.exists("Logs/test.txt"):
    hdfs.rmr("Logs/test.txt")
assert not hdfs.exists("Logs/test.txt")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [67]:
# Write a small file to Resources, copy it into Logs/ with hdfs.cp() and check
# that the Logs/ listing mentions it.
hdfs.dump("dummy", "Resources/test.txt")
hdfs.cp("Resources/test.txt", "Logs/")
logs_files = hdfs.ls("Logs/")
# Substring match against the comma-joined listing.
assert "test.txt" in ",".join(logs_files)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [68]:
# Create a directory with hdfs.mkdir() and confirm that it exists afterwards.
hdfs.mkdir("Logs/test_dir")
assert hdfs.exists("Logs/test_dir")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [69]:
# Remove Logs/test_dir with hdfs.rmr() and check that the Logs/ listing got shorter.
logs_files_prior_delete = hdfs.ls("Logs/")
hdfs.rmr("Logs/test_dir")
logs_files_after_delete = hdfs.ls("Logs/")
assert len(logs_files_prior_delete) > len(logs_files_after_delete)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [70]:
# Precondition for the move test that follows: README_dump_test.md must show up
# in the Logs/ listing.
# NOTE(review): presumably the file was written by an earlier cell -- confirm.
logs_files_prior_move = hdfs.ls("Logs/")
assert "README_dump_test.md" in ",".join(logs_files_prior_move)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [71]:
# hdfs.move(): afterwards the Logs/ listing must mention the new file name and
# no longer mention the old one.
hdfs.move("Logs/README_dump_test.md", "Logs/README_dump_test2.md")
logs_files_after_move = hdfs.ls("Logs/")
joined_after_move = ",".join(logs_files_after_move)
assert "README_dump_test.md" not in joined_after_move
assert "README_dump_test2.md" in joined_after_move

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [72]:
# Precondition for the rename test that follows: README_dump_test2.md must show
# up in the Logs/ listing.
logs_files_prior_rename = hdfs.ls("Logs/")
assert "README_dump_test2.md" in ",".join(logs_files_prior_rename)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [73]:
# hdfs.rename(): give the file its original name back, then check the Logs/
# listing for the old and the new path fragment.
hdfs.rename("Logs/README_dump_test2.md", "Logs/README_dump_test.md")
logs_files_after_rename = hdfs.ls("Logs/")
joined_after_rename = ",".join(logs_files_after_rename)
assert "Logs/README_dump_test2.md" not in joined_after_rename
assert "Logs/README_dump_test.md" in joined_after_rename

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [74]:
# Change the mode of Logs/README.md with hdfs.chmod() and read it back through
# hdfs.stat().
# NOTE(review): 775 is a decimal literal, not the octal 0o775 -- confirm which
# representation hdfs.chmod() and st_mode are meant to use.
# The first stat result is replaced before it is read.
file_stat = hdfs.stat("Logs/README.md")
hdfs.chmod("Logs/README.md", 775)
file_stat = hdfs.stat("Logs/README.md")
assert 775 == file_stat.st_mode

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [75]:
# Same round trip as the previous chmod check, with a different mode value.
# NOTE(review): 777 is a decimal literal, not the octal 0o777 -- confirm.
hdfs.chmod("Logs/README.md", 777)
file_stat = hdfs.stat("Logs/README.md")
assert 777 == file_stat.st_mode

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [76]:
# Read st_uid from the stat result of the previous cell (stored, not asserted on).
file_owner = file_stat.st_uid
# hdfs.exists(): true for an existing directory, false for a made-up path.
assert hdfs.exists("Logs/")
assert not hdfs.exists("Not_Existing/neither_am_i")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [77]:
# hdfs.isdir(): expected true for the Resources directory, false for a file inside it.
assert hdfs.isdir("Resources")
assert not hdfs.isdir("Resources/README.md")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [78]:
# hdfs.isfile(): expected true for a file, false for the Resources directory.
assert hdfs.isfile("Resources/README.md")
assert not hdfs.isfile("Resources")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [79]:
# Write a one-function Python module to HDFS, register it with hdfs.add_module()
# and import it.
hdfs.dump("def simple():\n\treturn 5", "Resources/my_module.py")
py_path = hdfs.add_module("Resources/my_module.py")
# The value returned by add_module() must be an entry of sys.path.
assert py_path in sys.path
# Imported here rather than at the top because the module only exists after the dump above.
import my_module
assert my_module.simple() == 5

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [80]:
# hdfs.get_plain_path() is expected to strip the "hdfs://host:port" prefix and
# keep the path part.
# The host and project name below are fixed sample values inside the string.
plain = hdfs.get_plain_path("hdfs://10.0.2.15:8020/Projects/demo_deep_learning_admin000/Models/")
assert plain == "/Projects/demo_deep_learning_admin000/Models/"

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [81]:
# Create a directory, confirm it exists, remove it with hdfs.delete() and
# confirm it is gone.
hdfs.mkdir("Logs/test_delete_dir")
assert hdfs.exists("Logs/test_delete_dir")
hdfs.delete("Logs/test_delete_dir")
assert not hdfs.exists("Logs/test_delete_dir")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


## Feature Store Tests

These tests require that you have the following files in the Resources directory:

- `attendances_features.csv`
- `games_features.csv`
- `players_features.csv`
- `season_scores_features.csv`
- `teams_features.csv`

These files can be downloaded from here: `http://snurran.sics.se/hops/hops-util-py_test/`

##### Test Featurestore Create Feature Group Operations (`featurestore.create_featuregroup()`)

In [82]:
def load_fs_sample_data():
    """
    Read the five sample feature CSV files from "Resources/" under hdfs.project_path().

    Each file is read through the notebook's `spark` session with the "header"
    and "inferSchema" options set to "true".

    Returns:
        A tuple of five Spark dataframes in this order: games, players, teams,
        season scores, attendances.
    """
    resources_path = hdfs.project_path() + "Resources/"

    def read_csv(file_name):
        # One reader chain shared by all five files.
        reader = spark.read.format("csv")
        reader = reader.option("header", "true")
        reader = reader.option("inferSchema", "true")
        return reader.load(resources_path + file_name)

    # Tuple elements are evaluated left to right, so the files are read in the listed order.
    return (
        read_csv("games_features.csv"),
        read_csv("players_features.csv"),
        read_csv("teams_features.csv"),
        read_csv("season_scores_features.csv"),
        read_csv("attendances_features.csv"),
    )
(games_features_df,
 players_features_df,
 teams_features_df,
 season_scores_features_df,
 attendances_features_df) = load_fs_sample_data()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [83]:
# Create the "games_features" feature group from the games sample dataframe.
# NOTE(review): the description is identical to the one passed for
# "season_scores_features" below -- looks copy-pasted; confirm.
featurestore.create_featuregroup(
    games_features_df,
    "games_features",
    description="Features of average season scores for football teams"
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [84]:
# Create the "teams_features" feature group from the teams sample dataframe.
# NOTE(review): the description talks about a spanish version, but this is the
# original teams_features group -- looks copy-pasted; confirm.
featurestore.create_featuregroup(
    teams_features_df,
    "teams_features",
    description="a spanish version of teams_features"
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [85]:
# Create the "season_scores_features" feature group from the season scores sample dataframe.
featurestore.create_featuregroup(
    season_scores_features_df,
    "season_scores_features",
    description="Features of average season scores for football teams"
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [86]:
# Create the "attendances_features" feature group from the attendances sample dataframe.
featurestore.create_featuregroup(
    attendances_features_df,
    "attendances_features",
    description="Features of average attendance of games of football teams"
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [87]:
# Read "teams_features" back and derive a dataframe whose three columns carry
# Spanish names; it is the input for the "teams_features_spanish" groups.
teams_features_1_df = featurestore.get_featuregroup("teams_features")
teams_features_2_df = teams_features_1_df
for english_name, spanish_name in (
    ("team_id", "equipo_id"),
    ("team_budget", "equipo_presupuesto"),
    ("team_position", "equipo_posicion"),
):
    teams_features_2_df = teams_features_2_df.withColumnRenamed(english_name, spanish_name)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [88]:
# Create "teams_features_spanish" without naming a feature store or a version;
# all four statistics computations are switched off.
featurestore.create_featuregroup(
    teams_features_2_df,
    "teams_features_spanish",
    description="a spanish version of teams_features",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [89]:
# Same call as in the previous cell, but with the feature store and
# featuregroup_version=1 passed explicitly.
# NOTE(review): presumably the previous cell already created version 1 of this
# group by default, so this creates it a second time -- confirm that is intended.
featurestore.create_featuregroup(
    teams_features_2_df,
    "teams_features_spanish",
    description="a spanish version of teams_features",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    featurestore=featurestore.project_featurestore(),
    featuregroup_version=1
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [90]:
# Create version 2 of "teams_features_spanish" from the same dataframe.
featurestore.create_featuregroup(
    teams_features_2_df,
    "teams_features_spanish",
    description="a spanish version of teams_features",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    featuregroup_version=2
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [91]:
# Create an on-demand feature group defined by a SQL query over games_features_1.
# The storage connector name is built as "<project name>_featurestore".
from hops import hdfs
query = "SELECT * FROM games_features_1 WHERE score > 1"
storage_connector = hdfs.project_name() + "_featurestore"
featuregroup_name = "games_features_on_demand"
featurestore.create_on_demand_featuregroup(query, featuregroup_name, storage_connector)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [92]:
# Every feature group created in the cells above must be listed by
# featurestore.get_featuregroups() under its "<name>_<version>" entry.
for expected_featuregroup in (
    "games_features_1",
    "teams_features_1",
    "season_scores_features_1",
    "attendances_features_1",
    "teams_features_spanish_1",
    "teams_features_spanish_2",
    "games_features_on_demand_1",
):
    # The listing is fetched anew for every expected name.
    assert expected_featuregroup in featurestore.get_featuregroups()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Featurestore Utility Operations, 

- `featurestore.get_featurestore_metadata()`,
- `featurestore.project_featurestore()`, 
- `featurestore.get_latest_featuregroup_version()`, 
- `featurestore.get_features_list()`

In [93]:
# Fetch the feature store metadata with update_cache=True; the return value is not checked.
featurestore.get_featurestore_metadata(update_cache=True)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [94]:
# The project's feature store must be named "<project name>_featurestore".
assert featurestore.project_featurestore() == hdfs.project_name() + "_featurestore"

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [95]:
# The project's own feature store must be among the feature stores listed for the project.
assert featurestore.project_featurestore() in featurestore.get_project_featurestores()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [96]:
# Exactly one feature store is expected to be listed for the test project.
assert len(featurestore.get_project_featurestores()) == 1

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [97]:
# "teams_features_spanish" was created with versions 1 and 2 above, so the latest must be 2.
assert featurestore.get_latest_featuregroup_version("teams_features_spanish") == 2

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [98]:
# "teams_features" is expected to report 1 as its latest version.
assert featurestore.get_latest_featuregroup_version("teams_features") == 1

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [99]:
# "away_team_id" must be among the names returned by featurestore.get_features_list().
assert "away_team_id" in featurestore.get_features_list()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [100]:
# "home_team_id" must be among the names returned by featurestore.get_features_list().
assert "home_team_id" in featurestore.get_features_list()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [101]:
# The storage connector listing must contain the pair ("<project name>_featurestore", 'JDBC').
assert (hdfs.project_name() + "_featurestore", 'JDBC') in featurestore.get_storage_connectors()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [102]:
# At least three storage connectors are expected to be listed.
assert len(featurestore.get_storage_connectors()) >= 3

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Read operations of Features and Feature Groups, 

- `featurestore.get_feature()`, 
- `featurestore.get_features()`, 
- `featurestore.get_featuregroup()`

In [103]:
# featurestore.get_feature() with only the feature name: expects 50 rows and a
# single column named "team_budget".
tmp = featurestore.get_feature("team_budget")
assert tmp.count() == 50
assert len(tmp.columns) == 1
assert "team_budget" in tmp.columns

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [104]:
# featurestore.get_feature() with feature store, feature group, version and
# dataframe type all passed explicitly; same expectations as the call with defaults.
tmp = featurestore.get_feature(
    "team_budget", 
    featurestore=featurestore.project_featurestore(), 
    featuregroup="teams_features", 
    featuregroup_version = 1,
    dataframe_type = "spark"
)
assert tmp.count() == 50
assert len(tmp.columns) == 1
assert "team_budget" in tmp.columns

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [105]:
# featurestore.get_featuregroup() with only the group name: expects 50 rows and
# exactly the three team columns.
tmp = featurestore.get_featuregroup("teams_features")
assert tmp.count() == 50
assert len(tmp.columns) == 3
for expected_column in ("team_budget", "team_id", "team_position"):
    assert expected_column in tmp.columns

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [106]:
# featurestore.get_featuregroup() with feature store, version and dataframe
# type passed explicitly; same expectations as the call with defaults.
tmp = featurestore.get_featuregroup(
    "teams_features",
    featurestore=featurestore.project_featurestore(),
    featuregroup_version=1,
    dataframe_type="spark",
)
assert tmp.count() == 50
assert len(tmp.columns) == 3
for expected_column in ("team_budget", "team_id", "team_position"):
    assert expected_column in tmp.columns

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [107]:
# featurestore.get_features() with two unqualified feature names and no other
# arguments: the result must have exactly these two columns and 50 rows.
features = ["team_budget", "average_attendance"]
tmp = featurestore.get_features(
    features
)
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [108]:
# Same query with each feature prefixed by "<featuregroup>_<version>."; the
# resulting columns are expected to carry the unprefixed names.
features = ["teams_features_1.team_budget", "attendances_features_1.average_attendance"]
tmp = featurestore.get_features(features)
assert set(["team_budget", "average_attendance"]) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [109]:
# featurestore.get_features() with the feature store and a
# featuregroup-name -> version mapping passed explicitly.
# `features` is read again by the next cell.
features = ["team_budget", "average_attendance"]
tmp = featurestore.get_features(
    features,
    featurestore=featurestore.project_featurestore(),
    featuregroups_version_dict={
        "teams_features": 1, 
        "attendances_features": 1
    }
)
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [110]:
# Same as the previous cell plus an explicit join_key and dataframe_type.
# Relies on `features` as assigned in the previous cell.
tmp = featurestore.get_features(
    features,
    featurestore=featurestore.project_featurestore(),
    featuregroups_version_dict={
        "teams_features": 1, 
        "attendances_features": 1
    },
    join_key = "team_id",
    dataframe_type = "spark"
)
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [111]:
# featurestore.get_features() with four unqualified feature names: the result
# must have exactly these four columns and 50 rows.
features = ["team_budget", "average_attendance",
    "team_position", "sum_attendance"
    ]
tmp = featurestore.get_features(
   features
)
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [112]:
# featurestore.get_features() where the version mapping names only
# "teams_features"; both requested columns also appear in the teams_features
# checks above.
features = ["team_budget", "team_id"]
tmp = featurestore.get_features(
    features,
    featuregroups_version_dict = {
        "teams_features" : 1
    }
)
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [113]:
# featurestore.sql() with a join between teams_features_1 and games_features_1
# on the home team id; the three string pieces form a single query.
tmp = featurestore.sql(
    "SELECT team_budget, score " \
    "FROM teams_features_1 JOIN games_features_1 ON " \
    "games_features_1.home_team_id = teams_features_1.team_id")
features = ['team_budget', 'score']
assert set(features) == set(tmp.columns)
# Row count expected from the sample data for this join.
assert tmp.count() == 49
assert len(tmp.columns) == len(features)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [114]:
# featurestore.sql() with a WHERE filter: all three team columns must be
# present and every returned team_position must satisfy the filter.
tmp = featurestore.sql("SELECT * FROM teams_features_1 WHERE team_position < 5")
assert len(tmp.columns) == 3
for expected_column in ("team_budget", "team_id", "team_position"):
    assert expected_column in tmp.columns
assert all(position < 5 for position in tmp.toPandas()["team_position"].values)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [115]:
# Same filtered query, with the feature store and dataframe type passed explicitly.
tmp = featurestore.sql(
    "SELECT * FROM teams_features_1 WHERE team_position < 5",
    featurestore=featurestore.project_featurestore(),
    dataframe_type="spark",
)
assert len(tmp.columns) == 3
for expected_column in ("team_budget", "team_id", "team_position"):
    assert expected_column in tmp.columns
assert all(position < 5 for position in tmp.toPandas()["team_position"].values)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


#####  Test Insert Operations in Existing Feature Groups, `featurestore.insert_into_featuregroup()`

In [116]:
# Build a three-row Spark dataframe with the Spanish column names; the insert
# tests below write it into "teams_features_spanish".
sqlContext = SQLContext(spark.sparkContext)
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("equipo_id", IntegerType()),
        ("equipo_presupuesto", FloatType()),
        ("equipo_posicion", IntegerType()),
    )
])
sample_rows = [(999, 41251.52, 1), (998, 1319.4, 8), (997, 21219.1, 2)]
sample_df = sqlContext.createDataFrame(sample_rows, schema)
insert_count = sample_df.count()
assert insert_count == 3

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [117]:
# Baseline before any insert: "teams_features_spanish" must hold 50 rows.
spanish_team_features_df = featurestore.get_featuregroup(
    "teams_features_spanish")
pre_insert_count = spanish_team_features_df.count()
assert pre_insert_count == 50

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [118]:
# Insert the three sample rows without naming a mode, then re-read the group.
# NOTE(review): presumably the default mode appends -- the expected count below
# (50 + 3) relies on it; confirm.
featurestore.insert_into_featuregroup(
    sample_df, 
    "teams_features_spanish", 
    descriptive_statistics=False, 
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)
spanish_team_features_df_updated = featurestore.get_featuregroup(
    "teams_features_spanish")

after_insert_count = spanish_team_features_df_updated.count()
assert after_insert_count == 53

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [119]:
# Insert the three sample rows again, this time with mode="append" and every
# other argument spelled out.
featurestore.insert_into_featuregroup(
    sample_df, 
    "teams_features_spanish", 
    featurestore=featurestore.project_featurestore(), 
    featuregroup_version=1, 
    mode="append",
    descriptive_statistics=False, 
    feature_correlation=False, 
    feature_histograms=False,
    cluster_analysis=False, 
    stat_columns=None, 
    num_bins=20, 
    corr_method='pearson',
    num_clusters=5
)

# 53 rows after the previous insert plus three more.
after_insert_count2 = featurestore.get_featuregroup("teams_features_spanish").count()
assert after_insert_count2 == 56

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [120]:
# Insert with mode="overwrite": afterwards only the three sample rows must remain.
featurestore.insert_into_featuregroup(
    sample_df, 
    "teams_features_spanish",
    descriptive_statistics=False, 
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    mode="overwrite")

count_after_overwrite = featurestore.get_featuregroup("teams_features_spanish").count()
assert count_after_overwrite == 3

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test integration of feature store with Numpy, Pandas and plain Python

In [121]:
# Request the two features as a pandas dataframe and check column names, row
# count, column count and the returned type.
pandas_df = featurestore.get_features(
    ["team_budget", "average_attendance"],
    dataframe_type="pandas",
)
for expected_column in ("team_budget", "average_attendance"):
    assert expected_column in pandas_df.columns.values
assert len(pandas_df) == 50
assert len(pandas_df.columns.values) == 2
assert isinstance(pandas_df, pd.DataFrame)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [122]:
# Request the two features as a numpy array: expects shape (50, 2) and an np.ndarray.
numpy_df = featurestore.get_features(["team_budget", "average_attendance"], 
                                      dataframe_type="numpy")
assert numpy_df.shape[0] == 50
assert numpy_df.shape[1] == 2
assert isinstance(numpy_df, np.ndarray)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [123]:
# Request the two features with dataframe_type="python": expects a list of length 50.
python_df = featurestore.get_features(["team_budget", "average_attendance"], 
                                      dataframe_type="python")
assert len(python_df) == 50
assert isinstance(python_df, list)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [124]:
# Request the two features as a Spark dataframe: expects 50 rows and a pyspark DataFrame.
spark_df = featurestore.get_features(["team_budget", "average_attendance"], 
                                      dataframe_type="spark")
assert spark_df.count() == 50
assert isinstance(spark_df, DataFrame)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [125]:
# Let's rename the columns to differentiate this feature group from existing ones in the feature store
# This reassigns the column labels of the pandas dataframe obtained above, in place.
pandas_df.columns = ["team_budget_test", "average_attendance_test"]

# Create a feature group directly from the pandas dataframe, statistics switched off.
featurestore.create_featuregroup(
    pandas_df,
    "pandas_test_example",
    description="test featuregroup created from pandas dataframe",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)
assert "pandas_test_example_1" in featurestore.get_featuregroups()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [126]:
# Overwrite "pandas_test_example" with the same pandas dataframe it was created
# from; the row count must be the same before and after.
count_pre_pandas_insert_overwrite = featurestore.get_featuregroup("pandas_test_example").count()
featurestore.insert_into_featuregroup(
    pandas_df, 
    "pandas_test_example",
    descriptive_statistics=False, 
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    mode="overwrite")
count_after_pandas_insert_overwrite = featurestore.get_featuregroup("pandas_test_example").count()
assert count_pre_pandas_insert_overwrite == count_after_pandas_insert_overwrite

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [127]:
# Create a feature group directly from the numpy array, statistics switched off.
featurestore.create_featuregroup(
    numpy_df,
    "numpy_test_example",
    description="test featuregroup created from numpy matrix",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)
assert "numpy_test_example_1" in featurestore.get_featuregroups()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [128]:
# Overwrite "numpy_test_example" with the same numpy array it was created
# from; the row count must be the same before and after.
numpy_test_df_count_pre_insert_overwrite = featurestore.get_featuregroup("numpy_test_example", dataframe_type="spark").count()
featurestore.insert_into_featuregroup(
    numpy_df, 
    "numpy_test_example",
    descriptive_statistics=False, 
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    mode="overwrite")
numpy_test_df_count_after_insert_overwrite = featurestore.get_featuregroup("numpy_test_example", dataframe_type="spark").count()
# Compare the count taken before the overwrite with the one taken after it
# (comparing the "pre" value with itself could never fail).
assert numpy_test_df_count_pre_insert_overwrite == numpy_test_df_count_after_insert_overwrite

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [129]:
# Create a feature group directly from the Python list, statistics switched off.
featurestore.create_featuregroup(
    python_df,
    "python_test_example",
    description="test featuregroup created from python 2D list",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)

# Row count before the overwrite; the next cell compares against it.
python_test_df_count_pre_insert_overwrite = featurestore.get_featuregroup("python_test_example", dataframe_type="spark").count()
assert "python_test_example_1" in featurestore.get_featuregroups()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [130]:
# Overwrite "python_test_example" with the same Python list it was created
# from; the row count must equal the one recorded in the previous cell.
featurestore.insert_into_featuregroup(
    python_df, 
    "python_test_example",
    descriptive_statistics=False, 
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    mode="overwrite")

python_test_df_count_after_insert_overwrite = featurestore.get_featuregroup("python_test_example", dataframe_type="spark").count()
assert python_test_df_count_pre_insert_overwrite == python_test_df_count_after_insert_overwrite

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test update Feature Store Statistics `featurestore.update_featuregroup_stats()`

In [131]:
# Update the statistics of "teams_features" with default arguments; only
# checks that the call completes.
featurestore.update_featuregroup_stats("teams_features")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [132]:
# Same update with version, feature store and all four statistics switches
# passed explicitly; only checks that the call completes.
featurestore.update_featuregroup_stats(
    "teams_features", 
    featuregroup_version=1, 
    featurestore=featurestore.project_featurestore(), 
    descriptive_statistics=True,
    feature_correlation=True, 
    feature_histograms=True,
    cluster_analysis=True,
    stat_columns=None)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Write Training Dataset Operations 

- `featurestore.get_latest_training_dataset_version()`
- `featurestore.create_training_dataset()`

In [133]:
# Collect the three features that the training datasets below are created from.
features_df = featurestore.get_features(
    ["team_budget", "average_attendance",
    "team_position"]
)
# Looked up before the dataset is created in the next cell; the value is only stored here.
latest_version = featurestore.get_latest_training_dataset_version("team_position_prediction")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [134]:
# Create version 1 of "team_position_prediction" without passing data_format
# (so the library default is used); statistics are disabled.
featurestore.create_training_dataset(
    features_df, "team_position_prediction",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    training_dataset_version = 1
)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [135]:
# Create version 1 of the training dataset in CSV format in the project feature store,
# with every statistics computation disabled.
featurestore.create_training_dataset(
    features_df, "team_position_prediction_csv",
    description="a dataset with features for football teams, used for training a model to predict league-position",
    featurestore=featurestore.project_featurestore(),
    data_format="csv",
    training_dataset_version= 1,
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    stat_columns=None)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [136]:
# Create version 1 of the training dataset in TSV format in the project feature store,
# with every statistics computation disabled.
featurestore.create_training_dataset(
    features_df, "team_position_prediction_tsv",
    description="a dataset with features for football teams, used for training a model to predict league-position",
    featurestore=featurestore.project_featurestore(),
    data_format="tsv",
    training_dataset_version=1,
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    stat_columns=None)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [137]:
# Create version 1 of the training dataset in Parquet format in the project feature store,
# with every statistics computation disabled.
featurestore.create_training_dataset(
    features_df, "team_position_prediction_parquet",
    description="a dataset with features for football teams, used for training a model to predict league-position",
    featurestore=featurestore.project_featurestore(),
    data_format="parquet",
    training_dataset_version=1,
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    stat_columns=None)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [138]:
# Create version 1 of the training dataset in ORC format in the project feature store,
# with every statistics computation disabled.
featurestore.create_training_dataset(
    features_df, "team_position_prediction_orc",
    description="a dataset with features for football teams, used for training a model to predict league-position",
    featurestore=featurestore.project_featurestore(),
    data_format="orc",
    training_dataset_version=1,
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    stat_columns=None)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [139]:
# Create version 1 of the training dataset in Avro format in the project feature store,
# with every statistics computation disabled.
featurestore.create_training_dataset(
    features_df, "team_position_prediction_avro",
    description="a dataset with features for football teams, used for training a model to predict league-position",
    featurestore=featurestore.project_featurestore(),
    data_format="avro",
    training_dataset_version=1,
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    stat_columns=None)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [140]:
# Create version 1 of the training dataset in HDF5 format in the project feature store,
# with every statistics computation disabled.
featurestore.create_training_dataset(
    features_df, "team_position_prediction_hdf5",
    description="a dataset with features for football teams, used for training a model to predict league-position",
    featurestore=featurestore.project_featurestore(),
    data_format="hdf5",
    training_dataset_version=1,
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    stat_columns=None)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [141]:
# Create version 1 of the training dataset in NumPy (.npy) format in the project feature store,
# with every statistics computation disabled.
featurestore.create_training_dataset(
    features_df, "team_position_prediction_npy",
    description="a dataset with features for football teams, used for training a model to predict league-position",
    featurestore=featurestore.project_featurestore(),
    data_format="npy",
    training_dataset_version=1,
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    stat_columns=None)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [142]:
# Petastorm is only supported in python 3
if sys.version_info[0] >= 3:
    # Petastorm needs an explicit schema: two float32 scalar fields and one int32 scalar field,
    # matching the three columns selected into features_df.
    PetastormSchema = Unischema('team_position_prediction_petastorm_schema', [
        UnischemaField('team_budget', np.float32, (), ScalarCodec(FloatType()), False),
        UnischemaField('average_attendance', np.float32, (), ScalarCodec(FloatType()), False),
        UnischemaField('team_position', np.int32, (), ScalarCodec(IntegerType()), False)
    ])

    # The schema is handed to create_training_dataset through the petastorm_args dict.
    petastorm_args = {
        "schema": PetastormSchema
    }

    # Create version 1 of the training dataset in Petastorm format, statistics disabled.
    featurestore.create_training_dataset(
        features_df, "team_position_prediction_petastorm",
        description="a dataset with features for football teams, used for training a model to predict league-position",
        featurestore=featurestore.project_featurestore(),
        data_format="petastorm",
        training_dataset_version=1,
        descriptive_statistics=False,
        feature_correlation=False,
        feature_histograms=False,
        cluster_analysis=False,
        stat_columns=None,
        petastorm_args=petastorm_args
    )

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [143]:
# Every training dataset written by the preceding cells must be listed as "<name>_<version>".
tds = featurestore.get_training_datasets()
assert 'team_position_prediction_1' in tds
for td_format in ['csv', 'tsv', 'parquet', 'orc', 'avro', 'hdf5', 'npy']:
    assert ('team_position_prediction_' + td_format + '_1') in tds
# The petastorm dataset is only created under python 3.
if sys.version_info[0] >= 3:
    assert 'team_position_prediction_petastorm_1' in tds

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Insert into an existing training dataset, `featurestore.insert_into_training_dataset()`

In [144]:
# Insert features_df into the latest version of the CSV training dataset and check
# that the row count is unchanged.
csv_td_name = "team_position_prediction_csv"
count_pre_insert = featurestore.get_training_dataset(csv_td_name).count()
csv_td_latest_version = featurestore.get_latest_training_dataset_version(csv_td_name)
featurestore.insert_into_training_dataset(
    features_df,
    csv_td_name,
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    training_dataset_version=csv_td_latest_version
)
count_after_insert = featurestore.get_training_dataset(csv_td_name).count()
# td only support overwrites, so inserting the same dataframe must not add rows
assert count_after_insert == count_pre_insert

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Training Dataset Utility Methods

- `featurestore.get_training_dataset_path()`
- `featurestore.get_training_dataset_tf_record_schema()`

In [145]:
# The training dataset path must contain the project's HDFS path.
assert hdfs.project_path() in featurestore.get_training_dataset_path("team_position_prediction_csv")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [146]:
# The path must also contain the "<project name>_Training_Datasets" component.
assert hdfs.project_name() + "_Training_Datasets" in featurestore.get_training_dataset_path("team_position_prediction_csv")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [147]:
# ... and the training dataset's own name.
assert "team_position_prediction_csv" in featurestore.get_training_dataset_path("team_position_prediction_csv")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [148]:
# The TFRecord schema stored for the training dataset must describe two scalar float32
# features and one scalar int64 feature.
tf_schema = featurestore.get_training_dataset_tf_record_schema("team_position_prediction")
expected_tf_schema = {
    'team_budget': tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'average_attendance': tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'team_position': tf.FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
}
assert tf_schema == expected_tf_schema

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [149]:
# Inferring the TFRecord schema from the dataframe read back from the training dataset
# must give the same two float32 / one int64 scalar features.
features_df = featurestore.get_training_dataset("team_position_prediction")
tf_schema = featurestore.get_dataframe_tf_record_schema(features_df)
expected_tf_schema = {
    'team_budget': tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'average_attendance': tf.FixedLenFeature(shape=[], dtype=tf.float32, default_value=None),
    'team_position': tf.FixedLenFeature(shape=[], dtype=tf.int64, default_value=None),
}
assert tf_schema == expected_tf_schema

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test update Training Dataset stats

- `featurestore.update_training_dataset_stats()`

In [150]:
# Update the statistics of "team_position_prediction", relying on the default arguments only.
featurestore.update_training_dataset_stats("team_position_prediction")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [151]:
# Same update with every optional argument spelled out: version 1, the project's
# own feature store, all four statistics enabled, no column restriction (stat_columns=None).
featurestore.update_training_dataset_stats(
    "team_position_prediction", 
    training_dataset_version=1, 
    featurestore=featurestore.project_featurestore(), 
    descriptive_statistics=True,
    feature_correlation=True, 
    feature_histograms=True,
    cluster_analysis=True,
    stat_columns=None)

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Read Training Datasets API `featurestore.get_training_dataset()`

In [152]:
# Expected column set; `cols` is reused by the read checks in the following cells.
cols = ['team_budget', 'average_attendance', 'team_position']
# Read the CSV training dataset back and check its columns and row count.
tmp = featurestore.get_training_dataset("team_position_prediction_csv")
assert set(tmp.columns) == set(cols)
# NOTE(review): 50 is presumably the number of rows in the test data - confirm.
assert tmp.count() == 50

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [153]:
# Read the HDF5 training dataset back; only the row count is checked for this format.
tmp = featurestore.get_training_dataset("team_position_prediction_hdf5")
assert tmp.count() == 50

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [154]:
# The petastorm dataset only exists under python 3 (its creation is guarded the same way).
if sys.version_info[0] >= 3:
    tmp = featurestore.get_training_dataset("team_position_prediction_petastorm")
    # `cols` comes from the CSV read cell.
    assert set(tmp.columns) == set(cols)
    assert tmp.count() == 50

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [155]:
# Read the Avro training dataset back and check columns (`cols` from the CSV read cell) and row count.
tmp = featurestore.get_training_dataset("team_position_prediction_avro")
assert set(tmp.columns) == set(cols)
assert tmp.count() == 50

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [156]:
# Read the ORC training dataset back and check columns (`cols` from the CSV read cell) and row count.
tmp = featurestore.get_training_dataset("team_position_prediction_orc")
assert set(tmp.columns) == set(cols)
assert tmp.count() == 50

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [157]:
# Read the TSV training dataset back and check columns (`cols` from the CSV read cell) and row count.
tmp = featurestore.get_training_dataset("team_position_prediction_tsv")
assert set(tmp.columns) == set(cols)
assert tmp.count() == 50

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [158]:
# Read the .npy training dataset back; only the row count is checked for this format.
tmp = featurestore.get_training_dataset("team_position_prediction_npy")
assert tmp.count() == 50

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [159]:
# Read the Parquet training dataset back and check columns (`cols` from the CSV read cell) and row count.
tmp = featurestore.get_training_dataset("team_position_prediction_parquet")
assert set(tmp.columns) == set(cols)
assert tmp.count() == 50

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Featurestore Get Statistics

- `featurestore.get_featuregroup_statistics()`
- `featurestore.get_training_dataset_statistics()`

In [160]:
# Fetch the statistics of the "teams_features" feature group and check that each of the
# four statistics kinds is present and populated.
stats = featurestore.get_featuregroup_statistics("teams_features")

# Cluster analysis: one cluster assignment per datapoint.
assert stats.cluster_analysis is not None
assert stats.cluster_analysis.clusters is not None
assert stats.cluster_analysis.datapoints is not None
assert len(stats.cluster_analysis.datapoints) == len(stats.cluster_analysis.clusters)
assert stats.cluster_analysis.clusters[0].datapoint_name is not None
assert stats.cluster_analysis.clusters[0].cluster is not None

# Correlation matrix: non-empty, below the column limit, and square.
assert stats.correlation_matrix is not None
assert stats.correlation_matrix.feature_correlations is not None
assert 0 < len(stats.correlation_matrix.feature_correlations)
assert constants.FEATURE_STORE.MAX_CORRELATION_MATRIX_COLUMNS > len(stats.correlation_matrix.feature_correlations)
assert stats.correlation_matrix.feature_correlations[0].feature_name is not None
assert stats.correlation_matrix.feature_correlations[0].correlation_values is not None
assert (len(stats.correlation_matrix.feature_correlations[0].correlation_values)
        == len(stats.correlation_matrix.feature_correlations))

# Descriptive statistics: at least one feature with at least one named metric value.
assert stats.descriptive_stats is not None
assert stats.descriptive_stats.descriptive_stats is not None
assert 0 < len(stats.descriptive_stats.descriptive_stats)
assert stats.descriptive_stats.descriptive_stats[0].feature_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values is not None
assert 0 < len(stats.descriptive_stats.descriptive_stats[0].metric_values)
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].metric_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].value is not None

# Feature histograms: at least one feature with at least one (bin, frequency) entry.
assert stats.feature_histograms is not None
assert stats.feature_histograms.feature_distributions is not None
assert 0 < len(stats.feature_histograms.feature_distributions)
assert stats.feature_histograms.feature_distributions[0].feature_name is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution is not None
assert 0 < len(stats.feature_histograms.feature_distributions[0].frequency_distribution)
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].bin is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].frequency is not None

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [161]:
# Fetch the statistics of the "team_position_prediction" training dataset and check that
# each of the four statistics kinds is present and populated.
stats = featurestore.get_training_dataset_statistics("team_position_prediction")

# Cluster analysis: one cluster assignment per datapoint.
assert stats.cluster_analysis is not None
assert stats.cluster_analysis.clusters is not None
assert stats.cluster_analysis.datapoints is not None
assert len(stats.cluster_analysis.datapoints) == len(stats.cluster_analysis.clusters)
assert stats.cluster_analysis.clusters[0].datapoint_name is not None
assert stats.cluster_analysis.clusters[0].cluster is not None

# Correlation matrix: non-empty, below the column limit, and square.
assert stats.correlation_matrix is not None
assert stats.correlation_matrix.feature_correlations is not None
assert 0 < len(stats.correlation_matrix.feature_correlations)
assert constants.FEATURE_STORE.MAX_CORRELATION_MATRIX_COLUMNS > len(stats.correlation_matrix.feature_correlations)
assert stats.correlation_matrix.feature_correlations[0].feature_name is not None
assert stats.correlation_matrix.feature_correlations[0].correlation_values is not None
assert (len(stats.correlation_matrix.feature_correlations[0].correlation_values)
        == len(stats.correlation_matrix.feature_correlations))

# Descriptive statistics: at least one feature with at least one named metric value.
assert stats.descriptive_stats is not None
assert stats.descriptive_stats.descriptive_stats is not None
assert 0 < len(stats.descriptive_stats.descriptive_stats)
assert stats.descriptive_stats.descriptive_stats[0].feature_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values is not None
assert 0 < len(stats.descriptive_stats.descriptive_stats[0].metric_values)
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].metric_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].value is not None

# Feature histograms: at least one feature with at least one (bin, frequency) entry.
assert stats.feature_histograms is not None
assert stats.feature_histograms.feature_distributions is not None
assert 0 < len(stats.feature_histograms.feature_distributions)
assert stats.feature_histograms.feature_distributions[0].feature_name is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution is not None
assert 0 < len(stats.feature_histograms.feature_distributions[0].frequency_distribution)
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].bin is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].frequency is not None

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Test Featurestore Visualizations

- `featurestore.visualize_featuregroup_distributions()`
- `featurestore.visualize_featuregroup_correlations()`
- `featurestore.visualize_featuregroup_clusters()`
- `featurestore.visualize_featuregroup_descriptive_stats()`
- `featurestore.visualize_training_dataset_distributions()`
- `featurestore.visualize_training_dataset_correlations()`
- `featurestore.visualize_training_dataset_clusters()`
- `featurestore.visualize_training_dataset_descriptive_stats()`

In [162]:
# Get the feature-distribution figure for "teams_features" (plot=False) and save it as a PNG.
fig = featurestore.visualize_featuregroup_distributions("teams_features", plot=False)
fig.savefig("teams_features_distributions.png")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [163]:
# Get the feature-correlation figure for "teams_features" (plot=False) and save it as a PNG.
fig = featurestore.visualize_featuregroup_correlations("teams_features", plot=False)
fig.savefig("teams_features_correlations.png")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [164]:
# Get the cluster-analysis figure for "teams_features" (plot=False) and save it as a PNG.
fig = featurestore.visualize_featuregroup_clusters("teams_features", plot=False)
fig.savefig("teams_features_clusters.png")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [165]:
# Fetch the descriptive statistics of "teams_features" and show the first rows
# (the result is used like a dataframe: .head() is called on it).
desc_stats_df = featurestore.visualize_featuregroup_descriptive_stats("teams_features")
desc_stats_df.head()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [166]:
# Get the feature-distribution figure for "team_position_prediction" (plot=False) and save it as a PNG.
fig = featurestore.visualize_training_dataset_distributions("team_position_prediction", plot=False)
fig.savefig("team_position_prediction_distributions.png")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [167]:
# Get the feature-correlation figure for "team_position_prediction" (plot=False) and save it as a PNG.
fig = featurestore.visualize_training_dataset_correlations("team_position_prediction", plot=False)
fig.savefig("team_position_prediction_correlations.png")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [168]:
# Get the cluster-analysis figure for "team_position_prediction" (plot=False) and save it as a PNG.
fig = featurestore.visualize_training_dataset_clusters("team_position_prediction", plot=False)
fig.savefig("team_position_prediction_clusters.png")

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [169]:
# Fetch the descriptive statistics of "team_position_prediction" and show the first rows
# (the result is used like a dataframe: .head() is called on it).
desc_stats_df = featurestore.visualize_training_dataset_descriptive_stats("team_position_prediction")
desc_stats_df.head()

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


##### Cleanup (Delete FS Contents so that next test run works the same)

In [170]:
# Delete feature groups
# Switch to the project's feature store database, then drop every listed feature group table.
spark.sql('use ' + featurestore.project_featurestore())
for fg in featurestore.get_featuregroups():
    try:
        spark.sql("drop table " + fg)
    except Exception as e:
        # Cleanup stays best-effort (one failing table must not stop the rest), but the
        # failure is reported instead of being silently swallowed by a bare `except`.
        print("Could not drop feature group table {}: {}".format(fg, e))

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [171]:
# Delete training datasets
# Training datasets are removed from the "<project name>_Training_Datasets/" directory.
td_dir = hdfs.project_name() + "_Training_Datasets/"
for td in featurestore.get_training_datasets():
    try:
        hdfs.rmr(td_dir + td)
    except Exception as e:
        # Cleanup stays best-effort (one failing delete must not stop the rest), but the
        # failure is reported instead of being silently swallowed by a bare `except`.
        print("Could not delete training dataset {}: {}".format(td_dir + td, e))

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [172]:
# Refresh the cached feature store metadata so the listing below reflects the deletions.
featurestore.get_featurestore_metadata(update_cache=True)
# on demand feature group will still be there.. maybe add delete endpoint in the python SDK?
#assert featurestore.get_featuregroups() == [] 
# After cleanup no training dataset may be left.
assert featurestore.get_training_datasets() == []

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


## Kafka Tests

##### Test default config 

- `kafka.get_kafka_default_config()`
- `kafka.get_security_protocol()`
- `kafka.get_broker_endpoints_list()`

In [173]:
# The default Kafka config must carry the broker list, the security protocol
# and the three SSL file locations.
config = kafka.get_kafka_default_config()
for required_key in ("bootstrap.servers",
                     "security.protocol",
                     "ssl.ca.location",
                     "ssl.key.location",
                     "ssl.certificate.location"):
    assert required_key in config

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


In [174]:
# Both the security protocol and the broker endpoint list must be non-empty.
assert len(kafka.get_security_protocol()) > 0
assert len(kafka.get_broker_endpoints_list()) > 0

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


## TLS Tests

##### Test access to TLS tokens

- `tls.get_key_store()`
- `tls.get_trust_store()`
- `tls.get_key_store_pwd()`
- `tls.get_trust_store_pwd()`
- `tls.get_client_certificate_location()`
- `tls.get_client_key_location()`
- `tls.get_ca_chain_location()`

In [175]:
# Every TLS accessor must return a non-empty value.
tls_accessors = [
    tls.get_key_store,
    tls.get_trust_store,
    tls.get_key_store_pwd,
    tls.get_trust_store_pwd,
    tls.get_client_certificate_location,
    tls.get_client_key_location,
    tls.get_ca_chain_location,
]
for tls_accessor in tls_accessors:
    assert len(tls_accessor()) > 0

An error was encountered:
Invalid status code '404' from http://10.0.2.15:8998/sessions/16 with error payload: {"msg":"Session '16' not found."}


## Serving Tests

These tests require that you have the following files in the Resources directory:

- `iris_model.knn`
- `iris_flower_classifier.py`
- `mnist`

Where mnist is a directory containing a tensorflow model.

These files can be downloaded from here: `http://snurran.sics.se/hops/hops-util-py_test/`

##### Test Export Model HDFS

In [37]:
# Export from a project-relative HDFS path with explicit versions 1-5, overwriting any
# existing version. The accuracy metrics differ per version: the get_best_model cells
# further down expect version 5 (accuracy 30) for Metric.MAX and version 3 (0.5) for Metric.MIN.
model_path_relative = "Resources"

model.export(model_path_relative, "IrisFlowerClassifier", model_version=1, overwrite=True)
model.export(model_path_relative, "IrisFlowerClassifier", model_version=2, overwrite=True, metrics={'accuracy': 21.5, 'loss': 31.3})
model.export(model_path_relative, "IrisFlowerClassifier", model_version=3, overwrite=True, metrics={'accuracy': 0.5})
model.export(model_path_relative, "IrisFlowerClassifier", model_version=4, overwrite=True, metrics={'accuracy': 9})
model.export(model_path_relative, "IrisFlowerClassifier", model_version=5, overwrite=True, metrics={'accuracy': 30})

# Export the same directory through its absolute project path. No model_version is passed,
# so the version number is chosen by model.export (the recorded output shows 11, 12, ...).
model_path_abs = hdfs.project_path() + "Resources"

model.export(model_path_abs, "IrisFlowerClassifier_abs", metrics={'accuracy': 10.9})
model.export(model_path_abs, "IrisFlowerClassifier_abs", metrics={'accuracy': 21.5, 'loss': 31.3})
model.export(model_path_abs, "IrisFlowerClassifier_abs", metrics={'accuracy': 0.5})
model.export(model_path_abs, "IrisFlowerClassifier_abs")
model.export(model_path_abs, "IrisFlowerClassifier_abs", metrics={'accuracy': 30})

Exported model IrisFlowerClassifier as version 1 successfully.
Polling IrisFlowerClassifier version 1 for model availability.
Model now available.
Exported model IrisFlowerClassifier as version 2 successfully.
Polling IrisFlowerClassifier version 2 for model availability.
Model now available.
Exported model IrisFlowerClassifier as version 3 successfully.
Polling IrisFlowerClassifier version 3 for model availability.
Model now available.
Exported model IrisFlowerClassifier as version 4 successfully.
Polling IrisFlowerClassifier version 4 for model availability.
Model now available.
Exported model IrisFlowerClassifier as version 5 successfully.
Polling IrisFlowerClassifier version 5 for model availability.
Model now available.
Exported model IrisFlowerClassifier_abs as version 11 successfully.
Polling IrisFlowerClassifier_abs version 11 for model availability.
Model now available.
Exported model IrisFlowerClassifier_abs as version 12 successfully.
Polling IrisFlowerClassifier_abs version

##### Test Export Model Local

In [38]:
# Build a tiny dummy model on the local filesystem of the driver.
local_model_dir = os.getcwd() + '/model'
local_model_file = local_model_dir + '/model.pb'
if not os.path.exists(local_model_dir):
    os.mkdir(local_model_dir)
# Write the file whenever it is missing, not only when the directory was just created,
# so a leftover empty directory from an earlier run cannot break the file export below.
if not os.path.exists(local_model_file):
    # `with` closes the handle even if the write fails.
    with open(local_model_file, "w") as f:
        f.write("model")

# Export a local directory, once by absolute path and once by path relative to the cwd.
model.export(local_model_dir, 'local_model_dir')
model.export('model', 'local_model_dir')

# Export a single local file.
model.export(local_model_file, 'local_model_file')

Started copying local path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/NLucCipWZt56uigje1cP3j3_fYyvpEcQ7n_WV6vHxY8/appcache/application_1571991823790_0002/container_e03_1571991823790_0002_01_000001/model/model.pb to hdfs path hdfs://10.0.2.15:8020/Projects/fawfawdasd/Models/local_model_dir/11

Finished copying

Exported model local_model_dir as version 11 successfully.
Polling local_model_dir version 11 for model availability.
Model now available.
Started copying local path model/model.pb to hdfs path hdfs://10.0.2.15:8020/Projects/fawfawdasd/Models/local_model_dir/12

Finished copying

Exported model local_model_dir as version 12 successfully.
Polling local_model_dir version 12 for model availability.
Model now available.
Started copying local path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/NLucCipWZt56uigje1cP3j3_fYyvpEcQ7n_WV6vHxY8/appcache/application_1571991823790_0002/container_e03_1571991823790_0002_01_000001/model/model.pb to hdfs path hdfs://10.0.2.15:8020/Projects/fawfaw

In [39]:
# A non-numeric metric value must be rejected by model.export with an AssertionError.
# The failure branch lives in `else`, outside the `try`: an `assert False` inside the
# `try` would itself be caught by `except AssertionError`, so the test could never fail.
try:
    model.export(model_path_relative, "IrisFlowerClassifier", model_version=4, overwrite=True, metrics={'accuracy': "not number"})
except AssertionError:
    pass
else:
    raise AssertionError("model.export accepted a non-numeric metric value")

In [40]:
# A non-string metric key (with a string value) must be rejected by model.export with an
# AssertionError. The failure branch lives in `else`, outside the `try`: an `assert False`
# inside the `try` would itself be caught by `except AssertionError`, so the test could never fail.
try:
    model.export(model_path_abs, "IrisFlowerClassifier", model_version=4, overwrite=True, metrics={1337: "0.5"})
except AssertionError:
    pass
else:
    raise AssertionError("model.export accepted invalid metrics {1337: '0.5'}")

In [41]:
# Version 1 of the exported model must contain both the pickled model and the serving script.
assert hdfs.exists("Models/IrisFlowerClassifier/1/iris_knn.pkl")
assert hdfs.exists("Models/IrisFlowerClassifier/1/iris_flower_classifier.py")

In [42]:
# Highest accuracy among the exported IrisFlowerClassifier versions is 30, on version 5.
best_model = model.get_best_model("IrisFlowerClassifier", 'accuracy', Metric.MAX)
print(best_model)
assert best_model['name'] == "IrisFlowerClassifier"
assert best_model['version'] == 5

{'type': 'modelDTO', 'href': 'https://hopsworks0.logicalclocks.com:8181/hopsworks-api/api/project/1144/models/IrisFlowerClassifier_5', 'created': '2019-10-25T08:53:47.582', 'description': 'A collection of models for IrisFlowerClassifier', 'id': 'IrisFlowerClassifier_5', 'metrics': {'accuracy': '30'}, 'name': 'IrisFlowerClassifier', 'userFullName': 'Admin Admin', 'version': 5}

In [43]:
# Lowest accuracy among the exported IrisFlowerClassifier versions is 0.5, on version 3.
best_model = model.get_best_model("IrisFlowerClassifier", 'accuracy', Metric.MIN)
print(best_model)
assert best_model['name'] == "IrisFlowerClassifier"
assert best_model['version'] == 3

{'type': 'modelDTO', 'href': 'https://hopsworks0.logicalclocks.com:8181/hopsworks-api/api/project/1144/models/IrisFlowerClassifier_3', 'created': '2019-10-25T08:53:32.951', 'description': 'A collection of models for IrisFlowerClassifier', 'id': 'IrisFlowerClassifier_3', 'metrics': {'accuracy': '0.5'}, 'name': 'IrisFlowerClassifier', 'userFullName': 'Admin Admin', 'version': 3}

In [44]:
# Looking up the best version of a model that does not exist must raise ModelNotFound.
try:
    best_model = model.get_best_model("not_exist", 'accuracy', Metric.MIN)
except model.ModelNotFound:
    pass
else:
    assert False

In [45]:
# Asking for a metric that no version carries must raise ModelNotFound as well.
try:
    best_model = model.get_best_model("IrisFlowerClassifier", 'not_exist', Metric.MIN)
except model.ModelNotFound:
    pass
else:
    assert False

In [46]:
# Version 3 of "mnist" is expected not to exist, so get_model must raise ModelNotFound.
try:
    model.get_model("mnist", 3)
except model.ModelNotFound:
    pass
else:
    assert False

In [47]:
# Export the mnist directory from Resources as version 2 (overwriting an existing version 2)
# and check that the version directory exists under Models/.
model_path = "Resources/mnist/"
model.export(model_path, "mnist", model_version=2, overwrite=True)
assert hdfs.exists("Models/mnist/2/")

Exported model mnist as version 2 successfully.
Polling mnist version 2 for model availability.
Model now available.

##### Test Serve Model

In [48]:
# Create an SKLEARN serving for version 1 of IrisFlowerClassifier from its exported script.
script_path = "Models/IrisFlowerClassifier/1/iris_flower_classifier.py"
# Delete a serving left over from an earlier run so the creation below starts from a clean state.
# (A stray `serving.exists(...)` call whose result was discarded has been removed.)
if serving.exists("IrisFlowerClassifier"):
    serving.delete("IrisFlowerClassifier")
serving.create_or_update(script_path, "IrisFlowerClassifier", serving_type="SKLEARN",
                         model_version=1)

Creating a serving for model IrisFlowerClassifier ...
Serving for model IrisFlowerClassifier successfully created

In [49]:
# The serving created in the previous cell must now be visible.
assert serving.exists("IrisFlowerClassifier")

In [50]:
# Create a TENSORFLOW serving for version 2 of the exported mnist model, first deleting
# any serving with the same name left over from an earlier run.
model_path = "Models/mnist/2/"
if serving.exists("mnist"):
    serving.delete("mnist")
serving.create_or_update(model_path, "mnist", serving_type="TENSORFLOW", 
                                 model_version=2)

Creating a serving for model mnist ...
Serving for model mnist successfully created

In [51]:
# The serving created in the previous cell must now be visible.
assert serving.exists("mnist")

##### Test Data Access Operations on Model

In [52]:
# Metadata accessors must reflect what was passed to create_or_update for both servings.
assert serving.get_id("IrisFlowerClassifier") is not None
assert serving.get_id("mnist") is not None
assert "Models/IrisFlowerClassifier/1/iris_flower_classifier.py" in serving.get_artifact_path("IrisFlowerClassifier")
assert "Models/mnist/2/" in serving.get_artifact_path("mnist")
assert serving.get_type("IrisFlowerClassifier") == "SKLEARN"
assert serving.get_type("mnist") == "TENSORFLOW"
assert serving.get_version("IrisFlowerClassifier") == 1
assert serving.get_version("mnist") == 2
assert serving.get_kafka_topic("IrisFlowerClassifier") is not None
assert serving.get_kafka_topic("mnist") is not None
# Neither serving has been started yet, so both are expected to report "Stopped".
assert serving.get_status("IrisFlowerClassifier") == "Stopped"
assert serving.get_status("mnist") == "Stopped"

##### Test Start/Stop Serving

In [53]:
# Start both servings; the next cell checks that they report "Running".
serving.start("IrisFlowerClassifier")
serving.start("mnist")

Starting serving with name: IrisFlowerClassifier...
Serving with name: IrisFlowerClassifier successfully started
Starting serving with name: mnist...
Serving with name: mnist successfully started

In [54]:
# After start() both servings must report "Running".
assert serving.get_status("IrisFlowerClassifier") == "Running"
assert serving.get_status("mnist") == "Running"

In [55]:
# Stop both servings; the next cell checks that they report "Stopped".
serving.stop("IrisFlowerClassifier")
serving.stop("mnist")

Stopping serving with name: IrisFlowerClassifier...
Serving with name: IrisFlowerClassifier successfully stopped
Stopping serving with name: mnist...
Serving with name: mnist successfully stopped

In [56]:
# After stop() both servings must report "Stopped" again.
assert serving.get_status("IrisFlowerClassifier") == "Stopped"
assert serving.get_status("mnist") == "Stopped"

##### Test Send Inference Requests

In [57]:
# Bring both servings back up so inference requests can be sent to them.
for serving_name in ("IrisFlowerClassifier", "mnist"):
    serving.start(serving_name)

Starting serving with name: IrisFlowerClassifier...
Serving with name: IrisFlowerClassifier successfully started
Starting serving with name: mnist...
Serving with name: mnist successfully started

In [58]:
# Send 20 inference requests with random inputs to the IrisFlowerClassifier
# serving and check that every response carries a prediction.
for i in range(20):
    # One instance of four random values in [1, 8]. The comprehension uses its
    # own variable so the outer loop counter is not clobbered under Python 2.
    data = {"inputs" : [[random.uniform(1, 8) for feature_idx in range(4)]]}
    response = serving.make_inference_request("IrisFlowerClassifier", data)
    print(response)
    assert response is not None
    # Each membership test must be spelled out: `"predictions" or ... in response`
    # evaluates the non-empty string literal first and is therefore always true.
    assert "predictions" in response or "prediction" in response

{'predictions': [0]}
{'predictions': [0]}
{'predictions': [1]}
{'predictions': [0]}
{'predictions': [1]}
{'predictions': [0]}
{'predictions': [0]}
{'predictions': [1]}
{'predictions': [0]}
{'predictions': [0]}
{'predictions': [2]}
{'predictions': [0]}
{'predictions': [1]}
{'predictions': [2]}
{'predictions': [1]}
{'predictions': [0]}
{'predictions': [0]}
{'predictions': [0]}
{'predictions': [0]}
{'predictions': [0]}

In [59]:
# Send 20 inference requests to the mnist serving, each with a single
# instance of 784 random values, and check the response shape.
for request_idx in range(20):
    random_instance = np.random.rand(784).tolist()
    data = {"signature_name": 'predict_images', "instances": [random_instance]}
    response = serving.make_inference_request("mnist", data)
    print(response)
    assert response is not None
    assert "predictions" in response

{'predictions': [[0.00164127175, 5.96660321e-09, 0.421270937, 0.252333134, 9.5285659e-06, 0.319685638, 0.00119336753, 6.79556761e-05, 0.00376569643, 3.24859611e-05]]}
{'predictions': [[0.000162672572, 1.6033546e-08, 0.813967884, 0.0811143368, 1.34863186e-07, 0.0978580043, 0.000632441486, 1.0146463e-05, 0.006224351, 2.99392887e-05]]}
{'predictions': [[0.000340557075, 2.91633118e-09, 0.284717, 0.415190637, 1.27425676e-06, 0.267949641, 0.00123108109, 6.81755228e-06, 0.0304695051, 9.34484488e-05]]}
{'predictions': [[0.000192859312, 4.40479786e-09, 0.145375639, 0.404690772, 8.73659829e-08, 0.444295526, 5.80828237e-05, 1.76897502e-05, 0.00536274444, 6.5450763e-06]]}
{'predictions': [[2.72578454e-05, 7.65436781e-09, 0.104631409, 0.843327582, 5.98233146e-06, 0.0425032564, 0.000137176597, 7.24815836e-05, 0.00917834789, 0.0001165957]]}
{'predictions': [[8.40702487e-05, 2.64554512e-08, 0.0373441279, 0.18434687, 2.55491286e-05, 0.713262379, 0.000958068122, 0.000318706181, 0.0630112737, 0.000648908

##### Test Kafka Inference Log

In [60]:
# The Avro parsing used below is only available under Python 2, so the
# inference-log check is skipped entirely on Python 3.
if sys.version_info.major < 3:
    # Topic that receives the inference logs of the IrisFlowerClassifier serving.
    topic = serving.get_kafka_topic("IrisFlowerClassifier")
    topics = [topic]

    # Read the topic from the beginning.
    config = kafka.get_kafka_default_config()
    config['default.topic.config'] = {'auto.offset.reset': 'earliest'}
    consumer = Consumer(config)
    consumer.subscribe(topics)

    # Schema needed to decode the Avro-encoded log records.
    json_schema = kafka.get_schema(topic)
    avro_schema = kafka.convert_json_schema_to_avro(json_schema)

In [61]:
# Avro Python is only supported in python 2
if sys.version_info[0] < 3:
    try:
        # Poll up to 10 times and validate every inference-log record received.
        # NOTE(review): nothing here fails if no record arrives at all.
        for i in range(0, 10):
            msg = consumer.poll(timeout=1.5)
            if msg is None:
                # The poll timed out without delivering anything; try again.
                continue
            error = msg.error()
            if error is not None:
                # poll() also delivers error events. End-of-partition is benign;
                # anything else fails the test explicitly instead of being
                # handed to the Avro parser as if it were a record.
                if error.code() == KafkaError._PARTITION_EOF:
                    continue
                raise RuntimeError("Kafka consumer error: {}".format(error))
            value = msg.value()
            event_dict = kafka.parse_avro_msg(value, avro_schema)
            assert "modelName" in event_dict
            assert "requestTimestamp" in event_dict
            assert "servingType" in event_dict
            assert "inferenceResponse" in event_dict
            assert event_dict["modelName"] == "IrisFlowerClassifier"
            assert event_dict["servingType"] == "SKLEARN"
    finally:
        # Release the consumer once the check is done, whether it passed or failed.
        consumer.close()

In [62]:
# The Avro parsing used below is only available under Python 2, so the
# inference-log check is skipped entirely on Python 3.
if sys.version_info.major < 3:
    # Topic that receives the inference logs of the mnist serving.
    topic = serving.get_kafka_topic("mnist")
    topics = [topic]

    # Read the topic from the beginning.
    config = kafka.get_kafka_default_config()
    config['default.topic.config'] = {'auto.offset.reset': 'earliest'}
    consumer = Consumer(config)
    consumer.subscribe(topics)

    # Schema needed to decode the Avro-encoded log records.
    json_schema = kafka.get_schema(topic)
    avro_schema = kafka.convert_json_schema_to_avro(json_schema)

In [63]:
# Avro Python is only supported in python 2
if sys.version_info[0] < 3:
    try:
        # Poll up to 10 times and validate every inference-log record received.
        # NOTE(review): nothing here fails if no record arrives at all.
        for i in range(0, 10):
            msg = consumer.poll(timeout=1.5)
            if msg is None:
                # The poll timed out without delivering anything; try again.
                continue
            error = msg.error()
            if error is not None:
                # poll() also delivers error events. End-of-partition is benign;
                # anything else fails the test explicitly instead of being
                # handed to the Avro parser as if it were a record.
                if error.code() == KafkaError._PARTITION_EOF:
                    continue
                raise RuntimeError("Kafka consumer error: {}".format(error))
            value = msg.value()
            event_dict = kafka.parse_avro_msg(value, avro_schema)
            assert "modelName" in event_dict
            assert "requestTimestamp" in event_dict
            assert "servingType" in event_dict
            assert "inferenceResponse" in event_dict
            assert event_dict["modelName"] == "mnist"
            assert event_dict["servingType"] == "TENSORFLOW"
    finally:
        # Release the consumer once the check is done, whether it passed or failed.
        consumer.close()

##### Test Delete Serving

In [64]:
# Remove both servings, IrisFlowerClassifier first.
for serving_name in ("IrisFlowerClassifier", "mnist"):
    serving.delete(serving_name)

Deleting serving with name: IrisFlowerClassifier...
Serving with name: IrisFlowerClassifier successfully deleted
Deleting serving with name: mnist...
Serving with name: mnist successfully deleted

In [65]:
# Neither serving may be visible through the API after deletion.
for serving_name in ("IrisFlowerClassifier", "mnist"):
    assert not serving.exists(serving_name)

## Pandas and NumPy Helpers

In [None]:
from hops import pandas_helper as pandas
import pandas as pd

# Small fixture frame that is written out in every format the helper supports.
data = {'Name':['Tom', 'nick', 'krish', 'jack'], 'Age':[20, 21, 19, 18]}
pandas_df = pd.DataFrame(data)

pandas.write_csv("Resources/team-pandas.csv", pandas_df)
pandas.write_parquet("Resources/team-pandas.parquet", pandas_df)
pandas.write_json("Resources/team-pandas.json", pandas_df)

# Read the files back and check that no rows were lost on the round trip,
# instead of computing a count and discarding it.
test_df = pandas.read_csv("Resources/team-pandas.csv")
assert len(test_df) == len(pandas_df)

test_df = pandas.read_json("Resources/team-pandas.json")
assert len(test_df) == len(pandas_df)

# Last expression of the cell: shown as the cell output in a notebook.
test_df.count()

# NOTE(review): the parquet read-back is disabled; the reason is not recorded here.
#test_df = pandas.read_parquet("Resources/team-pandas.parquet")
#test_df.count()

In [None]:
from hops import numpy_helper as numpy
import numpy as np

# Two small fixture arrays; both are stored together in each .npz archive.
numpy_df = np.array([1, 2, 3])
x = np.arange(10)

numpy.save("Resources/numpy-path.npy", numpy_df)
numpy.savez("Resources/numpy-path.npz", numpy_df, x)
# The archive named "compressed" goes through savez_compressed; writing it
# with savez would store the same uncompressed data as the file above.
numpy.savez_compressed("Resources/numpy-compressed.npz", numpy_df, x)

# Both archives were written with two positional arrays, so each must
# expose exactly two entries when loaded back.
npzfile = numpy.load("Resources/numpy-path.npz")
assert len(npzfile.files) == 2

compressed = numpy.load("Resources/numpy-compressed.npz")
assert len(compressed.files) == 2

# Last expression of the cell: shown as the cell output in a notebook.
compressed.files