# `hops-util-py` Integration Tests

This notebook can be converted to a Python file and submitted as a Spark job to run the integration tests.

## Imports

In [170]:
from hops import experiment, hdfs, tensorboard, devices, kafka, featurestore, tls, util, serving, constants
import stat
import os
import shutil
from pyspark.sql import SQLContext
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, LongType, IntegerType, FloatType
import pandas as pd
import numpy as np
import datetime
import time
import json
from pyspark.sql import DataFrame
from petastorm.unischema import dict_to_spark_row, Unischema, UnischemaField
from petastorm.codecs import ScalarCodec, CompressedImageCodec, NdarrayCodec
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType
from pyspark.sql import SparkSession
import tensorflow as tf
import sys
import random
from confluent_kafka import Producer, Consumer, KafkaError

## Experiment API Tests

In [2]:
def exp_asserts():
    """Sanity-check the runtime environment from inside an experiment function.

    Every experiment function in this notebook calls this first. The hops
    modules and ``os`` are imported inside the body instead of relying on the
    notebook-level imports (presumably so the function is self-contained when
    it is shipped to an executor -- TODO confirm).

    Raises:
        AssertionError: if the TensorBoard logdir is unset, the GPU count is
            negative, the two project-path lookups disagree, or the logdir is
            not located where ``tensorboard.local_logdir_bool`` says it is.
    """
    from hops import tensorboard
    from hops import devices
    from hops import hdfs
    import os
    logdir = tensorboard.logdir()
    # Identity comparison is the idiomatic None check (PEP 8).
    assert logdir is not None
    assert devices.get_num_gpus() >= 0
    assert hdfs.project_path() == hdfs.project_path(hdfs.project_name())
    if tensorboard.local_logdir_bool:
        # Local logdir: a plain filesystem path that exists on this machine.
        assert "hdfs://" not in logdir
        assert os.path.exists(logdir)
    else:
        # Remote logdir: an hdfs:// URI that exists in HopsFS.
        assert "hdfs://" in logdir
        assert hdfs.exists(logdir)

In [3]:
def no_ret():
    """Experiment function that runs the environment checks and returns nothing."""
    exp_asserts()

In [4]:
def no_ret_params(a, b):
    """Experiment function that accepts hyperparameters ``a`` and ``b``, ignores them and returns nothing."""
    exp_asserts()

In [5]:
def single_ret_no_name():
    """Experiment function that returns a bare value (the constant 10) rather than a dict."""
    exp_asserts()
    return 10

In [167]:
def single_ret_no_name_params(a, b):
    """Experiment function that returns the bare value ``a + b`` rather than a dict."""
    exp_asserts()
    return a+b

In [7]:
def single_ret_path():
    """Experiment function that writes a local file and returns its path.

    Returns:
        dict: ``{'logfile': 'testfile.txt'}``, naming the file written here.
    """
    exp_asserts()
    # The context manager closes the handle even if the write raises.
    with open('testfile.txt', 'w') as f:
        f.write('stuff happened')
    return {'logfile': 'testfile.txt'}

In [8]:
def single_ret_path_params(a, b):
    """Experiment function that writes a local file and returns its path.

    Args:
        a: hyperparameter; accepted but not used.
        b: hyperparameter; accepted but not used.

    Returns:
        dict: ``{'logfile': 'testfile.txt'}``, naming the file written here.
    """
    exp_asserts()
    # The context manager closes the handle even if the write raises.
    with open('testfile.txt', 'w') as f:
        f.write('stuff happened')
    return {'logfile': 'testfile.txt'}

In [9]:
def single_ret_val():
    """Experiment function that returns a dict with the single key ``'value'`` set to 10."""
    exp_asserts()
    return {'value': 10}

In [10]:
def single_ret_val_params(a, b):
    """Experiment function that returns a dict with the single key ``'value'`` set to ``a + b``."""
    exp_asserts()
    return {'value': a+b}

In [11]:
def multi_ret():
    """Experiment function that returns several entries, one of them a file path.

    Returns:
        dict: two numeric entries (``'value'``, ``'morevals'``) and
        ``'logfile'``, naming the file written here.
    """
    exp_asserts()
    # The context manager closes the handle even if the write raises.
    with open('testfile.txt', 'w') as f:
        f.write('stuff happened')
    return {'value': 10, 'morevals': 0.5, 'logfile': 'testfile.txt'}

In [12]:
def multi_ret_params(a, b):
    """Experiment function that returns several entries derived from its hyperparameters.

    Args:
        a: hyperparameter; added to ``b`` to form ``'value'``.
        b: hyperparameter; also returned unchanged as ``'morevals'``.

    Returns:
        dict: ``'value'`` (``a + b``), ``'morevals'`` (``b``), ``'logfile'``
        (the file written here) and ``'diagram'``.
    """
    exp_asserts()
    # The context manager closes the handle even if the write raises.
    with open('testfile.txt', 'w') as f:
        f.write('stuff happened')
    # NOTE(review): 'img.png' is returned but never created in this function --
    # confirm the experiment API tolerates a path that does not exist.
    return {'value': a+b, 'morevals': b, 'logfile': 'testfile.txt', 'diagram': 'img.png'}

In [13]:
def assert_return_values(logdir, hp_dict, should_return_hp_dict, return_dict, should_return_return_dict):
    """Check the values handed back by an ``experiment.*`` call.

    Args:
        logdir: log directory of the experiment; must exist per ``hdfs.exists``.
        hp_dict: hyperparameter dict returned by the call, or None when the
            call does not return one.
        should_return_hp_dict: when truthy, ``hp_dict`` must be a dict.
        return_dict: return-value dict returned by the call.
        should_return_return_dict: when truthy, ``return_dict`` must be a dict.

    Raises:
        AssertionError: if the logdir is missing or an expected dict is not a dict.
    """
    assert hdfs.exists(logdir)

    if should_return_hp_dict:
        # isinstance (rather than an exact type match) also accepts dict subclasses.
        assert isinstance(hp_dict, dict)

    if should_return_return_dict:
        assert isinstance(return_dict, dict)

In [164]:
def assert_best_hyperparameters(return_dict, best_hyperparameters):
    """Assert that every expected hyperparameter matches the observed one.

    Both sides are converted with ``float`` before comparing, so ``'1'``,
    ``1`` and ``1.0`` are treated as equal. Keys that appear only in
    ``return_dict`` are ignored.

    Args:
        return_dict: observed mapping, indexed by each expected key.
        best_hyperparameters: expected mapping of name -> value.

    Raises:
        AssertionError: on the first value that differs.
        KeyError: if an expected key is missing from ``return_dict``.
    """
    for name, expected in best_hyperparameters.items():
        assert float(expected) == float(return_dict[name]), '{} not equal to {}'.format(expected, return_dict[name])

In [168]:
def assert_return_dict(logdir, return_dict):
    """Assert that the ``.return`` file under ``logdir`` matches ``return_dict``.

    The file at ``logdir + '/.return'`` is loaded through ``hdfs.load`` and
    parsed as JSON before the comparison.

    Args:
        logdir: experiment log directory containing the ``.return`` file.
        return_dict: the dict the experiment call returned.

    Raises:
        AssertionError: if the parsed file content differs from ``return_dict``.
    """
    persisted = json.loads(hdfs.load(logdir + '/.return'))
    assert return_dict == persisted, 'dics are not the same {} - {}'.format(return_dict, persisted)
    
    
    

##### Test `experiment.launch`

In [15]:
from hops import experiment
params = {'a': [-5, 4.9], 'b': [-8, 10.3]}

# One row per launch, run in the listed order:
# (experiment function, extra positional arguments, keyword arguments).
launch_cases = [
    (no_ret, (), {'local_logdir': False}),
    (no_ret, (), {'local_logdir': True}),
    (no_ret_params, (params,), {'local_logdir': True}),
    (no_ret_params, (params,), {'local_logdir': False}),
    (single_ret_no_name, (), {'local_logdir': False, 'name': 'some custom name'}),
    (single_ret_no_name, (), {'local_logdir': True}),
    (single_ret_no_name_params, (params,), {'local_logdir': True}),
    (single_ret_no_name_params, (params,), {'local_logdir': False}),
    (single_ret_path, (), {'local_logdir': False, 'description': 'some custom desc'}),
    (single_ret_path, (), {'local_logdir': True}),
    (single_ret_path_params, (params,), {'local_logdir': True}),
    (single_ret_path_params, (params,), {'local_logdir': False}),
    (single_ret_val, (), {'local_logdir': False, 'name': 'some custom name', 'description': 'some custom desc'}),
    (single_ret_val, (), {'local_logdir': True}),
    (single_ret_val_params, (params,), {'local_logdir': True}),
    (single_ret_val_params, (params,), {'local_logdir': False}),
    (multi_ret, (), {'local_logdir': False}),
    (multi_ret, (), {'local_logdir': True}),
    (multi_ret_params, (params,), {'local_logdir': False}),
    (multi_ret_params, (params,), {'local_logdir': True}),
]

# Every launch must hand back an existing logdir and a dict of return values.
for experiment_fn, extra_args, launch_kwargs in launch_cases:
    logdir, return_dict = experiment.launch(experiment_fn, *extra_args, **launch_kwargs)
    assert_return_values(logdir, None, False, return_dict, True)

'NoneType' object is not iterable
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/experiment.py", line 108, in launch
    logdir, return_dict = launcher._run(sc, map_fun, run_id, args_dict, local_logdir)
TypeError: 'NoneType' object is not iterable



##### Test Parallel Experiments `experiment.grid_search`

In [172]:
params={'a': [-5, 4.9], 'b': [-8, 10.3]}

# Negative cases: the call is expected to raise. The failure is raised from the
# `else` branch because an `assert False` placed inside the `try` would be
# caught by the handler itself, and the check could then never fail.
try:
    experiment.grid_search(no_ret_params, params)
except Exception:
    pass
else:
    raise AssertionError('should fail due to no return value')

try:
    experiment.grid_search(multi_ret_params, params)
except Exception:
    pass
else:
    raise AssertionError('should fail due to optimization_key not being set')

logdir, hp_dict, return_dict = experiment.grid_search(single_ret_no_name_params, params, local_logdir=True, direction='min')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.grid_search(single_ret_no_name_params, params, local_logdir=False, direction='max')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.grid_search(single_ret_val_params, params, local_logdir=True, direction='min')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.grid_search(single_ret_val_params, params, local_logdir=False, direction='max')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.grid_search(multi_ret_params, params, local_logdir=False, direction='min', optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

# Unpack into return_dict (not a differently named variable) so the assertion
# below checks the result of this call rather than the previous one.
logdir, hp_dict, return_dict = experiment.grid_search(multi_ret_params, params, local_logdir=True, direction='max', optimization_key='morevals')
assert_return_values(logdir, hp_dict, True, return_dict, True)

params={'a': [-1, 1.5], 'b': [-1.5, 1]}

# Make sure minimization works: the smallest a+b comes from the smallest a and b.
logdir, hp_dict, return_dict = experiment.grid_search(single_ret_no_name_params, params, local_logdir=True, direction='min')
assert_return_values(logdir, hp_dict, True, return_dict, True)
assert_best_hyperparameters(hp_dict, {'a': -1, 'b': -1.5})
assert_return_dict(logdir, return_dict)

# Make sure maximization works: the largest a+b comes from the largest a and b.
logdir, hp_dict, return_dict = experiment.grid_search(single_ret_no_name_params, params, local_logdir=True, direction='max')
assert_return_values(logdir, hp_dict, True, return_dict, True)
assert_best_hyperparameters(hp_dict, {'a': 1.5, 'b': 1})
assert_return_dict(logdir, return_dict)

/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_41?xattr=CREATE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_41?xattr=REPLACE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_42?xattr=CREATE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_42?xattr=REPLACE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_43?xattr=CREATE
<Response [200]>
Finished Experiment 

/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_43?xattr=REPLACE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_44?xattr=CREATE
<Response [200]>
Finished Experiment 

/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_44?xattr=REPLACE
<Response [200]>

##### Test Parallel Experiments `experiment.random_search`

In [17]:
# NOTE(review): `params` is not defined in this cell; it is whatever an earlier
# cell left in the kernel namespace.

# Negative cases: the call is expected to raise. The failure is raised from the
# `else` branch because an `assert False` placed inside the `try` would be
# caught by the handler itself, and the check could then never fail.
try:
    experiment.random_search(no_ret_params, params, samples=2)
except Exception:
    pass
else:
    raise AssertionError('should fail due to no return value')

try:
    experiment.random_search(multi_ret_params, params, samples=2)
except Exception:
    pass
else:
    raise AssertionError('should fail due to optimization_key not being set')

logdir, hp_dict, return_dict = experiment.random_search(single_ret_no_name_params, params, samples=2, local_logdir=True, direction='min')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(single_ret_no_name_params, params, samples=2, local_logdir=False, direction='max')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(single_ret_val_params, params, samples=2, local_logdir=True, direction='min')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(single_ret_val_params, params, samples=2, local_logdir=False, direction='max')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(multi_ret_params, params, samples=2, local_logdir=False, direction='max', optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.random_search(multi_ret_params, params, samples=2, local_logdir=True, direction='min', optimization_key='morevals')
assert_return_values(logdir, hp_dict, True, return_dict, True)

/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_10?xattr=CREATE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_10?xattr=REPLACE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_11?xattr=CREATE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_11?xattr=REPLACE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_12?xattr=CREATE
<Response [200]>
Finished Experiment 

/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_12?xattr=REPLACE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_13?xattr=CREATE
<Response [200]>
Finished Experiment 

/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_13?xattr=REPLACE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_14?xattr=CREATE
<Respo

##### Test Parallel Experiments `experiment.differential_evolution`

In [160]:
# NOTE(review): `params` is not defined in this cell; it is whatever an earlier
# cell left in the kernel namespace.

# Negative cases: the call is expected to raise. The failure is raised from the
# `else` branch because an `assert False` placed inside the `try` would be
# caught by the handler itself, and the check could then never fail.
try:
    experiment.differential_evolution(no_ret_params, params)
except Exception:
    pass
else:
    raise AssertionError('should fail due to no return value')

try:
    experiment.differential_evolution(multi_ret_params, params)
except Exception:
    pass
else:
    raise AssertionError('should fail due to optimization_key not being set')

logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_no_name_params, params, local_logdir=True, direction='min')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_no_name_params, params, local_logdir=False, direction='max')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_val_params, params, local_logdir=True, direction='min')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(single_ret_val_params, params, local_logdir=False, direction='max')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(multi_ret_params, params, local_logdir=False, direction='max', optimization_key='value')
assert_return_values(logdir, hp_dict, True, return_dict, True)

logdir, hp_dict, return_dict = experiment.differential_evolution(multi_ret_params, params, local_logdir=True, direction='min', optimization_key='morevals')
assert_return_values(logdir, hp_dict, True, return_dict, True)

/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_18?xattr=CREATE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_18?xattr=REPLACE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_19?xattr=CREATE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_19?xattr=REPLACE
<Response [200]>
/hopsworks-api/api/project/120/experiments/application_1565768877650_0025_20?xattr=CREATE
<Response [200]>
Generation 1 || average metric: -0.8333333333333334, best metric: -6.0, best parameter combination: ['a=-3', 'b=-3']

Generation 2 || average metric: -5.0, best metric: -9.0, best parameter combination: ['a=-3', 'b=-6']

Generation 3 || average metric: -6.333333333333333, best metric: -9.0, best parameter combination: ['a=-3', 'b=-6']

Generation 4 || average metric: -7.5, best metric: -11.0, best parameter combination: ['a=-4', 'b=-7']

Finished Experiment 


##### Test Distributed Training `experiment.collective_all_reduce`

## HopsFS Tests

##### Test HopsFS operations

- `hdfs.project_user()`
- `hdfs.project_name()`
- `hdfs.project_path()`
- `hdfs.exists()`
- `hdfs.load()`
- `hdfs.copy_to_hdfs()`
- `hdfs.copy_to_local()`
- `hdfs.ls()`
- `hdfs.lsl()`
- `hdfs.glob()`
- `hdfs.cp()`
- `hdfs.rmr()`
- `hdfs.rename()`
- `hdfs.stat()`
- `hdfs.isdir()`
- `hdfs.isfile()`
- `hdfs.add_module()`
- `hdfs.delete()`
- `hdfs.get_plain_path()`

In [19]:
# The project name is expected to appear in both the project user name and
# the project path.
project_user = hdfs.project_user()
project_name = hdfs.project_name()
assert project_name in project_user
project_path = hdfs.project_path()
assert project_name in project_path

In [20]:
# Loading an existing file must return non-empty content.
logs_README = hdfs.load("Logs/README.md")
assert len(logs_README) > 0

In [21]:
# Dump the string "test" to a new path and check that the path now exists.
hdfs.dump("test", "Logs/README_dump_test.md")
assert hdfs.exists("Logs/README_dump_test.md")

In [22]:
# Round trip: the loaded value is decoded as UTF-8 before comparing it with
# the string that was dumped.
logs_README_dumped = hdfs.load("Logs/README_dump_test.md")
assert logs_README_dumped.decode("utf-8") == "test"

In [23]:
# copy_to_hdfs file relative path

local_name = "upload.txt"
hdfs_target = "Resources/upload.txt"

# Round one is a plain upload; round two re-uploads new content with overwrite=True.
for ordinal, copy_kwargs in (("first", {}), ("second", {"overwrite": True})):
    payload = ordinal + " upload"
    with open(local_name, 'w') as f:
        f.write(payload)
    hdfs.copy_to_hdfs(local_name, "Resources", **copy_kwargs)
    assert hdfs.exists(hdfs_target)
    hdfs_copied_file = hdfs.load(hdfs_target)
    assert payload == hdfs_copied_file.decode("utf-8"), ordinal + " content does not match"

# Uploading again without overwrite is expected to raise IOError.
try:
    hdfs.copy_to_hdfs(local_name, "Resources")
    assert False
except IOError:
    pass

# Clean up the remote and the local copy.
hdfs.rmr(hdfs_target)
os.remove(local_name)

Started copying local path upload.txt to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources

Finished copying

Started copying local path upload.txt to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources/upload.txt

Finished copying

Started copying local path upload.txt to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources

In [24]:
# copy_to_hdfs file absolute path

# Absolute local path used for every upload in this cell, including the
# negative case at the end.
absolute_upload_path = os.path.join(os.getcwd(), "upload_absolute.txt")

with open('upload_absolute.txt', 'w') as f:
    f.write("first upload")
hdfs.copy_to_hdfs(absolute_upload_path, "Resources")
assert hdfs.exists("Resources/upload_absolute.txt")
hdfs_copied_file = hdfs.load("Resources/upload_absolute.txt")
assert "first upload" == hdfs_copied_file.decode("utf-8"), "first content does not match"

with open('upload_absolute.txt', 'w') as f:
    f.write("second upload")
hdfs.copy_to_hdfs(absolute_upload_path, "Resources", overwrite=True)
assert hdfs.exists("Resources/upload_absolute.txt")
hdfs_copied_file = hdfs.load("Resources/upload_absolute.txt")
assert "second upload" == hdfs_copied_file.decode("utf-8"), "second content does not match"

# Uploading again without overwrite is expected to raise IOError. The
# AssertionError raised on the line after the call is not an IOError, so it
# propagates if the call unexpectedly succeeds.
try:
    hdfs.copy_to_hdfs(absolute_upload_path, "Resources")
    assert False, "copy_to_hdfs should fail when the target exists and overwrite is not set"
except IOError:
    pass

hdfs.rmr("Resources/upload_absolute.txt")
os.remove("upload_absolute.txt")

Started copying local path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/upload_absolute.txt to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources

Finished copying

Started copying local path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/upload_absolute.txt to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources/upload_absolute.txt

Finished copying

Started copying local path upload_absolute.txt to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources

In [25]:
# copy_to_hdfs directory relative path

local_dir = "upload_dir"
local_file = 'upload_dir/upload.txt'
hdfs_dir = "Resources/upload_dir"

if not os.path.exists(local_dir):
    os.mkdir(local_dir)

# The target must not exist before the first upload.
assert not hdfs.exists(hdfs_dir)

# Round one is a plain upload; round two re-uploads new content with overwrite=True.
for ordinal, copy_kwargs in (("first", {}), ("second", {"overwrite": True})):
    with open(local_file, 'w') as f:
        f.write(ordinal + " upload")
    hdfs.copy_to_hdfs(local_dir, "Resources", **copy_kwargs)
    hdfs_copied_file = hdfs.load("Resources/upload_dir/upload.txt")
    assert hdfs.exists(hdfs_dir)
    with open(local_file, 'r') as f:
        local_copied_file = f.read()
    assert hdfs_copied_file.decode("utf-8") == local_copied_file, ordinal + " content compare failed"

# Clean up the local and the remote copy.
shutil.rmtree(local_dir)
hdfs.rmr(hdfs_dir)

Started copying local path upload_dir to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources

Finished copying

Started copying local path upload_dir to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources/upload_dir

Finished copying

In [26]:
# copy_to_hdfs directory absolute path

local_dir = "upload_dir_absolute"
local_file = 'upload_dir_absolute/upload.txt'
hdfs_dir = "Resources/upload_dir_absolute"

if not os.path.exists(local_dir):
    os.mkdir(local_dir)

# The target must not exist before the first upload.
assert not hdfs.exists(hdfs_dir)

# Round one is a plain upload; round two re-uploads new content with overwrite=True.
# The source directory is passed as an absolute path built from the working directory.
for ordinal, copy_kwargs in (("first", {}), ("second", {"overwrite": True})):
    with open(local_file, 'w') as f:
        f.write(ordinal + " upload")
    hdfs.copy_to_hdfs(os.getcwd() + "/upload_dir_absolute", "Resources", **copy_kwargs)
    hdfs_copied_file = hdfs.load("Resources/upload_dir_absolute/upload.txt")
    assert hdfs.exists(hdfs_dir)
    with open(local_file, 'r') as f:
        local_copied_file = f.read()
    assert hdfs_copied_file.decode("utf-8") == local_copied_file, ordinal + " content compare failed"

# Clean up the local and the remote copy.
shutil.rmtree(local_dir)
hdfs.rmr(hdfs_dir)

Started copying local path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/upload_dir_absolute to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources

Finished copying

Started copying local path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/upload_dir_absolute to hdfs path hdfs://10.0.2.15:8020/Projects/collect/Resources/upload_dir_absolute

Finished copying

In [27]:
#copy_to_local file
# The local file's modification time is used to tell whether a call to
# copy_to_local rewrote the local copy (mtime differs) or left it alone
# (mtime equal).
# NOTE(review): the inequality checks rely on consecutive writes producing
# different mtimes -- confirm the filesystem timestamp resolution is fine
# enough for that.

# Download first time
hdfs.dump("initial content", "Resources/somefile.txt")
hdfs.copy_to_local("Resources/somefile.txt")
hdfs_copied_file = hdfs.load("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "first content compare failed"
first_modified = os.path.getmtime("somefile.txt")

# Download second time
# Nothing changed remotely, so the local file must be left untouched.
hdfs.copy_to_local("Resources/somefile.txt")
hdfs_copied_file = hdfs.load("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "second content compare failed"
second_modified = os.path.getmtime("somefile.txt")
assert first_modified == second_modified, "modified time not matching"

# Content changing on disk
# The remote file is rewritten first, so this download must refresh the local copy.
hdfs.dump("content changed at some point", "Resources/somefile.txt")
hdfs_new_content = hdfs.load("Resources/somefile.txt")
hdfs.copy_to_local("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_new_content.decode("utf-8") == local_copied_file, "third content compare failed"
third_modified = os.path.getmtime("somefile.txt")
# NOTE(review): this assert (and the one after the overwrite download) expects
# the mtimes to DIFFER, so the failure message reads backwards.
assert not second_modified == third_modified, "modified time not matching"

# Download last time with overwrite, file should have changed on disk
hdfs.copy_to_local("Resources/somefile.txt", overwrite=True)
hdfs_copied_file = hdfs.load("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "fourth content compare failed"
fourth_modified = os.path.getmtime("somefile.txt")
assert not third_modified == fourth_modified, "modified time not matching"

# Download again to make sure overwrite did not cause problems
hdfs.copy_to_local("Resources/somefile.txt")
hdfs_copied_file = hdfs.load("Resources/somefile.txt")
with open('somefile.txt', 'r') as f:
    local_copied_file = f.read()
assert hdfs_copied_file.decode("utf-8") == local_copied_file, "fifth content compare failed"
fifth_modified = os.path.getmtime("somefile.txt")
assert fourth_modified == fifth_modified, "modified time not matching"

# Clean up the remote and the local copy.
hdfs.rmr("Resources/somefile.txt")
os.remove("somefile.txt")

Started copying hdfs://10.0.2.15:8020/Projects/collect/Resources/somefile.txt to local disk on path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/

Finished copying

File hdfs://10.0.2.15:8020/Projects/collect/Resources/somefile.txt is already localized, skipping download...
Started copying hdfs://10.0.2.15:8020/Projects/collect/Resources/somefile.txt to local disk on path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/

Finished copying

Started copying hdfs://10.0.2.15:8020/Projects/collect/Resources/somefile.txt to local disk on path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/

Finished copying

File hdfs:/

In [28]:
#copy_to_local directory
# The directory's modification time is used to tell whether a call to
# copy_to_local rewrote the local copy (mtime differs) or left it alone
# (mtime equal).

assert not os.path.exists("Resources")
hdfs.copy_to_local("Resources")
# Check existence before reading the mtime, so a failed download shows up as
# an assertion failure instead of an OSError from getmtime.
assert os.path.exists("Resources")
assert os.path.isdir("Resources")
first_modified = os.path.getmtime("Resources")

# Second download without changes: the local directory must be left untouched.
hdfs.copy_to_local("Resources")
second_modified = os.path.getmtime("Resources")
assert first_modified == second_modified

# overwrite=True must rewrite the local directory.
localized_dir = hdfs.copy_to_local("Resources", overwrite=True)
third_modified = os.path.getmtime("Resources")
assert not second_modified == third_modified
num_files_first = len(os.listdir(localized_dir))

# Add a new file, it should also be localized
hdfs.dump("a wild file appeared", "Resources/newfile.txt")
hdfs.copy_to_local("Resources")
fourth_modified = os.path.getmtime("Resources")
num_files_second = len(os.listdir(localized_dir))
assert (num_files_first + 1) == num_files_second
assert not third_modified == fourth_modified

# Clean up the remote file and the local directory.
hdfs.rmr("Resources/newfile.txt")
shutil.rmtree("Resources")

Started copying hdfs://10.0.2.15:8020/Projects/collect/Resources to local disk on path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/

Finished copying

Full directory subtree already on local disk and unchanged. Set overwrite=True to force download
Started copying hdfs://10.0.2.15:8020/Projects/collect/Resources to local disk on path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/

Finished copying

Started copying hdfs://10.0.2.15:8020/Projects/collect/Resources to local disk on path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/

Finished copying

In [29]:
# glob and lsl are only called here; their results are not inspected in this cell.
logs_files_md = hdfs.glob("Logs/*.md")
logs_path_names = hdfs.lsl("Logs/")
# Remove any Logs/test.txt so that the path is known not to exist afterwards.
if hdfs.exists("Logs/test.txt"):
    hdfs.rmr("Logs/test.txt")
assert not hdfs.exists("Logs/test.txt")

In [30]:
# Create a file, copy it into Logs/ and look for it in the directory listing.
hdfs.dump("dummy", "Resources/test.txt")
hdfs.cp("Resources/test.txt", "Logs/")
logs_files = hdfs.ls("Logs/")
assert any("test.txt" in entry for entry in logs_files)

In [31]:
# A directory created with mkdir must exist afterwards.
hdfs.mkdir("Logs/test_dir")
assert hdfs.exists("Logs/test_dir")

In [32]:
# Removing Logs/test_dir must shrink the listing of Logs/.
logs_files_prior_delete = hdfs.ls("Logs/")
hdfs.rmr("Logs/test_dir")
logs_files_after_delete = hdfs.ls("Logs/")
assert len(logs_files_prior_delete) > len(logs_files_after_delete)

In [33]:
# Precondition for the move test: the dumped file is present in Logs/.
logs_files_prior_move = hdfs.ls("Logs/")
assert any("README_dump_test.md" in entry for entry in logs_files_prior_move)

In [34]:
# After the move, the old name is gone from the listing and the new name is present.
hdfs.move("Logs/README_dump_test.md", "Logs/README_dump_test2.md")
logs_files_after_move = hdfs.ls("Logs/")
assert all("README_dump_test.md" not in entry for entry in logs_files_after_move)
assert any("README_dump_test2.md" in entry for entry in logs_files_after_move)

In [35]:
# Precondition for the rename test: the moved file is present in Logs/.
logs_files_prior_rename = hdfs.ls("Logs/")
assert any("README_dump_test2.md" in entry for entry in logs_files_prior_rename)

In [36]:
# Rename the file back: the "2" name is gone from the listing and the original name is present.
hdfs.rename("Logs/README_dump_test2.md", "Logs/README_dump_test.md")
logs_files_after_rename = hdfs.ls("Logs/")
assert all("Logs/README_dump_test2.md" not in entry for entry in logs_files_after_rename)
assert any("Logs/README_dump_test.md" in entry for entry in logs_files_after_rename)

In [37]:
# Set a mode with chmod and read it back through stat.
# NOTE(review): 775 is a decimal literal, not the octal 0o775 -- confirm this
# is the representation hdfs.chmod and st_mode use; the assert compares
# against the same literal.
# The first stat result is overwritten after chmod without being inspected.
file_stat = hdfs.stat("Logs/README.md")
hdfs.chmod("Logs/README.md", 775)
file_stat = hdfs.stat("Logs/README.md")
assert 775 == file_stat.st_mode

In [38]:
# Change the mode again and read it back through stat.
# NOTE(review): 777 is a decimal literal, not the octal 0o777 -- confirm intended.
hdfs.chmod("Logs/README.md", 777)
file_stat = hdfs.stat("Logs/README.md")
assert 777 == file_stat.st_mode

In [39]:
# `file_stat` is not defined in this cell; it comes from an earlier cell.
# `file_owner` is read but not checked here.
file_owner = file_stat.st_uid
# exists() must be true for a present directory and false for a missing path.
assert hdfs.exists("Logs/")
assert not hdfs.exists("Not_Existing/neither_am_i")

In [40]:
# isdir() must be true for the Resources directory and false for a file inside it.
assert hdfs.isdir("Resources")
assert not hdfs.isdir("Resources/README.md")

In [41]:
# isfile() must be true for a file and false for the directory containing it.
assert hdfs.isfile("Resources/README.md")
assert not hdfs.isfile("Resources")

In [42]:
# Write a one-function module to HopsFS, register it with add_module and import it.
hdfs.dump("def simple():\n\treturn 5", "Resources/my_module.py")
py_path = hdfs.add_module("Resources/my_module.py")
# The value returned by add_module must be on sys.path.
assert py_path in sys.path
# The import is placed here, not at the top of the notebook, because the module
# only becomes importable after the add_module call above.
import my_module
assert my_module.simple() == 5

Started copying hdfs://10.0.2.15:8020/Projects/collect/Resources/my_module.py to local disk on path /srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/localized_deps

Finished copying

In [43]:
# get_plain_path must drop the "hdfs://host:port" prefix and keep the path part.
plain = hdfs.get_plain_path("hdfs://10.0.2.15:8020/Projects/demo_deep_learning_admin000/Models/")
assert plain == "/Projects/demo_deep_learning_admin000/Models/"

In [44]:
# Create a directory, delete it and check that it is gone.
hdfs.mkdir("Logs/test_delete_dir")
assert hdfs.exists("Logs/test_delete_dir")
hdfs.delete("Logs/test_delete_dir")
assert not hdfs.exists("Logs/test_delete_dir")

## Feature Store Tests

These tests require that you have the following files in the Resources directory:

- `attendances_features.csv`
- `games_features.csv`
- `players_features.csv`
- `season_scores_features.csv`
- `teams_features.csv`

These files can be downloaded from here: `http://snurran.sics.se/hops/hops-util-py_test/`

##### Test Featurestore Create Feature Group Operations (`featurestore.create_featuregroup()`)

In [45]:
def load_fs_sample_data():
    """
    Load the five sample feature CSV files from the project's Resources directory.

    Every file is read through the notebook-level `spark` object with the
    "header" and "inferSchema" options set to "true".

    Returns:
        A 5-tuple of dataframes in this order: games, players, teams,
        season_scores, attendances.
    """
    base_path = hdfs.project_path() + "Resources/"

    def read_csv(file_name):
        # A fresh reader per file, configured exactly the same way each time.
        reader = spark.read.format("csv")
        reader = reader.option("header", "true")
        reader = reader.option("inferSchema", "true")
        return reader.load(base_path + file_name)

    # Order matters: it is both the read order and the order of the returned tuple.
    file_names = (
        "games_features.csv",
        "players_features.csv",
        "teams_features.csv",
        "season_scores_features.csv",
        "attendances_features.csv",
    )
    return tuple(read_csv(file_name) for file_name in file_names)
games_features_df,players_features_df,teams_features_df,season_scores_features_df, attendances_features_df = load_fs_sample_data()

'Path does not exist: hdfs://10.0.2.15:8020/Projects/collect/Resources/games_features.csv;'
Traceback (most recent call last):
  File "<stdin>", line 3, in load_fs_sample_data
  File "/srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/pyspark.zip/pyspark/sql/readwriter.py", line 166, in load
    return self._df(self._jreader.load(path))
  File "/srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, self.gateway_client, self.target_id, self.name)
  File "/srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/pyspark.zip/pyspark/sql/utils.py", line 69, in deco

In [46]:
# Register the games dataframe as the "games_features" feature group.
# The description previously duplicated the text used for season_scores_features.
featurestore.create_featuregroup(
    games_features_df,
    "games_features",
    description="Features of games played between football teams"
)

name 'games_features_df' is not defined
Traceback (most recent call last):
NameError: name 'games_features_df' is not defined



In [47]:
# Register the teams dataframe as the "teams_features" feature group.
# This is the original (non-translated) group, so it must not be described as the Spanish version.
featurestore.create_featuregroup(
    teams_features_df,
    "teams_features",
    description="Features of football teams"
)

name 'teams_features_df' is not defined
Traceback (most recent call last):
NameError: name 'teams_features_df' is not defined



In [48]:
# Register the season scores dataframe under the name "season_scores_features".
featurestore.create_featuregroup(
    season_scores_features_df,
    "season_scores_features",
    description="Features of average season scores for football teams"
)

name 'season_scores_features_df' is not defined
Traceback (most recent call last):
NameError: name 'season_scores_features_df' is not defined



In [49]:
# Register the attendances dataframe under the name "attendances_features".
featurestore.create_featuregroup(
    attendances_features_df,
    "attendances_features",
    description="Features of average attendance of games of football teams"
)

name 'attendances_features_df' is not defined
Traceback (most recent call last):
NameError: name 'attendances_features_df' is not defined



In [50]:
# Read back the teams feature group and derive a copy whose columns carry Spanish names.
teams_features_1_df = featurestore.get_featuregroup("teams_features")
teams_features_2_df = (
    teams_features_1_df
    .withColumnRenamed("team_id", "equipo_id")
    .withColumnRenamed("team_budget", "equipo_presupuesto")
    .withColumnRenamed("team_position", "equipo_posicion")
)

Could not find the requested feature group with name: teams_features and version: 1 among the list of available feature groups: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 266, in get_featuregroup
    dataframe_type = dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 695, in _do_get_featuregroup
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 224, in _find_featuregroup
    featuregroup_names))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: Could not find the requested feature group with name: teams_features and version: 1 among the list of available feature gr

In [51]:
# Create the Spanish-named feature group with every statistics option switched off.
# No feature store or version is passed, so the library defaults apply.
featurestore.create_featuregroup(
    teams_features_2_df,
    "teams_features_spanish",
    description="a spanish version of teams_features",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)

name 'teams_features_2_df' is not defined
Traceback (most recent call last):
NameError: name 'teams_features_2_df' is not defined



In [52]:
# Same creation call, this time passing the feature store and the version explicitly.
# NOTE(review): if the default version is 1, this targets the same name/version as the
# preceding cell - confirm that creating it twice is intended.
featurestore.create_featuregroup(
    teams_features_2_df,
    "teams_features_spanish",
    description="a spanish version of teams_features",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    featurestore=featurestore.project_featurestore(),
    featuregroup_version=1
)

name 'teams_features_2_df' is not defined
Traceback (most recent call last):
NameError: name 'teams_features_2_df' is not defined



In [53]:
# Create version 2 of "teams_features_spanish" from the same dataframe.
featurestore.create_featuregroup(
    teams_features_2_df,
    "teams_features_spanish",
    description="a spanish version of teams_features",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    featuregroup_version=2
)

name 'teams_features_2_df' is not defined
Traceback (most recent call last):
NameError: name 'teams_features_2_df' is not defined



In [54]:
# Create an on-demand feature group defined by a SQL query instead of a dataframe.
# `hdfs` comes from the notebook's imports cell; it is not re-imported here.
query = "SELECT * FROM games_features_1 WHERE score > 1"
# The storage connector is named after the project with a "_featurestore" suffix.
storage_connector = hdfs.project_name() + "_featurestore"
featuregroup_name = "games_features_on_demand"
featurestore.create_on_demand_featuregroup(query, featuregroup_name, storage_connector)

Feature group created successfully

In [55]:
# Fetch the feature group list once rather than once per assertion, and name the
# missing group in the assertion message so a failure is self-explanatory.
created_featuregroups = featurestore.get_featuregroups()
for expected_featuregroup in (
    "games_features_1",
    "teams_features_1",
    "season_scores_features_1",
    "attendances_features_1",
    "teams_features_spanish_1",
    "teams_features_spanish_2",
    "games_features_on_demand_1",
):
    assert expected_featuregroup in created_featuregroups, \
        "feature group '{}' not found among: {}".format(expected_featuregroup, created_featuregroups)


Traceback (most recent call last):
AssertionError



##### Test Featurestore Utility Operations:

- `featurestore.get_featurestore_metadata()`,
- `featurestore.project_featurestore()`, 
- `featurestore.get_latest_featuregroup_version()`, 
- `featurestore.get_features_list()`

In [56]:
# Request the feature store metadata with update_cache=True before the checks that follow.
featurestore.get_featurestore_metadata(update_cache=True)

<hops.featurestore_impl.dao.common.featurestore_metadata.FeaturestoreMetadata object at 0x7fdcf5e3fbe0>

In [57]:
# The project feature store name is expected to be the project name plus "_featurestore".
assert featurestore.project_featurestore() == hdfs.project_name() + "_featurestore"

In [58]:
# The project's own feature store must be part of what get_project_featurestores() returns.
assert featurestore.project_featurestore() in featurestore.get_project_featurestores()

In [59]:
# The test project is expected to have exactly one feature store.
assert len(featurestore.get_project_featurestores()) == 1

In [60]:
# Earlier cells create versions 1 and 2 of teams_features_spanish, so 2 must be the latest.
assert featurestore.get_latest_featuregroup_version("teams_features_spanish") == 2


Traceback (most recent call last):
AssertionError



In [61]:
# teams_features is created only once, without an explicit version; 1 is expected as the latest.
assert featurestore.get_latest_featuregroup_version("teams_features") == 1


Traceback (most recent call last):
AssertionError



In [62]:
# "away_team_id" must be listed by get_features_list() (presumably a games_features column - verify).
assert "away_team_id" in featurestore.get_features_list()


Traceback (most recent call last):
AssertionError



In [63]:
# "home_team_id" must be listed by get_features_list().
assert "home_team_id" in featurestore.get_features_list()


Traceback (most recent call last):
AssertionError



In [64]:
# The connector list must contain a ("<project>_featurestore", "JDBC") tuple.
assert (hdfs.project_name() + "_featurestore", 'JDBC') in featurestore.get_storage_connectors()

In [65]:
# At least three storage connectors are expected for the project.
assert len(featurestore.get_storage_connectors()) >= 3

##### Test Read operations of Features and Feature Groups:

- `featurestore.get_feature()`, 
- `featurestore.get_features()`, 
- `featurestore.get_featuregroup()`

In [66]:
# Fetch a single feature by name only; the result must be a 50-row, one-column dataframe.
tmp = featurestore.get_feature("team_budget")
assert tmp.count() == 50
assert len(tmp.columns) == 1
assert "team_budget" in tmp.columns

Could not find the feature with name 'team_budget' in any of the featuregroups of the featurestore: 'collect_featurestore'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 310, in get_feature
    jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 372, in _do_get_feature
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 30, in create_logical_plan
    self._feature_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 75, in _feature_query
    featuregroups_parsed.values())
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_p

In [67]:
# Same read as the by-name lookup, with feature store, feature group, version and
# dataframe type all passed explicitly; the expected result is unchanged.
tmp = featurestore.get_feature(
    "team_budget", 
    featurestore=featurestore.project_featurestore(), 
    featuregroup="teams_features", 
    featuregroup_version = 1,
    dataframe_type = "spark"
)
assert tmp.count() == 50
assert len(tmp.columns) == 1
assert "team_budget" in tmp.columns

'teams_features_1'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 310, in get_feature
    jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 372, in _do_get_feature
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 30, in create_logical_plan
    self._feature_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 66, in _feature_query
    self.query.featuregroup_version)
KeyError: 'teams_features_1'



In [68]:
# Read the whole teams_features group by name; 50 rows and exactly three columns are expected.
tmp = featurestore.get_featuregroup("teams_features")
assert tmp.count() == 50
assert len(tmp.columns) == 3
assert "team_budget" in tmp.columns
assert "team_id" in tmp.columns
assert "team_position" in tmp.columns

Could not find the requested feature group with name: teams_features and version: 1 among the list of available feature groups: ['games_features_on_demand_1']
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 266, in get_featuregroup
    dataframe_type = dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 695, in _do_get_featuregroup
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 224, in _find_featuregroup
    featuregroup_names))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: Could not find the requested feature group with name: teams_features and version: 1 among the 

In [69]:
# Same feature group read with feature store, version and dataframe type spelled out.
tmp = featurestore.get_featuregroup(
    "teams_features", 
    featurestore=featurestore.project_featurestore(), 
    featuregroup_version = 1,
    dataframe_type = "spark"
)
assert tmp.count() == 50
assert len(tmp.columns) == 3
assert "team_budget" in tmp.columns
assert "team_id" in tmp.columns
assert "team_position" in tmp.columns

Could not find the requested feature group with name: teams_features and version: 1 among the list of available feature groups: ['games_features_on_demand_1']
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 266, in get_featuregroup
    dataframe_type = dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 695, in _do_get_featuregroup
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 224, in _find_featuregroup
    featuregroup_names))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: Could not find the requested feature group with name: teams_features and version: 1 among the 

In [70]:
# Request two features by bare name and let the library work out where they live.
features = ["team_budget", "average_attendance"]
tmp = featurestore.get_features(
    features
)
# The result must contain exactly the requested columns and 50 rows.
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

Could not find the feature with name 'team_budget' in any of the featuregroups of the featurestore: 'collect_featurestore'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 145, in _features_query
    featuregroups_parsed.values())
  File "/srv/hops/anaconda/anaconda/envs/python36/lib

In [71]:
# Request the same two features, each prefixed with "<featuregroup>_<version>.".
features = [
    "teams_features_1.team_budget",
    "attendances_features_1.average_attendance",
]
tmp = featurestore.get_features(features)
# The returned column names are expected without the feature group prefix.
assert {"team_budget", "average_attendance"} == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

Could not find the feature with name 'attendances_features_1.average_attendance' in any of the featuregroups of the featurestore: 'collect_featurestore'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 145, in _features_query
    featuregroups_parsed.values())
  File "/srv/hops/anaco

In [72]:
# Request two features while naming the feature store and pinning both
# feature groups to version 1 through featuregroups_version_dict.
features = ["team_budget", "average_attendance"]
tmp = featurestore.get_features(
    features,
    featurestore=featurestore.project_featurestore(),
    featuregroups_version_dict={
        "teams_features": 1, 
        "attendances_features": 1
    }
)
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

descriptor 'intersection' of 'set' object needs an argument
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 130, in _features_query
    join_col = query_planner._get_join_col(featuregroups_filtered)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/f

In [73]:
# Define the requested features in this cell so it does not depend on a variable
# left behind by an earlier cell.
features = ["team_budget", "average_attendance"]
# Same read as with the version dict alone, now also passing the join key and dataframe type.
tmp = featurestore.get_features(
    features,
    featurestore=featurestore.project_featurestore(),
    featuregroups_version_dict={
        "teams_features": 1,
        "attendances_features": 1
    },
    join_key = "team_id",
    dataframe_type = "spark"
)
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

'teams_features_1'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 115, in _features_query
    for entry in self.query.featuregroups_version_dict]
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py

In [74]:
# Request four features by bare name in one call.
features = ["team_budget", "average_attendance",
    "team_position", "sum_attendance"
    ]
tmp = featurestore.get_features(
   features
)
# The result must contain exactly the requested columns and 50 rows.
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

Could not find the feature with name 'sum_attendance' in any of the featuregroups of the featurestore: 'collect_featurestore'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 145, in _features_query
    featuregroups_parsed.values())
  File "/srv/hops/anaconda/anaconda/envs/python36/

In [75]:
# Both requested features are looked up with a version dict that names only teams_features.
features = ["team_budget", "team_id"]
tmp = featurestore.get_features(
    features,
    featuregroups_version_dict = {
        "teams_features" : 1
    }
)
assert set(features) == set(tmp.columns)
assert tmp.count() == 50
assert len(tmp.columns) == len(features)

'teams_features_1'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 102, in _features_query
    self.query.featuregroups_version_dict[0][constants.REST_CONFIG.JSON_FEATUREGROUP_VERSION]
KeyError: 'teams_features_1'



In [76]:
# Run a hand-written SQL join between the teams and games feature groups.
# Adjacent string literals inside the parentheses are concatenated into one query.
tmp = featurestore.sql(
    "SELECT team_budget, score "
    "FROM teams_features_1 JOIN games_features_1 ON "
    "games_features_1.home_team_id = teams_features_1.team_id"
)
features = ['team_budget', 'score']
# The join is expected to yield 49 rows with exactly the two selected columns.
assert set(features) == set(tmp.columns)
assert tmp.count() == 49
assert len(tmp.columns) == len(features)

'Table or view not found: teams_features_1; line 1 pos 31'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 388, in sql
    result = core._run_and_log_sql(spark, query)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 396, in _run_and_log_sql
    return spark.sql(sql_str)
  File "/srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/pyspark.zip/pyspark/sql/session.py", line 767, in sql
    return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
  File "/srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, sel

In [77]:
# Filter teams_features through SQL and check that the predicate holds for every row.
tmp = featurestore.sql("SELECT * FROM teams_features_1 WHERE team_position < 5")
assert len(tmp.columns) == 3
assert "team_budget" in tmp.columns
assert "team_id" in tmp.columns
assert "team_position" in tmp.columns
# Every returned team_position value must satisfy the WHERE clause.
assert all(position < 5 for position in tmp.toPandas()["team_position"].values)

'Table or view not found: teams_features_1; line 1 pos 14'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 388, in sql
    result = core._run_and_log_sql(spark, query)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 396, in _run_and_log_sql
    return spark.sql(sql_str)
  File "/srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/pyspark.zip/pyspark/sql/session.py", line 767, in sql
    return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
  File "/srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, sel

In [78]:
# Same filtered query, with the feature store and dataframe type passed explicitly.
tmp = featurestore.sql(
    "SELECT * FROM teams_features_1 WHERE team_position < 5",
    featurestore=featurestore.project_featurestore(),
    dataframe_type="spark",
)
assert len(tmp.columns) == 3
assert "team_budget" in tmp.columns
assert "team_id" in tmp.columns
assert "team_position" in tmp.columns
# Every returned team_position value must satisfy the WHERE clause.
assert all(position < 5 for position in tmp.toPandas()["team_position"].values)

'Table or view not found: teams_features_1; line 1 pos 14'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 388, in sql
    result = core._run_and_log_sql(spark, query)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 396, in _run_and_log_sql
    return spark.sql(sql_str)
  File "/srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/pyspark.zip/pyspark/sql/session.py", line 767, in sql
    return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
  File "/srv/hops/hopsdata/tmp/nm-local-dir/usercache/_FeeifJIetMYtCB_6LAfgxXQUl5UEEu7P85ArwnhBaE/appcache/application_1565768877650_0025/container_e02_1565768877650_0025_01_000001/py4j-0.10.7-src.zip/py4j/java_gateway.py", line 1257, in __call__
    answer, sel

##### Test Insert Operations in Existing Feature Groups, `featurestore.insert_into_featuregroup()`

In [79]:
# Build a three-row dataframe whose column names match the renamed (Spanish) team columns.
sqlContext = SQLContext(spark.sparkContext)
schema = StructType([StructField("equipo_id", IntegerType(), True),
                     StructField("equipo_presupuesto", FloatType(), True),
                     StructField("equipo_posicion", IntegerType(), True)
                        ])
sample_df = sqlContext.createDataFrame([(999, 41251.52, 1), (998, 1319.4, 8), (997, 21219.1, 2)], schema)
# insert_count is the number of rows the insert cells add per call.
insert_count = sample_df.count()
assert insert_count == 3

In [80]:
# Record the row count of teams_features_spanish before any insert; 50 rows are expected.
spanish_team_features_df = featurestore.get_featuregroup(
    "teams_features_spanish")
pre_insert_count = spanish_team_features_df.count()
assert pre_insert_count == 50

Could not find the requested feature group with name: teams_features_spanish and version: 1 among the list of available feature groups: ['games_features_on_demand_1']
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 266, in get_featuregroup
    dataframe_type = dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 695, in _do_get_featuregroup
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 224, in _find_featuregroup
    featuregroup_names))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: Could not find the requested feature group with name: teams_features_spanish and versi

In [81]:
# Insert the three sample rows without passing a mode, with statistics switched off.
featurestore.insert_into_featuregroup(
    sample_df, 
    "teams_features_spanish", 
    descriptive_statistics=False, 
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False
)
spanish_team_features_df_updated = featurestore.get_featuregroup(
    "teams_features_spanish")

# 53 = the 50 rows expected before the insert plus the 3 sample rows.
after_insert_count = spanish_team_features_df_updated.count()
assert after_insert_count == 53

Could not find the requested feature group with name: teams_features_spanish and version: 1 among the list of available feature groups: ['games_features_on_demand_1']
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 457, in insert_into_featuregroup
    num_clusters=num_clusters)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 487, in _do_insert_into_featuregroup
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 224, in _find_featuregroup
    featuregroup_names))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: Could not find the requested feature group with name: teams_features_spanish and version: 1 among

In [82]:
# Insert the same rows again, this time with every optional argument spelled out
# and mode="append".
featurestore.insert_into_featuregroup(
    sample_df, 
    "teams_features_spanish", 
    featurestore=featurestore.project_featurestore(), 
    featuregroup_version=1, 
    mode="append",
    descriptive_statistics=False, 
    feature_correlation=False, 
    feature_histograms=False,
    cluster_analysis=False, 
    stat_columns=None, 
    num_bins=20, 
    corr_method='pearson',
    num_clusters=5
)

# 56 = the 53 rows expected after the first insert plus 3 more.
after_insert_count2 = featurestore.get_featuregroup("teams_features_spanish").count()
assert after_insert_count2 == 56

Could not find the requested feature group with name: teams_features_spanish and version: 1 among the list of available feature groups: ['games_features_on_demand_1']
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 457, in insert_into_featuregroup
    num_clusters=num_clusters)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 487, in _do_insert_into_featuregroup
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 224, in _find_featuregroup
    featuregroup_names))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: Could not find the requested feature group with name: teams_features_spanish and version: 1 among

In [83]:
# Insert with mode="overwrite"; afterwards only the three sample rows are expected to remain.
featurestore.insert_into_featuregroup(
    sample_df, 
    "teams_features_spanish",
    descriptive_statistics=False, 
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    mode="overwrite")

count_after_overwrite = featurestore.get_featuregroup("teams_features_spanish").count()
assert count_after_overwrite == 3

Could not find the requested feature group with name: teams_features_spanish and version: 1 among the list of available feature groups: ['games_features_on_demand_1']
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 457, in insert_into_featuregroup
    num_clusters=num_clusters)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 487, in _do_insert_into_featuregroup
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 224, in _find_featuregroup
    featuregroup_names))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: Could not find the requested feature group with name: teams_features_spanish and version: 1 among

##### Test integration of feature store with Numpy, Pandas and plain Python

In [84]:
# With dataframe_type="pandas" the result must be a pandas DataFrame with 50 rows and 2 columns.
pandas_df = featurestore.get_features(["team_budget", "average_attendance"], dataframe_type="pandas")
assert "team_budget" in pandas_df.columns.values
assert "average_attendance" in pandas_df.columns.values
assert len(pandas_df) == 50
assert len(pandas_df.columns.values) == 2
assert isinstance(pandas_df, pd.DataFrame)

Could not find the feature with name 'team_budget' in any of the featuregroups of the featurestore: 'collect_featurestore'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 145, in _features_query
    featuregroups_parsed.values())
  File "/srv/hops/anaconda/anaconda/envs/python36/lib

In [85]:
# Read the same two features as a numpy array and check its shape (rows, columns) and type.
numpy_df = featurestore.get_features(["team_budget", "average_attendance"], 
                                      dataframe_type="numpy")
assert numpy_df.shape[0] == 50
assert numpy_df.shape[1] == 2
assert isinstance(numpy_df, np.ndarray)

Could not find the feature with name 'team_budget' in any of the featuregroups of the featurestore: 'collect_featurestore'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 145, in _features_query
    featuregroups_parsed.values())
  File "/srv/hops/anaconda/anaconda/envs/python36/lib

In [86]:
# Read the same two features with dataframe_type="python"; the result is expected
# to be a plain list with 50 entries.
python_df = featurestore.get_features(["team_budget", "average_attendance"], 
                                      dataframe_type="python")
assert len(python_df) == 50
assert isinstance(python_df, list)

Could not find the feature with name 'team_budget' in any of the featuregroups of the featurestore: 'collect_featurestore'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 145, in _features_query
    featuregroups_parsed.values())
  File "/srv/hops/anaconda/anaconda/envs/python36/lib

In [87]:
# Read the same two features as a spark dataframe and check the row count and type
# (`DataFrame` is pyspark.sql.DataFrame, imported at the top of the notebook).
spark_df = featurestore.get_features(["team_budget", "average_attendance"], 
                                      dataframe_type="spark")
assert spark_df.count() == 50
assert isinstance(spark_df, DataFrame)

Could not find the feature with name 'team_budget' in any of the featuregroups of the featurestore: 'collect_featurestore'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 145, in _features_query
    featuregroups_parsed.values())
  File "/srv/hops/anaconda/anaconda/envs/python36/lib

In [88]:
# Give the columns new names so that this feature group can be told apart
# from the ones that already exist in the feature store.
pandas_df.columns = ["team_budget_test", "average_attendance_test"]

# Create a feature group from the pandas dataframe with every statistics flag disabled,
# then check that it is listed under its versioned name.
featurestore.create_featuregroup(
    pandas_df, "pandas_test_example",
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False,
    description="test featuregroup created from pandas dataframe"
)
assert "pandas_test_example_1" in featurestore.get_featuregroups()

name 'pandas_df' is not defined
Traceback (most recent call last):
NameError: name 'pandas_df' is not defined



In [89]:
# Overwrite the feature group with the dataframe it was created from;
# the row count read back afterwards must equal the count read before.
count_pre_pandas_insert_overwrite = featurestore.get_featuregroup("pandas_test_example").count()
featurestore.insert_into_featuregroup(
    pandas_df, "pandas_test_example",
    mode="overwrite",
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)
count_after_pandas_insert_overwrite = featurestore.get_featuregroup("pandas_test_example").count()
assert count_pre_pandas_insert_overwrite == count_after_pandas_insert_overwrite

Could not find the requested feature group with name: pandas_test_example and version: 1 among the list of available feature groups: ['games_features_on_demand_1']
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 266, in get_featuregroup
    dataframe_type = dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 695, in _do_get_featuregroup
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 224, in _find_featuregroup
    featuregroup_names))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: Could not find the requested feature group with name: pandas_test_example and version: 1 

In [90]:
# Create a feature group from the numpy matrix with every statistics flag disabled,
# then check that it is listed under its versioned name.
featurestore.create_featuregroup(
    numpy_df, "numpy_test_example",
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False,
    description="test featuregroup created from numpy matrix"
)
assert "numpy_test_example_1" in featurestore.get_featuregroups()

name 'numpy_df' is not defined
Traceback (most recent call last):
NameError: name 'numpy_df' is not defined



In [91]:
# Row count of the feature group before the overwrite, read back as a spark dataframe.
numpy_test_df_count_pre_insert_overwrite = featurestore.get_featuregroup("numpy_test_example", dataframe_type="spark").count()
# Overwrite the feature group with the numpy matrix it was created from.
featurestore.insert_into_featuregroup(
    numpy_df,
    "numpy_test_example",
    descriptive_statistics=False,
    feature_correlation=False,
    feature_histograms=False,
    cluster_analysis=False,
    mode="overwrite")
numpy_test_df_count_after_insert_overwrite = featurestore.get_featuregroup("numpy_test_example", dataframe_type="spark").count()
# Compare the count before with the count after. Comparing the "pre" count with
# itself would always pass and would not test the overwrite at all.
assert numpy_test_df_count_pre_insert_overwrite == numpy_test_df_count_after_insert_overwrite

Could not find the requested feature group with name: numpy_test_example and version: 1 among the list of available feature groups: ['games_features_on_demand_1']
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 266, in get_featuregroup
    dataframe_type = dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 695, in _do_get_featuregroup
    fg = query_planner._find_featuregroup(featurestore_metadata.featuregroups, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 224, in _find_featuregroup
    featuregroup_names))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: Could not find the requested feature group with name: numpy_test_example and version: 1 am

In [92]:
# Create a feature group from the python 2D list with every statistics flag disabled.
featurestore.create_featuregroup(
    python_df, "python_test_example",
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False,
    description="test featuregroup created from python 2D list"
)

# Record the row count now; a later cell compares it with the count after an overwrite.
python_test_df_count_pre_insert_overwrite = featurestore.get_featuregroup(
    "python_test_example", dataframe_type="spark"
).count()
assert "python_test_example_1" in featurestore.get_featuregroups()

name 'python_df' is not defined
Traceback (most recent call last):
NameError: name 'python_df' is not defined



In [93]:
# Overwrite the feature group with the python list it was created from.
featurestore.insert_into_featuregroup(
    python_df, "python_test_example",
    mode="overwrite",
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)

# The row count after the overwrite must match the count recorded before it
# (python_test_df_count_pre_insert_overwrite is set by an earlier cell).
python_test_df_count_after_insert_overwrite = featurestore.get_featuregroup(
    "python_test_example", dataframe_type="spark"
).count()
assert python_test_df_count_pre_insert_overwrite == python_test_df_count_after_insert_overwrite

name 'python_df' is not defined
Traceback (most recent call last):
NameError: name 'python_df' is not defined



##### Test update Feature Store Statistics `featurestore.update_featuregroup_stats()`

In [94]:
# Update the statistics of the "teams_features" feature group, passing only its name
# (every other argument is left at the API's default).
featurestore.update_featuregroup_stats("teams_features")

There was an error in computing the statistics for feature group: teams_features , with version: 1 in featurestore: None. Error: 'Table or view not found: teams_features_1; line 1 pos 14'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 528, in update_featuregroup_stats
    featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.StatisticsComputationError: There was an error in computing the statistics for feature group: teams_features , with version: 1 in featurestore: None. Error: 'Table or view not found: teams_features_1; line 1 pos 14'



In [95]:
# Update the statistics again, this time spelling out the version, the feature store
# and every statistics flag explicitly.
featurestore.update_featuregroup_stats(
    "teams_features",
    featurestore=featurestore.project_featurestore(),
    featuregroup_version=1,
    stat_columns=None,
    cluster_analysis=True,
    feature_histograms=True,
    feature_correlation=True,
    descriptive_statistics=True
)

There was an error in computing the statistics for feature group: teams_features , with version: 1 in featurestore: collect_featurestore. Error: 'Table or view not found: teams_features_1; line 1 pos 14'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 528, in update_featuregroup_stats
    featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.StatisticsComputationError: There was an error in computing the statistics for feature group: teams_features , with version: 1 in featurestore: collect_featurestore. Error: 'Table or view not found: teams_features_1; line 1 pos 14'



##### Test Write Training Dataset Operations 

- `featurestore.get_latest_training_dataset_version()`
- `featurestore.create_training_dataset()`

In [96]:
# Fetch the three features that the following cells write out as training datasets.
features_df = featurestore.get_features(
    ["team_budget", "average_attendance",
    "team_position"]
)
# Exercises get_latest_training_dataset_version(); the result is not referenced by the
# cells visible in this section, which pass training_dataset_version=1 explicitly.
latest_version = featurestore.get_latest_training_dataset_version("team_position_prediction")

Could not find the feature with name 'team_budget' in any of the featuregroups of the featurestore: 'collect_featurestore'
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 357, in get_features
    join_key=join_key, dataframe_type=dataframe_type, jdbc_args=jdbc_args)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 542, in _do_get_features
    logical_query_plan.create_logical_plan()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 32, in create_logical_plan
    self._features_query()
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/logical_query_plan.py", line 145, in _features_query
    featuregroups_parsed.values())
  File "/srv/hops/anaconda/anaconda/envs/python36/lib

In [97]:
# Create version 1 of the training dataset without passing data_format,
# with every statistics flag disabled.
featurestore.create_training_dataset(
    features_df,
    "team_position_prediction",
    training_dataset_version=1,
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)

name 'features_df' is not defined
Traceback (most recent call last):
NameError: name 'features_df' is not defined



In [98]:
# Write the features as a csv training dataset (version 1), statistics disabled.
featurestore.create_training_dataset(
    features_df,
    "team_position_prediction_csv",
    data_format="csv",
    training_dataset_version=1,
    featurestore=featurestore.project_featurestore(),
    description="a dataset with features for football teams, used for training a model to predict league-position",
    stat_columns=None,
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)

name 'features_df' is not defined
Traceback (most recent call last):
NameError: name 'features_df' is not defined



In [99]:
# Write the features as a tsv training dataset (version 1), statistics disabled.
featurestore.create_training_dataset(
    features_df,
    "team_position_prediction_tsv",
    data_format="tsv",
    training_dataset_version=1,
    featurestore=featurestore.project_featurestore(),
    description="a dataset with features for football teams, used for training a model to predict league-position",
    stat_columns=None,
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)

name 'features_df' is not defined
Traceback (most recent call last):
NameError: name 'features_df' is not defined



In [100]:
# Write the features as a parquet training dataset (version 1), statistics disabled.
featurestore.create_training_dataset(
    features_df,
    "team_position_prediction_parquet",
    data_format="parquet",
    training_dataset_version=1,
    featurestore=featurestore.project_featurestore(),
    description="a dataset with features for football teams, used for training a model to predict league-position",
    stat_columns=None,
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)

name 'features_df' is not defined
Traceback (most recent call last):
NameError: name 'features_df' is not defined



In [101]:
# Write the features as an orc training dataset (version 1), statistics disabled.
featurestore.create_training_dataset(
    features_df,
    "team_position_prediction_orc",
    data_format="orc",
    training_dataset_version=1,
    featurestore=featurestore.project_featurestore(),
    description="a dataset with features for football teams, used for training a model to predict league-position",
    stat_columns=None,
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)

name 'features_df' is not defined
Traceback (most recent call last):
NameError: name 'features_df' is not defined



In [102]:
# Write the features as an avro training dataset (version 1), statistics disabled.
featurestore.create_training_dataset(
    features_df,
    "team_position_prediction_avro",
    data_format="avro",
    training_dataset_version=1,
    featurestore=featurestore.project_featurestore(),
    description="a dataset with features for football teams, used for training a model to predict league-position",
    stat_columns=None,
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)

name 'features_df' is not defined
Traceback (most recent call last):
NameError: name 'features_df' is not defined



In [103]:
# Write the features as an hdf5 training dataset (version 1), statistics disabled.
featurestore.create_training_dataset(
    features_df,
    "team_position_prediction_hdf5",
    data_format="hdf5",
    training_dataset_version=1,
    featurestore=featurestore.project_featurestore(),
    description="a dataset with features for football teams, used for training a model to predict league-position",
    stat_columns=None,
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)

name 'features_df' is not defined
Traceback (most recent call last):
NameError: name 'features_df' is not defined



In [104]:
# Write the features as an npy training dataset (version 1), statistics disabled.
featurestore.create_training_dataset(
    features_df,
    "team_position_prediction_npy",
    data_format="npy",
    training_dataset_version=1,
    featurestore=featurestore.project_featurestore(),
    description="a dataset with features for football teams, used for training a model to predict league-position",
    stat_columns=None,
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)

name 'features_df' is not defined
Traceback (most recent call last):
NameError: name 'features_df' is not defined



In [105]:
# The petastorm format is exercised on python 3 only, since petastorm does not support python 2.
if sys.version_info[0] >= 3:
    # One scalar, non-nullable field per feature: (name, numpy dtype, spark type).
    PetastormSchema = Unischema('team_position_prediction_petastorm_schema', [
        UnischemaField(field_name, numpy_dtype, (), ScalarCodec(spark_type), False)
        for field_name, numpy_dtype, spark_type in (
            ('team_budget', np.float32, FloatType()),
            ('average_attendance', np.float32, FloatType()),
            ('team_position', np.int32, IntegerType()),
        )
    ])

    # The schema is handed to the writer through the petastorm_args dictionary.
    petastorm_args = {"schema": PetastormSchema}

    featurestore.create_training_dataset(
        features_df,
        "team_position_prediction_petastorm",
        data_format="petastorm",
        petastorm_args=petastorm_args,
        training_dataset_version=1,
        featurestore=featurestore.project_featurestore(),
        description="a dataset with features for football teams, used for training a model to predict league-position",
        stat_columns=None,
        cluster_analysis=False,
        feature_histograms=False,
        feature_correlation=False,
        descriptive_statistics=False
    )

name 'features_df' is not defined
Traceback (most recent call last):
NameError: name 'features_df' is not defined



In [106]:
# Every training dataset created by the preceding cells must be listed by the feature
# store under its versioned name ("<name>_1").
tds = featurestore.get_training_datasets()
expected_training_datasets = [
    'team_position_prediction_1',
    'team_position_prediction_csv_1',
    'team_position_prediction_tsv_1',
    'team_position_prediction_parquet_1',
    'team_position_prediction_orc_1',
    'team_position_prediction_avro_1',
    'team_position_prediction_hdf5_1',
    'team_position_prediction_npy_1',
]
# The petastorm training dataset is only created when running on python 3.
if sys.version_info[0] >= 3:
    expected_training_datasets.append('team_position_prediction_petastorm_1')

# Collect all missing names first, so that a failure reports exactly which datasets
# are absent instead of raising a bare AssertionError on the first one.
missing_training_datasets = [name for name in expected_training_datasets if name not in tds]
assert not missing_training_datasets, \
    "Training datasets missing from the feature store: {} (available: {})".format(
        missing_training_datasets, tds)


Traceback (most recent call last):
AssertionError



##### Test Insert into an existing training dataset, `featurestore.insert_into_training_dataset()`

In [107]:
# Insert features_df into the latest version of the existing csv training dataset and
# compare the row counts read before and after the insert.
count_pre_insert = featurestore.get_training_dataset("team_position_prediction_csv").count()
featurestore.insert_into_training_dataset(
    features_df, "team_position_prediction_csv",
    training_dataset_version=featurestore.get_latest_training_dataset_version("team_position_prediction_csv"),
    cluster_analysis=False,
    feature_histograms=False,
    feature_correlation=False,
    descriptive_statistics=False
)
count_after_insert = featurestore.get_training_dataset("team_position_prediction_csv").count()
# Training datasets only support overwrites, so the row count must stay the same.
assert count_pre_insert == count_after_insert

Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []



##### Test Training Dataset Utility Methods

- `featurestore.get_training_dataset_path()`
- `featurestore.get_training_dataset_tf_record_schema()`

In [108]:
# The training dataset path is expected to contain the project's HDFS path.
assert hdfs.project_path() in featurestore.get_training_dataset_path("team_position_prediction_csv")

Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1084, in get_training_dataset_path
    training_dataset_version=training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 1168, in _do_get_training_dataset_path
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []



In [109]:
# The path is expected to contain "<project name>_Training_Datasets".
assert hdfs.project_name() + "_Training_Datasets" in featurestore.get_training_dataset_path("team_position_prediction_csv")

Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1084, in get_training_dataset_path
    training_dataset_version=training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 1168, in _do_get_training_dataset_path
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []



In [110]:
# The path is expected to contain the name of the training dataset itself.
assert "team_position_prediction_csv" in featurestore.get_training_dataset_path("team_position_prediction_csv")

Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1084, in get_training_dataset_path
    training_dataset_version=training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 1168, in _do_get_training_dataset_path
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []



In [111]:
# The tf record schema of the training dataset must describe the two float features
# and the integer label as scalar fixed-length features without default values.
tf_schema = featurestore.get_training_dataset_tf_record_schema("team_position_prediction")
assert tf_schema == {
    'team_budget': tf.FixedLenFeature(dtype=tf.float32, shape=[], default_value=None),
    'average_attendance': tf.FixedLenFeature(dtype=tf.float32, shape=[], default_value=None),
    'team_position': tf.FixedLenFeature(dtype=tf.int64, shape=[], default_value=None),
}

Could not find the requested training dataset with name: team_position_prediction and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 836, in get_training_dataset_tf_record_schema
    featurestore=featurestore)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 1201, in _do_get_training_dataset_tf_record_schema
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction and version: 1 among the list of available training datasets: []



In [112]:
# Read the training dataset back (note: this rebinds features_df) and derive the
# tf record schema from the dataframe; it must match the same expected schema.
features_df = featurestore.get_training_dataset("team_position_prediction")
tf_schema = featurestore.get_dataframe_tf_record_schema(features_df)
assert tf_schema == {
    'team_budget': tf.FixedLenFeature(dtype=tf.float32, shape=[], default_value=None),
    'average_attendance': tf.FixedLenFeature(dtype=tf.float32, shape=[], default_value=None),
    'team_position': tf.FixedLenFeature(dtype=tf.int64, shape=[], default_value=None),
}

Could not find the requested training dataset with name: team_position_prediction and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction and version: 1 among the list of available training datasets: []



##### Test update Training Dataset stats

- `featurestore.update_training_dataset_stats()`

In [113]:
# Update the statistics of the training dataset, passing only its name
# (every other argument is left at the API's default).
featurestore.update_training_dataset_stats("team_position_prediction")

There was an error in computing the statistics for training dataset: team_position_prediction , with version: 1 in featurestore: None. Error: Could not find the requested training dataset with name: team_position_prediction and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1212, in update_training_dataset_stats
    featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.StatisticsComputationError: There was an error in computing the statistics for training dataset: team_position_prediction , with version: 1 in featurestore: None. Error: Could not find the requested training dataset with name: team_position_prediction and version: 1 among the list of available training datasets: []



In [114]:
# Update the statistics again, this time spelling out the version, the feature store
# and every statistics flag explicitly.
featurestore.update_training_dataset_stats(
    "team_position_prediction",
    featurestore=featurestore.project_featurestore(),
    training_dataset_version=1,
    stat_columns=None,
    cluster_analysis=True,
    feature_histograms=True,
    feature_correlation=True,
    descriptive_statistics=True
)

There was an error in computing the statistics for training dataset: team_position_prediction , with version: 1 in featurestore: collect_featurestore. Error: Could not find the requested training dataset with name: team_position_prediction and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1212, in update_training_dataset_stats
    featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.StatisticsComputationError: There was an error in computing the statistics for training dataset: team_position_prediction , with version: 1 in featurestore: collect_featurestore. Error: Could not find the requested training dataset with name: team_position_prediction and version: 1 among the list of available training datasets: []



##### Test Read Training Datasets API `featurestore.get_training_dataset()`

In [115]:
# Expected column names of the training datasets; `cols` is reused by later read checks.
cols = ['team_budget', 'average_attendance', 'team_position']
# Read the csv training dataset and check its columns (order-insensitive) and row count.
tmp = featurestore.get_training_dataset("team_position_prediction_csv")
assert set(tmp.columns) == set(cols)
assert tmp.count() == 50

Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_csv and version: 1 among the list of available training datasets: []



In [116]:
# Read the hdf5 training dataset; only the row count is checked in this cell.
tmp = featurestore.get_training_dataset("team_position_prediction_hdf5")
assert tmp.count() == 50

Could not find the requested training dataset with name: team_position_prediction_hdf5 and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_hdf5 and version: 1 among the list of available training datasets: []



In [117]:
# The petastorm training dataset is only read back on python 3.
# `cols` is the expected column list defined in an earlier cell.
if sys.version_info[0] >= 3:
    tmp = featurestore.get_training_dataset("team_position_prediction_petastorm")
    assert set(tmp.columns) == set(cols)
    assert tmp.count() == 50

Could not find the requested training dataset with name: team_position_prediction_petastorm and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_petastorm and version: 1 among the list of available training datasets: []



In [118]:
# Read back the avro training dataset and check its columns (order ignored) and count().
# `cols` is not defined in this cell; it must come from an earlier one.
tmp = featurestore.get_training_dataset("team_position_prediction_avro")
assert set(tmp.columns) == set(cols)
assert tmp.count() == 50

Could not find the requested training dataset with name: team_position_prediction_avro and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_avro and version: 1 among the list of available training datasets: []



In [119]:
# Read back the orc training dataset and check its columns (order ignored) and count().
# `cols` is not defined in this cell; it must come from an earlier one.
tmp = featurestore.get_training_dataset("team_position_prediction_orc")
assert set(tmp.columns) == set(cols)
assert tmp.count() == 50

Could not find the requested training dataset with name: team_position_prediction_orc and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_orc and version: 1 among the list of available training datasets: []



In [120]:
# Read back the tsv training dataset and check its columns (order ignored) and count().
# `cols` is not defined in this cell; it must come from an earlier one.
tmp = featurestore.get_training_dataset("team_position_prediction_tsv")
assert set(tmp.columns) == set(cols)
assert tmp.count() == 50

Could not find the requested training dataset with name: team_position_prediction_tsv and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_tsv and version: 1 among the list of available training datasets: []



In [121]:
# Read back the npy training dataset and check that count() reports 50.
# Column names are not checked in this cell.
tmp = featurestore.get_training_dataset("team_position_prediction_npy")
assert tmp.count() == 50

Could not find the requested training dataset with name: team_position_prediction_npy and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_npy and version: 1 among the list of available training datasets: []



In [122]:
# Read back the parquet training dataset and check its columns (order ignored) and count().
# `cols` is not defined in this cell; it must come from an earlier one.
tmp = featurestore.get_training_dataset("team_position_prediction_parquet")
assert set(tmp.columns) == set(cols)
assert tmp.count() == 50

Could not find the requested training dataset with name: team_position_prediction_parquet and version: 1 among the list of available training datasets: []
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 868, in get_training_dataset
    dataframe_type=dataframe_type)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 756, in _do_get_training_dataset
    training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/query_planner/query_planner.py", line 196, in _find_training_dataset
    training_dataset_names))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: Could not find the requested training dataset with name: team_position_prediction_parquet and version: 1 among the list of available training datasets: []



##### Test Featurestore Get Statistics

- `featurestore.get_featuregroup_statistics()`
- `featurestore.get_training_dataset_statistics()`

In [123]:
# Fetch the statistics of the `teams_features` feature group and check that each
# of the four sections (clusters, correlations, descriptive stats, histograms)
# is present and non-empty. One assert per line so a failure names the exact check.
stats = featurestore.get_featuregroup_statistics("teams_features")

# --- cluster analysis: one cluster entry per datapoint ---
assert stats.cluster_analysis is not None
assert stats.cluster_analysis.clusters is not None
assert stats.cluster_analysis.datapoints is not None
assert len(stats.cluster_analysis.clusters) == len(stats.cluster_analysis.datapoints)
assert stats.cluster_analysis.clusters[0].datapoint_name is not None
assert stats.cluster_analysis.clusters[0].cluster is not None

# --- correlation matrix: non-empty, below the column cap, and square ---
assert stats.correlation_matrix is not None
assert stats.correlation_matrix.feature_correlations is not None
assert len(stats.correlation_matrix.feature_correlations) > 0
assert len(stats.correlation_matrix.feature_correlations) < constants.FEATURE_STORE.MAX_CORRELATION_MATRIX_COLUMNS
assert stats.correlation_matrix.feature_correlations[0].feature_name is not None
assert stats.correlation_matrix.feature_correlations[0].correlation_values is not None
assert (
    len(stats.correlation_matrix.feature_correlations[0].correlation_values)
    == len(stats.correlation_matrix.feature_correlations)
)

# --- descriptive statistics: at least one feature with at least one metric ---
assert stats.descriptive_stats is not None
assert stats.descriptive_stats.descriptive_stats is not None
assert len(stats.descriptive_stats.descriptive_stats) > 0
assert stats.descriptive_stats.descriptive_stats[0].feature_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values is not None
assert len(stats.descriptive_stats.descriptive_stats[0].metric_values) > 0
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].metric_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].value is not None

# --- feature histograms: at least one feature with at least one bin ---
assert stats.feature_histograms is not None
assert stats.feature_histograms.feature_distributions is not None
assert len(stats.feature_histograms.feature_distributions) > 0
assert stats.feature_histograms.feature_distributions[0].feature_name is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution is not None
assert len(stats.feature_histograms.feature_distributions[0].frequency_distribution) > 0
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].bin is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].frequency is not None

The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1787, in get_featuregroup_statistics
    return core._do_get_featuregroup_statistics(featuregroup_name, featurestore, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 1534, in _do_get_featuregroup_statistics
    featuregroup_id = _get_featuregroup_id(featurestore, featuregroup_name, featuregroup_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 309, in _get_featuregroup_id
    featurestore))
hops.featurestore_impl.exceptions.exceptions.FeaturegroupNotFound: The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore



In [124]:
# Fetch the statistics of the `team_position_prediction` training dataset and check
# that each of the four sections (clusters, correlations, descriptive stats,
# histograms) is present and non-empty. One assert per line so a failure names
# the exact check.
stats = featurestore.get_training_dataset_statistics("team_position_prediction")

# --- cluster analysis: one cluster entry per datapoint ---
assert stats.cluster_analysis is not None
assert stats.cluster_analysis.clusters is not None
assert stats.cluster_analysis.datapoints is not None
assert len(stats.cluster_analysis.clusters) == len(stats.cluster_analysis.datapoints)
assert stats.cluster_analysis.clusters[0].datapoint_name is not None
assert stats.cluster_analysis.clusters[0].cluster is not None

# --- correlation matrix: non-empty, below the column cap, and square ---
assert stats.correlation_matrix is not None
assert stats.correlation_matrix.feature_correlations is not None
assert len(stats.correlation_matrix.feature_correlations) > 0
assert len(stats.correlation_matrix.feature_correlations) < constants.FEATURE_STORE.MAX_CORRELATION_MATRIX_COLUMNS
assert stats.correlation_matrix.feature_correlations[0].feature_name is not None
assert stats.correlation_matrix.feature_correlations[0].correlation_values is not None
assert (
    len(stats.correlation_matrix.feature_correlations[0].correlation_values)
    == len(stats.correlation_matrix.feature_correlations)
)

# --- descriptive statistics: at least one feature with at least one metric ---
assert stats.descriptive_stats is not None
assert stats.descriptive_stats.descriptive_stats is not None
assert len(stats.descriptive_stats.descriptive_stats) > 0
assert stats.descriptive_stats.descriptive_stats[0].feature_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values is not None
assert len(stats.descriptive_stats.descriptive_stats[0].metric_values) > 0
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].metric_name is not None
assert stats.descriptive_stats.descriptive_stats[0].metric_values[0].value is not None

# --- feature histograms: at least one feature with at least one bin ---
assert stats.feature_histograms is not None
assert stats.feature_histograms.feature_distributions is not None
assert len(stats.feature_histograms.feature_distributions) > 0
assert stats.feature_histograms.feature_distributions[0].feature_name is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution is not None
assert len(stats.feature_histograms.feature_distributions[0].frequency_distribution) > 0
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].bin is not None
assert stats.feature_histograms.feature_distributions[0].frequency_distribution[0].frequency is not None

The training dataset team_position_prediction with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1815, in get_training_dataset_statistics
    return core._do_get_training_dataset_statistics(training_dataset_name, featurestore, training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 1557, in _do_get_training_dataset_statistics
    training_dataset_id = _get_training_dataset_id(featurestore, training_dataset_name, training_dataset_version)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore_impl/core.py", line 1120, in _get_training_dataset_id
    training_dataset_name, training_dataset_version, featurestore))
hops.featurestore_impl.exceptions.exceptions.TrainingDatasetNotFound: The training dataset t

##### Test Featurestore Visualizations

- `featurestore.visualize_featuregroup_distributions()`
- `featurestore.visualize_featuregroup_correlations()`
- `featurestore.visualize_featuregroup_clusters()`
- `featurestore.visualize_featuregroup_descriptive_stats()`
- `featurestore.visualize_training_dataset_distributions()`
- `featurestore.visualize_training_dataset_correlations()`
- `featurestore.visualize_training_dataset_clusters()`
- `featurestore.visualize_training_dataset_descriptive_stats()`

In [125]:
# With plot=False the return value is used as a figure object and written to a PNG
# in the working directory; the test passes if neither call raises.
fig = featurestore.visualize_featuregroup_distributions("teams_features", plot=False)
fig.savefig("teams_features_distributions.png")

There was an error in visualizing the feature distributions for feature group: teams_features with version: 1 in featurestore: collect_featurestore. Error: The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1317, in visualize_featuregroup_distributions
    featuregroup_name, featuregroup_version, featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.FeatureVisualizationError: There was an error in visualizing the feature distributions for feature group: teams_features with version: 1 in featurestore: collect_featurestore. Error: The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore



In [126]:
# With plot=False the return value is used as a figure object and written to a PNG
# in the working directory; the test passes if neither call raises.
fig = featurestore.visualize_featuregroup_correlations("teams_features", plot=False)
fig.savefig("teams_features_correlations.png")

There was an error in visualizing the feature correlations for feature group: teams_features with version: 1 in featurestore: collect_featurestore. Error: The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1391, in visualize_featuregroup_correlations
    featuregroup_name, featuregroup_version, featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.FeatureVisualizationError: There was an error in visualizing the feature correlations for feature group: teams_features with version: 1 in featurestore: collect_featurestore. Error: The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore



In [127]:
# With plot=False the return value is used as a figure object and written to a PNG
# in the working directory; the test passes if neither call raises.
fig = featurestore.visualize_featuregroup_clusters("teams_features", plot=False)
fig.savefig("teams_features_clusters.png")

There was an error in visualizing the feature clusters for feature group: teams_features with version: 1 in featurestore: collect_featurestore. Error: The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1455, in visualize_featuregroup_clusters
    featuregroup_name, featuregroup_version, featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.FeatureVisualizationError: There was an error in visualizing the feature clusters for feature group: teams_features with version: 1 in featurestore: collect_featurestore. Error: The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore



In [128]:
# Fetch the descriptive statistics of the feature group (presumably a pandas
# DataFrame, given the name and head() -- confirm) and display the first rows.
desc_stats_df = featurestore.visualize_featuregroup_descriptive_stats("teams_features")
desc_stats_df.head()

There was an error in visualizing the descriptive statistics for featuregroup: teams_features with version: 1 in featurestore: collect_featurestore. Error: The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1499, in visualize_featuregroup_descriptive_stats
    featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.FeatureVisualizationError: There was an error in visualizing the descriptive statistics for featuregroup: teams_features with version: 1 in featurestore: collect_featurestore. Error: The featuregroup teams_features with version: 1 was not found in the feature store collect_featurestore



In [129]:
# With plot=False the return value is used as a figure object and written to a PNG
# in the working directory; the test passes if neither call raises.
fig = featurestore.visualize_training_dataset_distributions("team_position_prediction", plot=False)
fig.savefig("team_position_prediction_distributions.png")

There was an error in visualizing the feature distributions for training dataset: team_position_prediction with version: 1 in featurestore: collect_featurestore. Error: The training dataset team_position_prediction with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1573, in visualize_training_dataset_distributions
    featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.FeatureVisualizationError: There was an error in visualizing the feature distributions for training dataset: team_position_prediction with version: 1 in featurestore: collect_featurestore. Error: The training dataset team_position_prediction with version: 1 was not found in the feature store collect_featurestore



In [130]:
# With plot=False the return value is used as a figure object and written to a PNG
# in the working directory; the test passes if neither call raises.
fig = featurestore.visualize_training_dataset_correlations("team_position_prediction", plot=False)
fig.savefig("team_position_prediction_correlations.png")

There was an error in visualizing the feature correlations for training dataset: team_position_prediction with version: 1 in featurestore: collect_featurestore. Error: The training dataset team_position_prediction with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1650, in visualize_training_dataset_correlations
    featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.FeatureVisualizationError: There was an error in visualizing the feature correlations for training dataset: team_position_prediction with version: 1 in featurestore: collect_featurestore. Error: The training dataset team_position_prediction with version: 1 was not found in the feature store collect_featurestore



In [131]:
# With plot=False the return value is used as a figure object and written to a PNG
# in the working directory; the test passes if neither call raises.
fig = featurestore.visualize_training_dataset_clusters("team_position_prediction", plot=False)
fig.savefig("team_position_prediction_clusters.png")

There was an error in visualizing the feature clusters for training dataset: team_position_prediction with version: 1 in featurestore: collect_featurestore. Error: The training dataset team_position_prediction with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1715, in visualize_training_dataset_clusters
    featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.FeatureVisualizationError: There was an error in visualizing the feature clusters for training dataset: team_position_prediction with version: 1 in featurestore: collect_featurestore. Error: The training dataset team_position_prediction with version: 1 was not found in the feature store collect_featurestore



In [132]:
# Fetch the descriptive statistics of the training dataset (presumably a pandas
# DataFrame, given the name and head() -- confirm) and display the first rows.
desc_stats_df = featurestore.visualize_training_dataset_descriptive_stats("team_position_prediction")
desc_stats_df.head()

There was an error in visualizing the descriptive statistics for training dataset: team_position_prediction with version: 1 in featurestore: collect_featurestore. Error: The training dataset team_position_prediction with version: 1 was not found in the feature store collect_featurestore
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/featurestore.py", line 1759, in visualize_training_dataset_descriptive_stats
    featurestore, str(e)))
hops.featurestore_impl.exceptions.exceptions.FeatureVisualizationError: There was an error in visualizing the descriptive statistics for training dataset: team_position_prediction with version: 1 in featurestore: collect_featurestore. Error: The training dataset team_position_prediction with version: 1 was not found in the feature store collect_featurestore



##### Cleanup (Delete FS Contents so that next test run works the same)

In [133]:
# Delete feature groups
# Best-effort cleanup: switch to the project's feature store database and drop the
# table of every listed feature group. A table that cannot be dropped is reported
# and skipped so the remaining feature groups are still processed.
spark.sql('use ' + featurestore.project_featurestore())
for fg in featurestore.get_featuregroups():
    try:
        spark.sql("drop table " + fg)
    except Exception as e:
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, so the cleanup can still be interrupted.
        print("Skipping feature group {}: drop table failed: {}".format(fg, e))

In [134]:
# Delete training datasets
# Best-effort cleanup: recursively remove the directory of every listed training
# dataset under "<project name>_Training_Datasets/". A directory that cannot be
# removed is reported and skipped so the remaining datasets are still processed.
td_dir = hdfs.project_name() + "_Training_Datasets/"
for td in featurestore.get_training_datasets():
    try:
        hdfs.rmr(td_dir + td)
    except Exception as e:
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, so the cleanup can still be interrupted.
        print("Skipping training dataset {}: delete failed: {}".format(td, e))

In [135]:
# Re-fetch the feature store metadata with update_cache=True (presumably so the
# check below does not read a stale cache -- confirm).
featurestore.get_featurestore_metadata(update_cache=True)
# TODO: the on-demand feature group will still be there after the cleanup, so the
# check below stays disabled; maybe add a delete endpoint to the Python SDK.
#assert featurestore.get_featuregroups() == []
assert featurestore.get_training_datasets() == []

## Kafka Tests

##### Test default config 

- `kafka.get_kafka_default_config()`
- `kafka.get_security_protocol()`
- `kafka.get_broker_endpoints_list()`

In [136]:
# The default Kafka config must contain the broker list, the security protocol and
# the three SSL file locations. One assert per key so a failure names the missing key.
config = kafka.get_kafka_default_config()
assert "bootstrap.servers" in config
assert "security.protocol" in config
assert "ssl.ca.location" in config
assert "ssl.key.location" in config
assert "ssl.certificate.location" in config

In [137]:
# Both accessors must return something non-empty.
assert len(kafka.get_security_protocol()) > 0
assert len(kafka.get_broker_endpoints_list()) > 0

## TLS Tests

##### Test access to TLS tokens

- `tls.get_key_store()`
- `tls.get_trust_store()`
- `tls.get_key_store_pwd()`
- `tls.get_trust_store_pwd()`
- `tls.get_client_certificate_location()`
- `tls.get_client_key_location()`
- `tls.get_ca_chain_location()`

In [138]:
# Every TLS accessor must return a non-empty value. Only the length is checked,
# not the content. One assert per accessor so a failure names the accessor.
assert len(tls.get_key_store()) > 0
assert len(tls.get_trust_store()) > 0
assert len(tls.get_key_store_pwd()) > 0
assert len(tls.get_trust_store_pwd()) > 0
assert len(tls.get_client_certificate_location()) > 0
assert len(tls.get_client_key_location()) > 0
assert len(tls.get_ca_chain_location()) > 0

## Serving Tests

These tests require that you have the following files in the Resources directory:

- `iris_knn.pkl`
- `iris_flower_classifier.py`
- `mnist`

Where mnist is a directory containing a tensorflow model.

These files can be downloaded from here: `http://snurran.sics.se/hops/hops-util-py_test/`

##### Test Export Model

In [139]:
# Export the pickled model file as "IrisFlowerClassifier" / 1 with overwrite enabled,
# then check that hdfs.exists() finds it under Models/IrisFlowerClassifier/1/.
model_path = "Resources/iris_knn.pkl"
serving.export(model_path, "IrisFlowerClassifier", 1, overwrite=True)
assert hdfs.exists("Models/IrisFlowerClassifier/1/iris_knn.pkl")

the provided model_path: Resources/iris_knn.pkl , does not exist in HDFS or on the local filesystem
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 404, in export
    model_path))
ValueError: the provided model_path: Resources/iris_knn.pkl , does not exist in HDFS or on the local filesystem



In [140]:
# Export the classifier script to the same model name and version as the pickled
# model, then check that hdfs.exists() finds it under Models/IrisFlowerClassifier/1/.
model_path = "Resources/iris_flower_classifier.py"
serving.export(model_path, "IrisFlowerClassifier", 1, overwrite=True)
assert hdfs.exists("Models/IrisFlowerClassifier/1/iris_flower_classifier.py")

the provided model_path: Resources/iris_flower_classifier.py , does not exist in HDFS or on the local filesystem
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 404, in export
    model_path))
ValueError: the provided model_path: Resources/iris_flower_classifier.py , does not exist in HDFS or on the local filesystem



In [141]:
# Export the whole `mnist` directory as "mnist" / 2 with overwrite enabled, then
# check that the Models/mnist/2/ directory exists.
model_path = "Resources/mnist/"
serving.export(model_path, "mnist", 2, overwrite=True)
assert hdfs.exists("Models/mnist/2/")

the provided model_path: Resources/mnist/ , does not exist in HDFS or on the local filesystem
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 404, in export
    model_path))
ValueError: the provided model_path: Resources/mnist/ , does not exist in HDFS or on the local filesystem



##### Test Serve Model

In [142]:
# Start from a clean slate: delete an existing "IrisFlowerClassifier" serving, then
# create it as an SKLEARN serving (model_version=1) pointing at the exported script.
script_path = "Models/IrisFlowerClassifier/1/iris_flower_classifier.py"
if serving.exists("IrisFlowerClassifier"):
    serving.delete("IrisFlowerClassifier")
serving.create_or_update(script_path, "IrisFlowerClassifier", serving_type="SKLEARN", 
                                 model_version=1)

path hdfs://10.0.2.15:8020/Projects/collect/Models/IrisFlowerClassifier/1/iris_flower_classifier.py not found
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 223, in create_or_update
    artifact_path = hdfs._expand_path(artifact_path)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/hdfs.py", line 135, in _expand_path
    raise IOError("path %s not found" % hdfs_path)
OSError: path hdfs://10.0.2.15:8020/Projects/collect/Models/IrisFlowerClassifier/1/iris_flower_classifier.py not found



In [143]:
# The "IrisFlowerClassifier" serving is expected to exist at this point.
assert serving.exists("IrisFlowerClassifier")


Traceback (most recent call last):
AssertionError



In [144]:
# Start from a clean slate: delete an existing "mnist" serving, then create it as a
# TENSORFLOW serving (model_version=2) pointing at the exported model directory.
model_path = "Models/mnist/2/"
if serving.exists("mnist"):
    serving.delete("mnist")
serving.create_or_update(model_path, "mnist", serving_type="TENSORFLOW", 
                                 model_version=2)

path hdfs://10.0.2.15:8020/Projects/collect/Models/mnist/2/ not found
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 223, in create_or_update
    artifact_path = hdfs._expand_path(artifact_path)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/hdfs.py", line 135, in _expand_path
    raise IOError("path %s not found" % hdfs_path)
OSError: path hdfs://10.0.2.15:8020/Projects/collect/Models/mnist/2/ not found



In [145]:
# The "mnist" serving is expected to exist at this point.
assert serving.exists("mnist")


Traceback (most recent call last):
AssertionError



##### Test Data Access Operations on Model

In [146]:
# Check each metadata accessor for both servings: id, artifact path, type, version,
# Kafka topic and status. Expected values mirror the arguments used when the
# servings were created; neither has been started yet, hence "Stopped".
assert serving.get_id("IrisFlowerClassifier") is not None
assert serving.get_id("mnist") is not None
# Substring match: get_artifact_path() may return a longer (e.g. fully qualified) path.
assert "Models/IrisFlowerClassifier/1/iris_flower_classifier.py" in serving.get_artifact_path("IrisFlowerClassifier")
assert "Models/mnist/2/" in serving.get_artifact_path("mnist")
assert serving.get_type("IrisFlowerClassifier") == "SKLEARN"
assert serving.get_type("mnist") == "TENSORFLOW"
assert serving.get_version("IrisFlowerClassifier") == 1
assert serving.get_version("mnist") == 2
assert serving.get_kafka_topic("IrisFlowerClassifier") is not None
assert serving.get_kafka_topic("mnist") is not None
assert serving.get_status("IrisFlowerClassifier") == "Stopped"
assert serving.get_status("mnist") == "Stopped"

No serving with name: IrisFlowerClassifier could be found among the list of available servings: 
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 507, in get_id
    serving = _find_serving_with_name(serving_name, servings)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 648, in _find_serving_with_name
    "available servings: {}".format(serving_name, serving_names_str))
hops.serving.ServingNotFound: No serving with name: IrisFlowerClassifier could be found among the list of available servings: 



##### Test Start/Stop Serving

In [147]:
# Start both servings; the status is checked in the next cell.
serving.start("IrisFlowerClassifier")
serving.start("mnist")

No serving with name: IrisFlowerClassifier could be found among the list of available servings: 
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 123, in start
    serving_id = get_id(serving_name)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 507, in get_id
    serving = _find_serving_with_name(serving_name, servings)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 648, in _find_serving_with_name
    "available servings: {}".format(serving_name, serving_names_str))
hops.serving.ServingNotFound: No serving with name: IrisFlowerClassifier could be found among the list of available servings: 



In [148]:
# Both servings are expected to report status "Running" at this point.
assert serving.get_status("IrisFlowerClassifier") == "Running"
assert serving.get_status("mnist") == "Running"

No serving with name: IrisFlowerClassifier could be found among the list of available servings: 
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 607, in get_status
    serving = _find_serving_with_name(serving_name, servings)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 648, in _find_serving_with_name
    "available servings: {}".format(serving_name, serving_names_str))
hops.serving.ServingNotFound: No serving with name: IrisFlowerClassifier could be found among the list of available servings: 



In [149]:
# Stop both servings; the status is checked in the next cell.
serving.stop("IrisFlowerClassifier")
serving.stop("mnist")

No serving with name: IrisFlowerClassifier could be found among the list of available servings: 
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 144, in stop
    serving_id = get_id(serving_name)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 507, in get_id
    serving = _find_serving_with_name(serving_name, servings)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 648, in _find_serving_with_name
    "available servings: {}".format(serving_name, serving_names_str))
hops.serving.ServingNotFound: No serving with name: IrisFlowerClassifier could be found among the list of available servings: 



In [150]:
# Both servings are expected to report status "Stopped" at this point.
assert serving.get_status("IrisFlowerClassifier") == "Stopped"
assert serving.get_status("mnist") == "Stopped"

No serving with name: IrisFlowerClassifier could be found among the list of available servings: 
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 607, in get_status
    serving = _find_serving_with_name(serving_name, servings)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 648, in _find_serving_with_name
    "available servings: {}".format(serving_name, serving_names_str))
hops.serving.ServingNotFound: No serving with name: IrisFlowerClassifier could be found among the list of available servings: 



##### Test Send Inference Requests

In [151]:
# Start both servings again so the inference requests below have a target.
serving.start("IrisFlowerClassifier")
serving.start("mnist")

No serving with name: IrisFlowerClassifier could be found among the list of available servings: 
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 123, in start
    serving_id = get_id(serving_name)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 507, in get_id
    serving = _find_serving_with_name(serving_name, servings)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 648, in _find_serving_with_name
    "available servings: {}".format(serving_name, serving_names_str))
hops.serving.ServingNotFound: No serving with name: IrisFlowerClassifier could be found among the list of available servings: 



In [152]:
# Send 20 inference requests to the IrisFlowerClassifier serving, each with a
# single instance of four random feature values drawn from [1, 8].
for i in range(20):
    # The comprehension uses its own variable name so it cannot clobber the
    # outer loop counter (list comprehensions leak their variable on Python 2).
    data = {"inputs": [[random.uniform(1, 8) for _feature in range(4)]]}
    response = serving.make_inference_request("IrisFlowerClassifier", data)
    assert response is not None
    # Each membership test must be spelled out: `"predictions" or "prediction" in response`
    # parses as `"predictions" or (...)`, which is always truthy and never fails.
    assert "predictions" in response or "prediction" in response

Could not create or update serving (url: /hopsworks-api/api/project/120/inference/models/IrisFlowerClassifier:predict), server response: 
 HTTP code: 404, HTTP reason: Not Found, error code: 250000, error msg: Serving instance not found, user msg: name: IrisFlowerClassifier
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 721, in make_inference_request
    return _make_inference_request_rest(serving_name, data, verb)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 759, in _make_inference_request_rest
    error_code, error_msg, user_msg))
hops.exceptions.RestAPIError: Could not create or update serving (url: /hopsworks-api/api/project/120/inference/models/IrisFlowerClassifier:predict), server response: 
 HTTP code: 404, HTTP reason: Not Found, error code: 250000, error msg: Serving instance not found, user msg: name: IrisFlowerClassifier



In [153]:
# Send 20 inference requests to the "mnist" serving; every request carries one
# instance of 784 random values and must come back with a "predictions" entry.
for i in range(20):
    random_instance = np.random.rand(784).tolist()
    data = {"signature_name": "predict_images", "instances": [random_instance]}
    response = serving.make_inference_request("mnist", data)
    assert response is not None
    assert "predictions" in response

Could not create or update serving (url: /hopsworks-api/api/project/120/inference/models/mnist:predict), server response: 
 HTTP code: 404, HTTP reason: Not Found, error code: 250000, error msg: Serving instance not found, user msg: name: mnist
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 721, in make_inference_request
    return _make_inference_request_rest(serving_name, data, verb)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 759, in _make_inference_request_rest
    error_code, error_msg, user_msg))
hops.exceptions.RestAPIError: Could not create or update serving (url: /hopsworks-api/api/project/120/inference/models/mnist:predict), server response: 
 HTTP code: 404, HTTP reason: Not Found, error code: 250000, error msg: Serving instance not found, user msg: name: mnist



##### Test Kafka Inference Log

In [154]:
# Avro Python is only supported in python 2
# Set up a Kafka consumer for the inference log of the IrisFlowerClassifier
# serving. On Python 3 the whole cell is a no-op.
if sys.version_info[0] < 3:
    # Look up the Kafka topic associated with the serving.
    topic = serving.get_kafka_topic("IrisFlowerClassifier")
    config = kafka.get_kafka_default_config()
    # Read from the earliest offset -- presumably so that requests logged before
    # this consumer subscribed are still delivered (TODO confirm).
    config['default.topic.config'] = {'auto.offset.reset': 'earliest'}
    consumer = Consumer(config)
    topics = [topic]
    consumer.subscribe(topics)
    # Fetch the topic's schema and convert it to Avro. `consumer` and
    # `avro_schema` are read by the next cell to poll and decode messages.
    json_schema = kafka.get_schema(topic)
    avro_schema = kafka.convert_json_schema_to_avro(json_schema)

In [155]:
# Avro Python is only supported in python 2
# Poll the inference-log topic up to 10 times and validate every message that
# arrives; polls that return nothing are skipped.
if sys.version_info[0] < 3:
    for i in range(10):
        msg = consumer.poll(timeout=1.5)
        if msg is None:
            continue
        event_dict = kafka.parse_avro_msg(msg.value(), avro_schema)
        # Fields every logged inference event must contain.
        for required_field in ("modelName", "requestTimestamp", "servingType", "inferenceResponse"):
            assert required_field in event_dict
        assert event_dict["modelName"] == "IrisFlowerClassifier"
        assert event_dict["servingType"] == "SKLEARN"

In [156]:
# Avro Python is only supported in python 2
# Set up a Kafka consumer for the inference log of the "mnist" serving.
# On Python 3 the whole cell is a no-op.
if sys.version_info[0] < 3:
    # Look up the Kafka topic associated with the serving.
    topic = serving.get_kafka_topic("mnist")
    config = kafka.get_kafka_default_config()
    # Read from the earliest offset -- presumably so that requests logged before
    # this consumer subscribed are still delivered (TODO confirm).
    config['default.topic.config'] = {'auto.offset.reset': 'earliest'}
    # NOTE(review): this rebinds `consumer`; no close() call on the previously
    # created consumer is visible in this part of the notebook.
    consumer = Consumer(config)
    topics = [topic]
    consumer.subscribe(topics)
    # Fetch the topic's schema and convert it to Avro. `consumer` and
    # `avro_schema` are read by the next cell to poll and decode messages.
    json_schema = kafka.get_schema(topic)
    avro_schema = kafka.convert_json_schema_to_avro(json_schema)

In [157]:
# Avro Python is only supported in python 2
# Poll the inference-log topic up to 10 times and validate every message that
# arrives; polls that return nothing are skipped.
if sys.version_info[0] < 3:
    for i in range(10):
        msg = consumer.poll(timeout=1.5)
        if msg is None:
            continue
        event_dict = kafka.parse_avro_msg(msg.value(), avro_schema)
        # Fields every logged inference event must contain.
        for required_field in ("modelName", "requestTimestamp", "servingType", "inferenceResponse"):
            assert required_field in event_dict
        assert event_dict["modelName"] == "mnist"
        assert event_dict["servingType"] == "TENSORFLOW"

##### Test Delete Serving

In [158]:
# Delete both servings by name.
for target_serving in ("IrisFlowerClassifier", "mnist"):
    serving.delete(target_serving)

No serving with name: IrisFlowerClassifier could be found among the list of available servings: 
Traceback (most recent call last):
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 57, in delete
    serving_id = get_id(serving_name)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 507, in get_id
    serving = _find_serving_with_name(serving_name, servings)
  File "/srv/hops/anaconda/anaconda/envs/python36/lib/python3.6/site-packages/hops/serving.py", line 648, in _find_serving_with_name
    "available servings: {}".format(serving_name, serving_names_str))
hops.serving.ServingNotFound: No serving with name: IrisFlowerClassifier could be found among the list of available servings: 



In [159]:
# Neither serving may be reported as existing any more.
for target_serving in ("IrisFlowerClassifier", "mnist"):
    assert not serving.exists(target_serving)

No serving with name IrisFlowerClassifier was found in the project collect
No serving with name mnist was found in the project collect