Skip to content

Commit

Permalink
fixes for new arff api
Browse files Browse the repository at this point in the history
  • Loading branch information
gf712 committed May 24, 2019
1 parent d06fe2f commit b045a8a
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 62 deletions.
66 changes: 37 additions & 29 deletions src/shogun/io/openml/OpenMLData.cpp
Expand Up @@ -23,7 +23,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key)
auto reader = OpenMLFile(api_key);
auto return_string = reader.get("dataset_description", "json", id);

auto& dataset_description = check_response<BACKEND_FORMAT::JSON>(return_string, "data_set_description");
auto& dataset_description = check_response<BACKEND_FORMAT::JSON>(
return_string, "data_set_description");

auto name = return_if_possible<std::string>(
"name", dataset_description.GetObject());
Expand Down Expand Up @@ -77,7 +78,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key)
param_vector;
return_string = reader.get("data_features", "json", id);

auto& dataset_features = check_response<BACKEND_FORMAT::JSON>(return_string, "data_features");
auto& dataset_features =
check_response<BACKEND_FORMAT::JSON>(return_string, "data_features");

for (const auto& param : dataset_features["feature"].GetArray())
{
Expand All @@ -100,7 +102,8 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key)
std::vector<std::unordered_map<std::string, std::string>> qualities_vector;
return_string = reader.get("data_qualities", "json", id);

auto& data_qualities = check_response<BACKEND_FORMAT::JSON>(return_string, "data_qualities");
auto& data_qualities =
check_response<BACKEND_FORMAT::JSON>(return_string, "data_qualities");

for (const auto& param : data_qualities["quality"].GetArray())
{
Expand Down Expand Up @@ -129,14 +132,16 @@ OpenMLData::get_dataset(const std::string& id, const std::string& api_key)

std::shared_ptr<CFeatures> OpenMLData::get_features() noexcept
{
if (!m_cached_features)
get_data();
return m_cached_features;
// if (!m_cached_features)
// get_data();
// return m_cached_features;
SG_SNOTIMPLEMENTED
return nullptr;
}

std::shared_ptr<CFeatures> OpenMLData::get_features(const std::string& label)
{
if (!m_cached_features)
if (m_cached_features.empty())
get_data();
auto find_label =
std::find(m_feature_names.begin(), m_feature_names.end(), label);
Expand All @@ -147,39 +152,42 @@ std::shared_ptr<CFeatures> OpenMLData::get_features(const std::string& label)
feat_type_copy.erase(feat_type_copy.begin() + col_idx);
for (const auto type : feat_type_copy)
{
if (type == ARFFDeserializer::Attribute::STRING)
if (type == Attribute::STRING)
SG_SNOTIMPLEMENTED
}
// auto result = std::make_shared<CDenseFeatures>();

std::shared_ptr<CDenseFeatures<float64_t>> result;
bool first = true;
size_t n_examples = 0;
for (int i = 0; i < m_feature_types.size(); ++i)
{
if (i != col_idx && first)
{
result.reset(m_cached_features->get_feature_obj(i)
->as<CDenseFeatures<float64_t>>());
result.reset(m_cached_features[0]->as<CDenseFeatures<float64_t>>());
n_examples = result->get_num_vectors();
first = false;
}
if (i != col_idx)
result.reset(
result
->create_merged_copy(m_cached_features->get_feature_obj(i))
->as<CDenseFeatures<float64_t>>());
{
REQUIRE(
n_examples == m_cached_features[i]->get_num_vectors(),
"Expected all features to have the same number of examples!\n")
result.reset(result->create_merged_copy(m_cached_features[i].get())
->as<CDenseFeatures<float64_t>>());
}
}

REQUIRE(n_examples != 0, "No features extracted!\n")

// need to copy the data, as result only lives on the stack and its data
// will be gone at the end of the function
auto* copy_feat = SG_MALLOC(
float64_t,
m_feature_types.size() * m_cached_features->get_num_vectors());
auto* copy_feat = SG_MALLOC(float64_t, m_feature_types.size() * n_examples);
memcpy(
copy_feat, result->get_feature_matrix().data(),
m_feature_types.size() * m_cached_features->get_num_vectors());
m_feature_types.size() * m_cached_features.size());

result = std::make_shared<CDenseFeatures<float64_t>>(
copy_feat, m_feature_types.size(),
m_cached_features->get_num_vectors());
copy_feat, m_feature_types.size(), n_examples);

return result;
}
Expand All @@ -197,7 +205,7 @@ std::shared_ptr<CLabels> OpenMLData::get_labels(const std::string& label_name)
if (m_cached_labels && label_name == m_cached_label_name)
return m_cached_labels;

if (!m_cached_features)
if (m_cached_features.empty())
get_data();

auto find_label =
Expand All @@ -207,16 +215,16 @@ std::shared_ptr<CLabels> OpenMLData::get_labels(const std::string& label_name)
"Requested label \"%s\" not in the dataset!\n", label_name.c_str())
auto col_idx = std::distance(m_feature_names.begin(), find_label);

auto target_label_as_feat =
std::shared_ptr<CFeatures>(m_cached_features->get_feature_obj(col_idx));
std::shared_ptr<CFeatures> target_label_as_feat =
m_cached_features[col_idx];

switch (m_feature_types[col_idx])
{
// real features
case ARFFDeserializer::Attribute::REAL:
case ARFFDeserializer::Attribute::NUMERIC:
case ARFFDeserializer::Attribute::INTEGER:
case ARFFDeserializer::Attribute::DATE:
case Attribute::REAL:
case Attribute::NUMERIC:
case Attribute::INTEGER:
case Attribute::DATE:
{
auto casted_feat = std::dynamic_pointer_cast<CDenseFeatures<float64_t>>(
target_label_as_feat);
Expand All @@ -228,7 +236,7 @@ std::shared_ptr<CLabels> OpenMLData::get_labels(const std::string& label_name)
}
break;
// nominal features
case ARFFDeserializer::Attribute::NOMINAL:
case Attribute::NOMINAL:
{
auto casted_feat = std::dynamic_pointer_cast<CDenseFeatures<float64_t>>(
target_label_as_feat);
Expand Down
17 changes: 7 additions & 10 deletions src/shogun/io/openml/OpenMLData.h
Expand Up @@ -11,8 +11,8 @@
#include <shogun/io/ARFFFile.h>

#include <string>
#include <vector>
#include <unordered_map>
#include <vector>

namespace shogun
{
Expand Down Expand Up @@ -90,7 +90,6 @@ namespace shogun
*/
std::shared_ptr<CLabels> get_labels();


/**
* Returns the dataset labels given the label_name
* @return the labels
Expand All @@ -101,16 +100,14 @@ namespace shogun
* Returns the type of all attributes/features in the ARFF file
* @return a vector holding the type of each attribute
*/
SG_FORCED_INLINE std::vector<ARFFDeserializer::Attribute>

get_feature_types() const noexcept
SG_FORCED_INLINE std::vector<Attribute> get_feature_types() const
noexcept
{
return m_feature_types;
}

SG_FORCED_INLINE std::string

get_default_target_attribute() const noexcept
SG_FORCED_INLINE std::string get_default_target_attribute() const
noexcept
{
return m_default_target_attribute;
}
Expand Down Expand Up @@ -153,9 +150,9 @@ namespace shogun
m_param_qualities;
std::string m_api_key;

std::shared_ptr<CCombinedFeatures> m_cached_features;
std::vector<std::shared_ptr<CFeatures>> m_cached_features;
std::vector<std::string> m_feature_names;
std::vector<ARFFDeserializer::Attribute> m_feature_types;
std::vector<Attribute> m_feature_types;
std::shared_ptr<CLabels> m_cached_labels;
std::string m_cached_label_name;
};
Expand Down
16 changes: 8 additions & 8 deletions src/shogun/io/openml/OpenMLRun.cpp
Expand Up @@ -88,22 +88,22 @@ std::shared_ptr<OpenMLRun> OpenMLRun::run_flow_on_task(
SG_SDEBUG("End of openml run: %s\n", xval_storage->to_string().c_str());

return std::make_shared<OpenMLRun>(
nullptr, // uploader
nullptr, // uploader_name
nullptr, // setup_id
nullptr, // setup_string
nullptr, // parameter_settings
std::string{}, // uploader
std::string{}, // uploader_name
std::string{}, // setup_id
std::string{}, // setup_string
std::string{}, // parameter_settings
std::vector<float64_t>{}, // evaluations
std::vector<float64_t>{}, // fold_evaluations
std::vector<float64_t>{}, // sample_evaluations
nullptr, // data_content
std::string{}, // data_content
std::vector<std::string>{}, // output_files
task, // task
flow, // flow
nullptr, // run_id
std::string{}, // run_id
model, // model
std::vector<std::string>{}, // tags
nullptr // predictions_url
std::string{} // predictions_url
);
}

Expand Down
21 changes: 6 additions & 15 deletions src/shogun/io/openml/OpenMLSplit.cpp
Expand Up @@ -29,27 +29,18 @@ OpenMLSplit::get_split(const std::string& split_url, const std::string& api_key)
arff_parser.read();
auto arff_features = arff_parser.get_features();
REQUIRE(
arff_features->get_num_feature_obj() == 4,
arff_features.size() == 4,
"Expected a ARFF file with 4 attributes: type, rowid, repeat and "
"fold.\n")

auto train_test_feat =
std::shared_ptr<CFeatures>(arff_features->get_feature_obj(0));
auto rowid_feat =
std::shared_ptr<CFeatures>(arff_features->get_feature_obj(1));
auto repeat_feat =
std::shared_ptr<CFeatures>(arff_features->get_feature_obj(2));
auto fold_feat =
std::shared_ptr<CFeatures>(arff_features->get_feature_obj(3));

auto type_vector = nominal_feature_to_vector(train_test_feat);
auto rowid_vector = dense_feature_to_vector(rowid_feat);
auto repeat_vector = dense_feature_to_vector(repeat_feat);
auto fold_vector = dense_feature_to_vector(fold_feat);
auto type_vector = nominal_feature_to_vector(arff_features[0]);
auto rowid_vector = dense_feature_to_vector(arff_features[1]);
auto repeat_vector = dense_feature_to_vector(arff_features[2]);
auto fold_vector = dense_feature_to_vector(arff_features[3]);

std::array<std::vector<int32_t>, 3> train_idx, test_idx;

for (int i = 0; i < train_test_feat->get_num_vectors(); ++i)
for (int i = 0; i < arff_features[0]->get_num_vectors(); ++i)
{
if (type_vector[i] == LabelType::TRAIN)
{
Expand Down

0 comments on commit b045a8a

Please sign in to comment.