Skip to content

Commit

Permalink
started working datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
gf712 committed May 10, 2019
1 parent 0500504 commit df8809d
Show file tree
Hide file tree
Showing 2 changed files with 183 additions and 60 deletions.
184 changes: 129 additions & 55 deletions src/shogun/io/OpenMLFlow.cpp
Expand Up @@ -49,6 +49,8 @@ const char* OpenMLReader::list_dataset_qualities = "/data/qualities/{}";
const char* OpenMLReader::list_dataset_filter = "/data/list/{}";
/* FLOW API */
const char* OpenMLReader::flow_file = "/flow/{}";
/* TASK API */
const char* OpenMLReader::task_file = "/task/{}";

const std::unordered_map<std::string, std::string>
OpenMLReader::m_format_options = {{"xml", xml_server},
Expand Down Expand Up @@ -102,10 +104,10 @@ void OpenMLReader::openml_curl_error_helper(CURL* curl_handle, CURLcode code)
#endif // HAVE_CURL

/**
* Checks the returned flow in JSON format
* @param doc the parsed flow
* Checks the returned response from OpenML in JSON format
* @param doc the parsed OpenML JSON format response
*/
static void check_flow_response(Document& doc)
static void check_response(const Document& doc, const std::string& type)
{
if (SG_UNLIKELY(doc.HasMember("error")))
{
Expand All @@ -115,7 +117,9 @@ static void check_flow_response(Document& doc)
root["message"].GetString())
return;
}
REQUIRE(doc.HasMember("flow"), "Unexpected format of OpenML flow.\n");
REQUIRE(
doc.HasMember(type.c_str()), "Unexpected format of OpenML %s.\n",
type.c_str());
}

/**
Expand All @@ -142,8 +146,7 @@ static SG_FORCED_INLINE void emplace_string_to_map(
* @param name the name of the key
*/
static SG_FORCED_INLINE void emplace_string_to_map(
const GenericObject<
true, GenericValue<UTF8<char>>>& v,
const GenericObject<true, GenericValue<UTF8<char>>>& v,
std::unordered_map<std::string, std::string>& param_dict,
const std::string& name)
{
Expand All @@ -167,7 +170,7 @@ std::shared_ptr<OpenMLFlow> OpenMLFlow::download_flow(
auto reader = OpenMLReader(api_key);
auto return_string = reader.get("flow_file", "json", flow_id);
document.Parse(return_string.c_str());
check_flow_response(document);
check_response(document, "flow");

// store root for convenience. We know it exists from previous check.
const Value& root = document["flow"];
Expand Down Expand Up @@ -248,10 +251,63 @@ std::shared_ptr<OpenMLFlow> OpenMLFlow::from_file()
return std::shared_ptr<OpenMLFlow>();
}

/**
 * Downloads the JSON description of an OpenML task and builds an
 * OpenMLTask from it.
 *
 * The response is fetched through OpenMLReader ("task_file" endpoint,
 * "json" format), validated with check_response and then read from its
 * top-level "task" member.
 *
 * @param task_id the id of the task to download
 * @param api_key the API key handed to OpenMLReader
 * @return the task built from the downloaded id, name, type and type id
 */
std::shared_ptr<OpenMLTask>
OpenMLTask::get_dataset(const std::string& task_id, const std::string& api_key)
{
	Document document;
	std::string task_name;
	std::string task_type;
	std::string task_type_id;
	// Not filled in by this function yet: it is passed on
	// default-constructed, i.e. holding two empty shared pointers.
	std::pair<std::shared_ptr<OpenMLData>, std::shared_ptr<OpenMLSplit>>
	    task_descriptor;

	auto reader = OpenMLReader(api_key);
	auto return_string = reader.get("task_file", "json", task_id);

	document.Parse(return_string.c_str());
	check_response(document, "task");

	// store root for convenience. check_response verified that the "task"
	// member exists, so this is the member that has to be read here
	// (reading "flow" would look up a member a task response does not have).
	const Value& root = document["task"];

	REQUIRE(
	    task_id == root["task_id"].GetString(),
	    "Expected downloaded task to have the same id as the requested task "
	    "id.\n")

	task_name = root["task_name"].GetString();
	task_type = root["task_type"].GetString();
	task_type_id = root["task_type_id"].GetString();

	// expect two elements in input array: dataset and split
	const Value& json_input = root["input"];

	REQUIRE(
	    json_input.IsArray(), "Currently the dataset reader can only handle "
	                          "inputs with a dataset and split field")

	auto input_array = json_input.GetArray();
	REQUIRE(
	    input_array.Size() == 2,
	    "Currently the dataset reader can only handle inputs with a dataset "
	    "and split field. Found %d elements.",
	    input_array.Size())

	// handle dataset
	// NOTE(review): the dataset object is fetched but not consumed yet;
	// the split entry (second array element) is not read at all.
	auto json_dataset = input_array[0].GetObject();

	auto result = std::make_shared<OpenMLTask>(
	    task_id, task_name, task_type, task_type_id, task_descriptor);

	return result;
}

/**
* Class using the Any visitor pattern to convert
* a string to a C++ type that can be used as a parameter
* in a Shogun model.
* in a Shogun model. If the string value is not "null" it will
* be put in its casted type in the given model with the provided parameter
* name. If the value is null nothing happens, i.e. no error is thrown
* and no value is put in model.
*/
class StringToShogun : public AnyVisitor
{
Expand All @@ -266,18 +322,20 @@ class StringToShogun : public AnyVisitor

void on(bool* v) final
{
SG_SDEBUG(
"bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
if (!is_null())
{
SG_SDEBUG("bool: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
bool result = strcmp(m_string_val.c_str(), "true") == 0;
m_model->put(m_parameter, result);
}
}
void on(int32_t* v) final
{
SG_SDEBUG(
"int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
if (!is_null())
{
SG_SDEBUG("int32: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
try
{
int32_t result = std::stoi(m_string_val);
Expand All @@ -299,84 +357,98 @@ class StringToShogun : public AnyVisitor
}
void on(int64_t* v) final
{
SG_SDEBUG(
"int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
if (!is_null())
{
SG_SDEBUG("int64: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())

int64_t result = std::stol(m_string_val);
m_model->put(m_parameter, result);
}
}
void on(float* v) final
{
SG_SDEBUG(
"float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
if (!is_null())
{
SG_SDEBUG("float: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
char* end;
float32_t result = std::strtof(m_string_val.c_str(), &end);
float32_t result = std::stof(m_string_val);
m_model->put(m_parameter, result);
}
}
void on(double* v) final
{
SG_SDEBUG(
"double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
if (!is_null())
{
SG_SDEBUG("double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
char* end;
float64_t result = std::strtod(m_string_val.c_str(), &end);
float64_t result = std::stod(m_string_val);
m_model->put(m_parameter, result);
}
}
void on(long double* v)
{
SG_SDEBUG(
"long double: %s=%s\n", m_parameter.c_str(),
m_string_val.c_str())
if (!is_null())
{
SG_SDEBUG("long double: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
char* end;
floatmax_t result = std::strtold(m_string_val.c_str(), &end);
floatmax_t result = std::stold(m_string_val);
m_model->put(m_parameter, result);
}
}
void on(CSGObject** v) final
{
SG_SDEBUG("CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
SG_SDEBUG(
"CSGObject: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
}
void on(SGVector<int>* v) final
{
SG_SDEBUG("SGVector<int>: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
SG_SDEBUG(
"SGVector<int>: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
}
void on(SGVector<float>* v) final
{
SG_SDEBUG("SGVector<float>: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
SG_SDEBUG(
"SGVector<float>: %s=%s\n", m_parameter.c_str(),
m_string_val.c_str())
}
void on(SGVector<double>* v) final
{
SG_SDEBUG("SGVector<double>: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
SG_SDEBUG(
"SGVector<double>: %s=%s\n", m_parameter.c_str(),
m_string_val.c_str())
}
void on(SGMatrix<int>* mat) final
{
SG_SDEBUG("SGMatrix<int>: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
SG_SDEBUG(
"SGMatrix<int>: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
}
void on(SGMatrix<float>* mat) final
{
SG_SDEBUG("SGMatrix<float>: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
}
void on(SGMatrix<double>* mat) final
{
SG_SDEBUG("SGMatrix<double>: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())
SG_SDEBUG(
"SGMatrix<float>: %s=%s\n", m_parameter.c_str(),
m_string_val.c_str())
}

bool is_null()
void on(SGMatrix<double>* mat) final{SG_SDEBUG(
"SGMatrix<double>: %s=%s\n", m_parameter.c_str(), m_string_val.c_str())}

/**
* In OpenML "null" is an empty parameter value field.
* @return whether the field is "null"
*/
SG_FORCED_INLINE bool is_null()
{
bool result = strcmp(m_string_val.c_str(), "null") == 0;
return result;
}

void set_parameter_name(const std::string& name)
SG_FORCED_INLINE void set_parameter_name(const std::string& name)
{
m_parameter = name;
}

void set_string_value(const std::string& value)
SG_FORCED_INLINE void set_string_value(const std::string& value)
{
m_string_val = value;
}
Expand All @@ -396,17 +468,16 @@ class StringToShogun : public AnyVisitor
std::shared_ptr<CSGObject> instantiate_model_from_factory(
const std::string& factory_name, const std::string& algo_name)
{
std::shared_ptr<CSGObject> obj;
if (factory_name == "machine")
obj = std::shared_ptr<CSGObject>(machine(algo_name));
else if (factory_name == "kernel")
obj = std::shared_ptr<CSGObject>(kernel(algo_name));
else if (factory_name == "distance")
obj = std::shared_ptr<CSGObject>(distance(algo_name));
else
SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str())
return std::shared_ptr<CSGObject>(machine(algo_name));
if (factory_name == "kernel")
return std::shared_ptr<CSGObject>(kernel(algo_name));
if (factory_name == "distance")
return std::shared_ptr<CSGObject>(distance(algo_name));

return obj;
SG_SERROR("Unsupported factory \"%s\".\n", factory_name.c_str())

return nullptr;
}

/**
Expand All @@ -426,19 +497,21 @@ void cast_and_put(
// temporary fix until shared_ptr PR merged
auto* tmp_clone = dynamic_cast<CMachine*>(casted_obj->clone());
obj->put(parameter_name, tmp_clone);
return;
}
else if (auto casted_obj = std::dynamic_pointer_cast<CKernel>(nested_obj))
if (auto casted_obj = std::dynamic_pointer_cast<CKernel>(nested_obj))
{
auto* tmp_clone = dynamic_cast<CKernel*>(casted_obj->clone());
obj->put(parameter_name, tmp_clone);
return;
}
else if (auto casted_obj = std::dynamic_pointer_cast<CDistance>(nested_obj))
if (auto casted_obj = std::dynamic_pointer_cast<CDistance>(nested_obj))
{
auto* tmp_clone = dynamic_cast<CDistance*>(casted_obj->clone());
obj->put(parameter_name, tmp_clone);
return;
}
else
SG_SERROR("Could not cast SGObject.\n")
SG_SERROR("Could not cast SGObject.\n")
}

std::shared_ptr<CSGObject> ShogunOpenML::flow_to_model(
Expand All @@ -447,8 +520,8 @@ std::shared_ptr<CSGObject> ShogunOpenML::flow_to_model(
auto params = flow->get_parameters();
auto components = flow->get_components();
auto class_name = get_class_info(flow->get_class_name());
auto module_name = std::get<0>(class_name);
auto algo_name = std::get<1>(class_name);
auto module_name = class_name.first;
auto algo_name = class_name.second;

auto obj = instantiate_model_from_factory(module_name, algo_name);
auto obj_param = obj->get_params();
Expand Down Expand Up @@ -486,12 +559,12 @@ ShogunOpenML::model_to_flow(const std::shared_ptr<CSGObject>& model)
return std::shared_ptr<OpenMLFlow>();
}

std::tuple<std::string, std::string>
std::pair<std::string, std::string>
ShogunOpenML::get_class_info(const std::string& class_name)
{
std::vector<std::string> class_components;
auto begin = class_name.begin();
std::tuple<std::string, std::string> result;
std::pair<std::string, std::string> result;

for (auto it = class_name.begin(); it != class_name.end(); ++it)
{
Expand All @@ -503,15 +576,16 @@ ShogunOpenML::get_class_info(const std::string& class_name)
if (std::next(it) == class_name.end())
class_components.emplace_back(std::string(begin, std::next(it)));
}
if (class_components[0] == "shogun")
result = std::make_tuple(class_components[1], class_components[2]);

if (class_components[0] == "shogun" && class_components.size() == 3)
result = std::make_pair(class_components[1], class_components[2]);
else if (class_components[0] == "shogun" && class_components.size() != 3)
SG_SERROR("Invalid class name format %s.\n", class_name.c_str())
else
SG_SERROR(
"The provided flow is not meant for shogun deserialisation! The "
"required library is \"%s\".\n",
class_components[0].c_str())
if (class_components.size() != 3)
SG_SERROR("Invalid class name format %s.\n", class_name.c_str())

return result;
}

0 comments on commit df8809d

Please sign in to comment.