Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Doc Improve. Replace timeseries_split unit-tests with a simple test #4070

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,16 @@ RegressionLabels labels(f_labels)

#![set parameters]
int num_subsets = 5
int min_subset_size = 4
int future_steps = 4
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd suggest future_offset but that is minor

TimeSeriesSplitting splitting(labels, num_subsets)
splitting.set_min_subset_size(min_subset_size)
splitting.set_min_future_steps(future_steps)
#![set parameters]

#![build subsets]
splitting.build_subsets()
#![build subsets]

#![generate subsets and inverse (aka test labels and train labels)]
IntVector test_labels_indices = splitting.generate_subset_indices(1)
#![generate subsets and inverse (aka vaildation set and train set)]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a tag for the cookbooks, not free form comment. So pls make it super short (see other examples)

IntVector validation_labels_indices = splitting.generate_subset_indices(1)
IntVector train_labels_indices = splitting.generate_subset_inverse(1)
#![generate subsets and inverse (aka test labels and train labels)]
#![generate subsets and inverse (aka vaildation set and train set)]
47 changes: 24 additions & 23 deletions src/shogun/evaluation/TimeSeriesSplitting.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,59 +49,60 @@ CTimeSeriesSplitting::CTimeSeriesSplitting(CLabels* labels, index_t num_subsets)

void CTimeSeriesSplitting::init()
{
m_rng = sg_rand;
m_min_subset_size = 1;
m_min_future_steps = 1;
}

void CTimeSeriesSplitting::build_subsets()
{
REQUIRE(m_num_subsets > 0, "Number of subsets should be greater than 0.");
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing newline


reset_subsets();
m_is_filled = true;

SGVector<index_t> indices(m_labels->get_num_labels());
indices.range_fill();
index_t num_subsets = m_subset_indices->get_num_elements();
index_t split_index;

for (auto i = 0; i < num_subsets; ++i)
for (auto i = 0; i < m_num_subsets; ++i)
{
CDynamicArray<index_t>* current =
(CDynamicArray<index_t>*)m_subset_indices->get_element(i);

if (i == num_subsets - 1)
split_index = indices.vlen - m_min_subset_size;
if (i == m_num_subsets - 1)
split_index = indices.vlen - m_min_future_steps;
else
split_index = (i + 1) * (indices.vlen / num_subsets);
split_index = (i + 1) * (indices.vlen / m_num_subsets);

/* filling current with indices on right end */
/* filling current with indices on right side */
for (auto k = split_index; k < indices.vlen; ++k)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

auto k : Range(indices.vlen) but it is minor

{
current->append_element(indices.vector[k]);
}

SG_UNREF(current);
}

m_subset_indices->shuffle(m_rng);
}

void CTimeSeriesSplitting::set_min_subset_size(index_t min_size)
void CTimeSeriesSplitting::set_min_future_steps(index_t future_steps)
{
index_t num_subsets = m_subset_indices->get_num_elements();
REQUIRE(m_num_subsets > 0, "Number of subsets should be greater than 0.");

index_t num_labels = m_labels->get_num_labels();

/* min_size should be less than difference between number of labels
* and split index of second last split */
REQUIRE(min_size > 0, "Minimum subset size should be atleast 1.")
/* future_steps should be less than the difference between number of labels
* and split index of second last fold */
index_t future_steps_upperbound =
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

max_future_steps
or better max_future_offset

num_labels - (m_num_subsets - 1) * (num_labels / m_num_subsets);

REQUIRE(future_steps > 0, "Future steps should be a positive number.\n")
REQUIRE(
min_size < num_labels - (num_subsets - 1) * (num_labels / num_subsets),
"Minimum subset size can be atmost %d, constrained by number of "
"subsets and labels.",
num_labels - (num_subsets - 1) * (num_labels / num_subsets) - 1);
m_min_subset_size = min_size;
future_steps < future_steps_upperbound,
"Future steps can be atmost %d. Try reducing number of subsets for "
"larger future steps.\n",
future_steps_upperbound - 1);
m_min_future_steps = future_steps;
}

index_t CTimeSeriesSplitting::get_min_subset_size()
index_t CTimeSeriesSplitting::get_min_future_steps()
{
return m_min_subset_size;
return m_min_future_steps;
}
57 changes: 31 additions & 26 deletions src/shogun/evaluation/TimeSeriesSplitting.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,55 +40,60 @@
namespace shogun
{
class CLabels;

/** @brief Implements a timeseries splitting strategy for cross-validation.
* The strategy builds a given number of subsets each filled with labels
* indices greater than a split index. The split indices are \f$ c[N/K] \f$
/** @brief Implements a timeseries splitting strategy for cross-validation,
* respecting time.
* Each fold splits timeseries into train (subset_inverse) and validation
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls remove the (subset_inverse) here. This is documented in the base class
The newlines are awkwards, could you remove them?

* (subset) set.
* Train set contains indices less than a split index
* and validation set contains rest of the indices.
* The split indices are \f$ C*floor(N/K) \f$
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

in latex math, you don't use * for multiplication but \cdot or just nothing

* where \f$N\f$ is number of labels,\f$K\f$ is number of subsets and
* \f$c = 1,2,3,...,K-1\f$. The last split index is \f$ N-h \f$.
* \f$C = 1,2,3,...,K-1\f$. The split index for last fold is \f$ N -
* min_future_steps \f$. Note:
* If forecasting h-step ahead, set min_future_steps to h (default 1).
*
* If forecasting h-step ahead, set minimum subset size to h(Default is 1).
* For example, if \f$h = 2\f$ and \f$K = 2\f$ and \f$A =
* [0,1,2,3,4,5,6,7,8,9]\f$.
* The two folds give the following train and vaildation sets :
*
* Given, \f$h = 2\f$ and \f$K = 2\f$ and \f$A = [0,1,2,3,4,5,6,7,8,9]\f$.
* The two splits are \f$ S1 = [5,6,7,8,9] \f$ and \f$ S2 = [8,9]\f$
* \f$ Train_1 = [0,1,2,3,4], Val_1 = [5,6,7,8,9] \f$ and
* \f$ Train_2 = [0,1,2,3,4,5,6,7], Val_2 = [8,9]\f$
*/

class CTimeSeriesSplitting : public CSplittingStrategy
{
public:
/** constructor */
/** Constructor */
CTimeSeriesSplitting();

/** constructor
/** Constructor
*
* @param labels labels to be (possibly) used for splitting
* @param num_subsets desired number of subsets, the labels are split
* into
* @param labels labels required for the size of timeseries to split
* @param num_folds number of folds
*/
CTimeSeriesSplitting(CLabels* labels, index_t num_subsets);
CTimeSeriesSplitting(CLabels* labels, index_t num_folds);

/** Sets the minimum subset size for subsets. If forecasting h-step
* ahead, set min_size to h.
/** Sets the minimum number of future steps required in validation set
* (default 1).
* If forecasting h-step
* ahead, set future_steps to h.
*
* @param min_size Minimum subset size. */
void set_min_subset_size(index_t min_size);
* @param future_steps Minimum number of future steps. */
void set_min_future_steps(index_t future_steps);

/** @return Minimum subset size. */
index_t get_min_subset_size();
/** @return Minimum number of future steps in validation set. */
index_t get_min_future_steps();

/** @return name of the SGSerializable */
/** @return Name of the SGSerializable */
virtual const char* get_name() const
{
return "TimeSeriesSplitting";
}

void build_subsets() override;

/** Custom rng if using cross validation across different threads */
CRandom* m_rng;

/** The minimum subset size for test set.*/
index_t m_min_subset_size;
/** The minimum number of future_steps in validation set.*/
index_t m_min_future_steps;

private:
void init();
Expand Down
107 changes: 41 additions & 66 deletions tests/unit/evaluation/SplittingStrategy_unittest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
*/

#include <gtest/gtest.h>
#include <shogun/base/range.h>
#include <shogun/evaluation/CrossValidationSplitting.h>
#include <shogun/evaluation/LOOCrossValidationSplitting.h>
#include <shogun/evaluation/StratifiedCrossValidationSplitting.h>
Expand Down Expand Up @@ -297,89 +298,63 @@ TEST(SplittingStrategy,LOO)
}
}

TEST(SplittingStrategy, timeseries_subset_linear_splits)
class TimeSeriesSplitTest : public ::testing::Test
{
index_t num_labels, num_subsets, min_subset_size, base_size;
index_t runs = 10;

while (runs-- > 0)
protected:
TimeSeriesSplitTest()
{
num_labels = CMath::random(50, 150);
num_subsets = CMath::random(1, 5);
min_subset_size = CMath::random(1, 6);
base_size = num_labels / num_subsets;

CRegressionLabels* labels = new CRegressionLabels(num_labels);
for (index_t i = 0; i < num_labels; ++i)
labels->set_label(i, CMath::random(-10.0, 10.0));

CTimeSeriesSplitting* splitting =
new CTimeSeriesSplitting(labels, num_subsets);
num_labels = 15;
num_subsets = 3;
future_steps = 3;

splitting->set_min_subset_size(min_subset_size);
splitting->build_subsets();

for (index_t i = 0; i < num_subsets; ++i)
labels = new CRegressionLabels(num_labels);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

labels = some(num_labels); and then you dont have to SG_UNREF later

for (auto i : range(num_labels))
{
SGVector<index_t> subset = splitting->generate_subset_indices(i);
SGVector<index_t> inverse = splitting->generate_subset_inverse(i);

/* Subset size should be atleat min_subset_size */
EXPECT_GE(subset.vlen, splitting->get_min_subset_size());

/* check the splitting is linear */
EXPECT_TRUE(
inverse.vlen % base_size == 0 ||
inverse.vlen == num_labels - min_subset_size);
labels->set_label(i, i + 1);
}

SG_UNREF(splitting);
SG_REF(labels);
}
}

TEST(SplittingStrategy, timeseries_subsets_future_leak)
{
index_t num_labels, num_subsets, min_subset_size;
index_t runs = 10;

while (runs-- > 0)
~TimeSeriesSplitTest()
{
num_labels = CMath::random(50, 150);
num_subsets = CMath::random(1, 5);
min_subset_size = CMath::random(1, 7);
SG_UNREF(labels);
}

CRegressionLabels* labels = new CRegressionLabels(num_labels);
for (index_t i = 0; i < num_labels; ++i)
labels->set_label(i, CMath::random(-10.0, 10.0));
CRegressionLabels* labels;
index_t num_labels;
index_t num_subsets;
index_t future_steps;
};

CTimeSeriesSplitting* splitting =
new CTimeSeriesSplitting(labels, num_subsets);
TEST_F(TimeSeriesSplitTest, timeseries_splits)
{
CTimeSeriesSplitting* splitting =
new CTimeSeriesSplitting(labels, num_subsets);

splitting->set_min_subset_size(min_subset_size);
splitting->build_subsets();
splitting->set_min_future_steps(future_steps);
splitting->build_subsets();

for (index_t i = 0; i < num_subsets; ++i)
{
SGVector<index_t> subset = splitting->generate_subset_indices(i);
SGVector<index_t> inverse = splitting->generate_subset_inverse(i);
SGVector<index_t> train_indices = splitting->generate_subset_inverse(0);

/* check future leak into test set */
for (index_t j = 0; j < inverse.vlen - 1; ++j)
{
EXPECT_LT(inverse.vector[j], inverse.vector[j + 1]);
}
for (auto i : range(5))
{
EXPECT_EQ(train_indices[i], i);
}

EXPECT_LT(inverse.vector[inverse.vlen - 1], subset.vector[0]);
train_indices = splitting->generate_subset_inverse(1);

for (index_t j = 0; j < subset.vlen - 1; ++j)
{
EXPECT_LT(subset.vector[j], subset.vector[j + 1]);
}
for (auto i : range(10))
{
EXPECT_EQ(train_indices[i], i);
}

EXPECT_EQ(subset.vlen + inverse.vlen, num_labels);
}
train_indices = splitting->generate_subset_inverse(2);

/* clean up */
SG_UNREF(splitting);
for (auto i : range(12))
{
EXPECT_EQ(train_indices[i], i);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yes this is a better test in fact!

}

SG_UNREF(splitting);
}