-
-
Notifications
You must be signed in to change notification settings - Fork 1k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Doc Improve. Replace timeseries_split unit-tests with a simple test #4070
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,16 +6,16 @@ RegressionLabels labels(f_labels) | |
|
||
#![set parameters] | ||
int num_subsets = 5 | ||
int min_subset_size = 4 | ||
int future_steps = 4 | ||
TimeSeriesSplitting splitting(labels, num_subsets) | ||
splitting.set_min_subset_size(min_subset_size) | ||
splitting.set_min_future_steps(future_steps) | ||
#![set parameters] | ||
|
||
#![build subsets] | ||
splitting.build_subsets() | ||
#![build subsets] | ||
|
||
#![generate subsets and inverse (aka test labels and train labels)] | ||
IntVector test_labels_indices = splitting.generate_subset_indices(1) | ||
#![generate subsets and inverse (aka vaildation set and train set)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a tag for the cookbooks, not free form comment. So pls make it super short (see other examples) |
||
IntVector validation_labels_indices = splitting.generate_subset_indices(1) | ||
IntVector train_labels_indices = splitting.generate_subset_inverse(1) | ||
#![generate subsets and inverse (aka test labels and train labels)] | ||
#![generate subsets and inverse (aka vaildation set and train set)] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,59 +49,60 @@ CTimeSeriesSplitting::CTimeSeriesSplitting(CLabels* labels, index_t num_subsets) | |
|
||
void CTimeSeriesSplitting::init() | ||
{ | ||
m_rng = sg_rand; | ||
m_min_subset_size = 1; | ||
m_min_future_steps = 1; | ||
} | ||
|
||
void CTimeSeriesSplitting::build_subsets() | ||
{ | ||
REQUIRE(m_num_subsets > 0, "Number of subsets should be greater than 0."); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Missing newline |
||
|
||
reset_subsets(); | ||
m_is_filled = true; | ||
|
||
SGVector<index_t> indices(m_labels->get_num_labels()); | ||
indices.range_fill(); | ||
index_t num_subsets = m_subset_indices->get_num_elements(); | ||
index_t split_index; | ||
|
||
for (auto i = 0; i < num_subsets; ++i) | ||
for (auto i = 0; i < m_num_subsets; ++i) | ||
{ | ||
CDynamicArray<index_t>* current = | ||
(CDynamicArray<index_t>*)m_subset_indices->get_element(i); | ||
|
||
if (i == num_subsets - 1) | ||
split_index = indices.vlen - m_min_subset_size; | ||
if (i == m_num_subsets - 1) | ||
split_index = indices.vlen - m_min_future_steps; | ||
else | ||
split_index = (i + 1) * (indices.vlen / num_subsets); | ||
split_index = (i + 1) * (indices.vlen / m_num_subsets); | ||
|
||
/* filling current with indices on right end */ | ||
/* filling current with indices on right side */ | ||
for (auto k = split_index; k < indices.vlen; ++k) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
{ | ||
current->append_element(indices.vector[k]); | ||
} | ||
|
||
SG_UNREF(current); | ||
} | ||
|
||
m_subset_indices->shuffle(m_rng); | ||
} | ||
|
||
void CTimeSeriesSplitting::set_min_subset_size(index_t min_size) | ||
void CTimeSeriesSplitting::set_min_future_steps(index_t future_steps) | ||
{ | ||
index_t num_subsets = m_subset_indices->get_num_elements(); | ||
REQUIRE(m_num_subsets > 0, "Number of subsets should be greater than 0."); | ||
|
||
index_t num_labels = m_labels->get_num_labels(); | ||
|
||
/* min_size should be less than difference between number of labels | ||
* and split index of second last split */ | ||
REQUIRE(min_size > 0, "Minimum subset size should be atleast 1.") | ||
/* future_steps should be less than the difference between number of labels | ||
* and split index of second last fold */ | ||
index_t future_steps_upperbound = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. max_future_steps |
||
num_labels - (m_num_subsets - 1) * (num_labels / m_num_subsets); | ||
|
||
REQUIRE(future_steps > 0, "Future steps should be a positive number.\n") | ||
REQUIRE( | ||
min_size < num_labels - (num_subsets - 1) * (num_labels / num_subsets), | ||
"Minimum subset size can be atmost %d, constrained by number of " | ||
"subsets and labels.", | ||
num_labels - (num_subsets - 1) * (num_labels / num_subsets) - 1); | ||
m_min_subset_size = min_size; | ||
future_steps < future_steps_upperbound, | ||
"Future steps can be atmost %d. Try reducing number of subsets for " | ||
"larger future steps.\n", | ||
future_steps_upperbound - 1); | ||
m_min_future_steps = future_steps; | ||
} | ||
|
||
index_t CTimeSeriesSplitting::get_min_subset_size() | ||
index_t CTimeSeriesSplitting::get_min_future_steps() | ||
{ | ||
return m_min_subset_size; | ||
return m_min_future_steps; | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,55 +40,60 @@ | |
namespace shogun | ||
{ | ||
class CLabels; | ||
|
||
/** @brief Implements a timeseries splitting strategy for cross-validation. | ||
* The strategy builds a given number of subsets each filled with labels | ||
* indices greater than a split index. The split indices are \f$ c[N/K] \f$ | ||
/** @brief Implements a timeseries splitting strategy for cross-validation, | ||
* respecting time. | ||
* Each fold splits timeseries into train (subset_inverse) and validation | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. pls remove the (subset_inverse) here. This is documented in the base class |
||
* (subset) set. | ||
* Train set contains indices less than a split index | ||
* and validation set contains rest of the indices. | ||
* The split indices are \f$ C*floor(N/K) \f$ | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. in latex math, you don't use * for multiplication but \cdot or just nothing |
||
* where \f$N\f$ is number of labels,\f$K\f$ is number of subsets and | ||
* \f$c = 1,2,3,...,K-1\f$. The last split index is \f$ N-h \f$. | ||
* \f$C = 1,2,3,...,K-1\f$. The split index for last fold is \f$ N - | ||
* min_future_steps \f$. Note: | ||
* If forecasting h-step ahead, set min_future_steps to h (default 1). | ||
* | ||
* If forecasting h-step ahead, set minimum subset size to h(Default is 1). | ||
* For example, if \f$h = 2\f$ and \f$K = 2\f$ and \f$A = | ||
* [0,1,2,3,4,5,6,7,8,9]\f$. | ||
* The two folds give the following train and vaildation sets : | ||
* | ||
* Given, \f$h = 2\f$ and \f$K = 2\f$ and \f$A = [0,1,2,3,4,5,6,7,8,9]\f$. | ||
* The two splits are \f$ S1 = [5,6,7,8,9] \f$ and \f$ S2 = [8,9]\f$ | ||
* \f$ Train_1 = [0,1,2,3,4], Val_1 = [5,6,7,8,9] \f$ and | ||
* \f$ Train_2 = [0,1,2,3,4,5,6,7], Val_2 = [8,9]\f$ | ||
*/ | ||
|
||
class CTimeSeriesSplitting : public CSplittingStrategy | ||
{ | ||
public: | ||
/** constructor */ | ||
/** Constructor */ | ||
CTimeSeriesSplitting(); | ||
|
||
/** constructor | ||
/** Constructor | ||
* | ||
* @param labels labels to be (possibly) used for splitting | ||
* @param num_subsets desired number of subsets, the labels are split | ||
* into | ||
* @param labels labels required for the size of timeseries to split | ||
* @param num_folds number of folds | ||
*/ | ||
CTimeSeriesSplitting(CLabels* labels, index_t num_subsets); | ||
CTimeSeriesSplitting(CLabels* labels, index_t num_folds); | ||
|
||
/** Sets the minimum subset size for subsets. If forecasting h-step | ||
* ahead, set min_size to h. | ||
/** Sets the minimum number of future steps required in validation set | ||
* (default 1). | ||
* If forecasting h-step | ||
* ahead, set future_steps to h. | ||
* | ||
* @param min_size Minimum subset size. */ | ||
void set_min_subset_size(index_t min_size); | ||
* @param future_steps Minimum number of future steps. */ | ||
void set_min_future_steps(index_t future_steps); | ||
|
||
/** @return Minimum subset size. */ | ||
index_t get_min_subset_size(); | ||
/** @return Minimum number of future steps in validation set. */ | ||
index_t get_min_future_steps(); | ||
|
||
/** @return name of the SGSerializable */ | ||
/** @return Name of the SGSerializable */ | ||
virtual const char* get_name() const | ||
{ | ||
return "TimeSeriesSplitting"; | ||
} | ||
|
||
void build_subsets() override; | ||
|
||
/** Custom rng if using cross validation across different threads */ | ||
CRandom* m_rng; | ||
|
||
/** The minimum subset size for test set.*/ | ||
index_t m_min_subset_size; | ||
/** The minimum number of future_steps in validation set.*/ | ||
index_t m_min_future_steps; | ||
|
||
private: | ||
void init(); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
*/ | ||
|
||
#include <gtest/gtest.h> | ||
#include <shogun/base/range.h> | ||
#include <shogun/evaluation/CrossValidationSplitting.h> | ||
#include <shogun/evaluation/LOOCrossValidationSplitting.h> | ||
#include <shogun/evaluation/StratifiedCrossValidationSplitting.h> | ||
|
@@ -297,89 +298,63 @@ TEST(SplittingStrategy,LOO) | |
} | ||
} | ||
|
||
TEST(SplittingStrategy, timeseries_subset_linear_splits) | ||
class TimeSeriesSplitTest : public ::testing::Test | ||
{ | ||
index_t num_labels, num_subsets, min_subset_size, base_size; | ||
index_t runs = 10; | ||
|
||
while (runs-- > 0) | ||
protected: | ||
TimeSeriesSplitTest() | ||
{ | ||
num_labels = CMath::random(50, 150); | ||
num_subsets = CMath::random(1, 5); | ||
min_subset_size = CMath::random(1, 6); | ||
base_size = num_labels / num_subsets; | ||
|
||
CRegressionLabels* labels = new CRegressionLabels(num_labels); | ||
for (index_t i = 0; i < num_labels; ++i) | ||
labels->set_label(i, CMath::random(-10.0, 10.0)); | ||
|
||
CTimeSeriesSplitting* splitting = | ||
new CTimeSeriesSplitting(labels, num_subsets); | ||
num_labels = 15; | ||
num_subsets = 3; | ||
future_steps = 3; | ||
|
||
splitting->set_min_subset_size(min_subset_size); | ||
splitting->build_subsets(); | ||
|
||
for (index_t i = 0; i < num_subsets; ++i) | ||
labels = new CRegressionLabels(num_labels); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. labels = some(num_labels); and then you dont have to SG_UNREF later |
||
for (auto i : range(num_labels)) | ||
{ | ||
SGVector<index_t> subset = splitting->generate_subset_indices(i); | ||
SGVector<index_t> inverse = splitting->generate_subset_inverse(i); | ||
|
||
/* Subset size should be atleat min_subset_size */ | ||
EXPECT_GE(subset.vlen, splitting->get_min_subset_size()); | ||
|
||
/* check the splitting is linear */ | ||
EXPECT_TRUE( | ||
inverse.vlen % base_size == 0 || | ||
inverse.vlen == num_labels - min_subset_size); | ||
labels->set_label(i, i + 1); | ||
} | ||
|
||
SG_UNREF(splitting); | ||
SG_REF(labels); | ||
} | ||
} | ||
|
||
TEST(SplittingStrategy, timeseries_subsets_future_leak) | ||
{ | ||
index_t num_labels, num_subsets, min_subset_size; | ||
index_t runs = 10; | ||
|
||
while (runs-- > 0) | ||
~TimeSeriesSplitTest() | ||
{ | ||
num_labels = CMath::random(50, 150); | ||
num_subsets = CMath::random(1, 5); | ||
min_subset_size = CMath::random(1, 7); | ||
SG_UNREF(labels); | ||
} | ||
|
||
CRegressionLabels* labels = new CRegressionLabels(num_labels); | ||
for (index_t i = 0; i < num_labels; ++i) | ||
labels->set_label(i, CMath::random(-10.0, 10.0)); | ||
CRegressionLabels* labels; | ||
index_t num_labels; | ||
index_t num_subsets; | ||
index_t future_steps; | ||
}; | ||
|
||
CTimeSeriesSplitting* splitting = | ||
new CTimeSeriesSplitting(labels, num_subsets); | ||
TEST_F(TimeSeriesSplitTest, timeseries_splits) | ||
{ | ||
CTimeSeriesSplitting* splitting = | ||
new CTimeSeriesSplitting(labels, num_subsets); | ||
|
||
splitting->set_min_subset_size(min_subset_size); | ||
splitting->build_subsets(); | ||
splitting->set_min_future_steps(future_steps); | ||
splitting->build_subsets(); | ||
|
||
for (index_t i = 0; i < num_subsets; ++i) | ||
{ | ||
SGVector<index_t> subset = splitting->generate_subset_indices(i); | ||
SGVector<index_t> inverse = splitting->generate_subset_inverse(i); | ||
SGVector<index_t> train_indices = splitting->generate_subset_inverse(0); | ||
|
||
/* check future leak into test set */ | ||
for (index_t j = 0; j < inverse.vlen - 1; ++j) | ||
{ | ||
EXPECT_LT(inverse.vector[j], inverse.vector[j + 1]); | ||
} | ||
for (auto i : range(5)) | ||
{ | ||
EXPECT_EQ(train_indices[i], i); | ||
} | ||
|
||
EXPECT_LT(inverse.vector[inverse.vlen - 1], subset.vector[0]); | ||
train_indices = splitting->generate_subset_inverse(1); | ||
|
||
for (index_t j = 0; j < subset.vlen - 1; ++j) | ||
{ | ||
EXPECT_LT(subset.vector[j], subset.vector[j + 1]); | ||
} | ||
for (auto i : range(10)) | ||
{ | ||
EXPECT_EQ(train_indices[i], i); | ||
} | ||
|
||
EXPECT_EQ(subset.vlen + inverse.vlen, num_labels); | ||
} | ||
train_indices = splitting->generate_subset_inverse(2); | ||
|
||
/* clean up */ | ||
SG_UNREF(splitting); | ||
for (auto i : range(12)) | ||
{ | ||
EXPECT_EQ(train_indices[i], i); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes this is a better test in fact! |
||
} | ||
|
||
SG_UNREF(splitting); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd suggest
future_offset
but that is minor