-
-
Notifications
You must be signed in to change notification settings - Fork 1k
/
DataManager.h
271 lines (237 loc) · 8.66 KB
/
DataManager.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
/*
* Copyright (c) The Shogun Machine Learning Toolbox
* Written (w) 2014 - 2016 Soumyajit De
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* The views and conclusions contained in the software and documentation are those
* of the authors and should not be interpreted as representing official policies,
* either expressed or implied, of the Shogun Development Team.
*/
#ifndef DATA_MANAGER_H__
#define DATA_MANAGER_H__
#include <vector>
#include <memory>
#include <shogun/statistical_testing/internals/InitPerFeature.h>
#include <shogun/lib/common.h>
namespace shogun
{
class CFeatures;
namespace internal
{
class DataFetcher;
class NextSamples;
/**
* @brief Class DataManager for fetching/streaming test data block-wise.
* It can handle data coming from multiple sources. The number of data
* sources is represented by the num_distributions parameter in the constructor
 * of the data manager. It can handle heterogeneous data sources, and it can
* stream multiple blocks per burst, as the computation would require. The size
* of the blocks and the number of blocks to be fetched per burst can be set
* externally.
*
* This class is designed to be used on a stack. An instance of DataManager
 * should not be serialized, copied or moved around. In Shogun, it is most
 * useful when used just inside the implementation of a PIMPL.
*/
class DataManager
{
public:
	/**
	 * Default constructor.
	 *
	 * @param num_distributions The number of data sources (i.e. CFeatures objects).
	 */
	DataManager(size_t num_distributions);

	/**
	 * Disabled copy constructor (instances are meant to live on the stack
	 * inside a PIMPL and must not be copied).
	 * @param other other instance
	 */
	DataManager(const DataManager& other) = delete;

	/**
	 * Disabled assignment operator.
	 * @param other other instance
	 */
	DataManager& operator=(const DataManager& other) = delete;

	/**
	 * Destructor.
	 */
	~DataManager();

	/**
	 * Sets the blocksize for block-wise data fetching. It divides the block-size
	 * per data source according to the total number of feature vectors available
	 * from that source. More formally, if there are \f$K\f$ data sources, \f$X_k\f$,
	 * \f$k=[1,K]\f$, with number of feature vectors \f$n_{X_k}\f$ from each, then
	 * setting a block-size of \f$B\f$ would mean that in each next() call of the
	 * data manager instance, it will fetch \f$\rho_{X_k} B\f$ samples from each
	 * \f$X_k\f$, where \f$\rho_{X_k}=n_{X_k}/n\f$, \f$n=\sum_k n_{X_k}\f$.
	 *
	 * @param blocksize The size of the block consisting of data from all the sources.
	 */
	void set_blocksize(index_t blocksize);

	/**
	 * In order to speed up the computation, usually a number of blocks are fetched at
	 * once per next() call. This method sets that number.
	 *
	 * @param num_blocks_per_burst The number of blocks to be fetched in a burst.
	 */
	void set_num_blocks_per_burst(index_t num_blocks_per_burst);

	/**
	 * Setter for feature object as a data source. Since multiple data sources are
	 * supported, this method takes an index in which the feature object is set.
	 * Internally, it initializes a data fetcher object for the provided feature
	 * object.
	 *
	 * Example usage:
	 * @code
	 *
	 * DataManager data_mgr;
	 * // feats_0 = some CFeatures instance
	 * // feats_1 = some CFeatures instance
	 * data_mgr.samples_at(0) = feats_0;
	 * data_mgr.samples_at(1) = feats_1;
	 *
	 * @endcode
	 *
	 * @param i The data source index, at which the feature object is to be set as a
	 * data source.
	 * @return An initializer for the specified data source (that sets up a fetcher
	 * for this feature), to be used as lvalue.
	 */
	InitPerFeature samples_at(size_t i);

	/**
	 * Getter for feature object at a given data source index.
	 *
	 * @param i The data source index, from which the feature object is to be obtained.
	 * @return The underlying CFeatures object at the specified data source.
	 */
	CFeatures* samples_at(size_t i) const;

	/**
	 * Setter for the number of samples. Setting this number is mandatory for
	 * streaming features. For other type of feature objects, this number equals
	 * the number of vectors, and is set internally.
	 *
	 * Example usage:
	 * @code
	 *
	 * DataManager data_mgr;
	 * data_mgr.num_samples_at(0) = 10;
	 * data_mgr.num_samples_at(1) = 15;
	 *
	 * @endcode
	 *
	 * @param i The data source index, at which the number of samples is to be set.
	 * @return A reference for the number of samples for the specified data source
	 * to be used as lvalue.
	 */
	index_t& num_samples_at(size_t i);

	/**
	 * Getter for the number of samples.
	 *
	 * Note: returns by value; the top-level const that used to qualify the
	 * return type was meaningless (discarded by the language) and has been dropped.
	 *
	 * @param i The data source index, from which the number of samples is to be obtained.
	 * @return The number of samples for the specified data source.
	 */
	index_t num_samples_at(size_t i) const;

	/**
	 * Getter for the number of samples from a specified data source in a block.
	 *
	 * @param i The data source index.
	 * @return The number of samples from i-th data source in a block.
	 */
	index_t blocksize_at(size_t i) const;

	/**
	 * @return True if block-wise fetching is on, False otherwise.
	 */
	bool is_blockwise() const;

	/**
	 * Turns on blockwise fetching if True is passed. Turns off blockwise fetching if
	 * False is passed. The blockwise details are not destroyed when set to False, i.e.
	 * turning blockwise fetching back on again, we can get blocks as we would have got
	 * in the original setup.
	 */
	void set_blockwise(bool blockwise);

	/**
	 * @return Total number of samples that can be fetched from all the data sources.
	 */
	index_t get_num_samples() const;

	/**
	 * @return The minimum block-size that can be fetched from the specified data sources.
	 * For example, if there are two data sources, with samples 20 and 30, respectively,
	 * then minimum blocksize can be 5 (2 from 1st data source, 3 from the 2nd), and there
	 * can be then 10 such blocks.
	 */
	index_t get_min_blocksize() const;

	/**
	 * @param train_test_ratio The split ratio for train-test data. The default value is 0
	 * which means that all of the data would be used for testing.
	 */
	void set_train_test_ratio(float64_t train_test_ratio);

	/**
	 * @return The split ratio for train-test data. The default value is 0, which means
	 * that all of the data would be used for testing.
	 */
	float64_t get_train_test_ratio() const;

	/**
	 * @param train_mode If set to true, then the training data would be returned by the data
	 * fetching API of this data manager. Otherwise, test data would be returned.
	 */
	void set_train_mode(bool train_mode);

	/**
	 * @param xvalidation_mode If set to true, then the data would be split in N fold (the value
	 * of N is determined from the train_test_ratio).
	 */
	void set_xvalidation_mode(bool xvalidation_mode);

	/**
	 * @return The number of folds that can be used based on the train-test ratio. Returns
	 * an integer if xvalidation mode is ON, 0 otherwise.
	 */
	index_t get_num_folds() const;

	/**
	 * @param idx The index of the fold in X-validation scenario, has to be within the range of
	 * \f$[0, N)\f$, where N is the number of folds as returned by get_num_folds() method.
	 */
	void use_fold(index_t idx);

	/**
	 * Call this method before fetching the data from the data manager.
	 */
	void start();

	/**
	 * @return The next bunch of blocks fetched at any given burst.
	 */
	NextSamples next();

	/**
	 * Call this method after fetching the data is done.
	 */
	void end();

	/**
	 * Resets the fetchers to the initial states.
	 */
	void reset();

private:
	/**
	 * The internal data fetcher instances, one per data source.
	 */
	std::vector<std::unique_ptr<DataFetcher>> fetchers;
};
}
}
#endif // DATA_MANAGER_H__