-
-
Notifications
You must be signed in to change notification settings - Fork 1k
/
Features.h
365 lines (317 loc) · 10.2 KB
/
Features.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
/*
* This software is distributed under BSD 3-clause license (see LICENSE file).
*
* Authors: Heiko Strathmann, Soeren Sonnenburg, Sergey Lisitsyn,
* Saurabh Mahindre, Evgeniy Andreev, Wu Lin, Vladislav Horbatiuk,
* Yuyu Zhang, Bjoern Esser, Soumyajit De
*/
#ifndef _CFEATURES__H__
#define _CFEATURES__H__
#include <shogun/lib/config.h>
#include <shogun/lib/common.h>
#include <shogun/io/File.h>
#include <shogun/base/SGObject.h>
#include <shogun/preprocessor/Preprocessor.h>
#include <shogun/features/FeatureTypes.h>
#include <shogun/features/SubsetStack.h>
#include <shogun/lib/List.h>
#include <shogun/lib/DynamicObjectArray.h>
#include <shogun/lib/DynamicArray.h>
#include <shogun/base/range.h>
// Forward declarations of shogun classes, so that pointers to them can be
// named without relying on their full definitions being visible here.
namespace shogun
{
	class CKernel;
	class CPreprocessor;
	class CFile;
}
namespace shogun
{
/** @brief The class Features is the base class of all feature objects.
*
* It can be understood as a dense real valued feature matrix (with e.g.
* columns as single feature vectors), a set of strings, graphs or any other
* arbitrary collection of objects. As a result this class is kept very general
* and implements only very weak interfaces to
*
* - duplicate the Feature object
* - obtain the feature type (like F_DREAL, F_SHORT ...)
* - obtain the feature class (like Simple dense matrices, sparse or strings)
* - obtain the number of feature "vectors"
*
* In addition it provides helpers to check e.g. for compatibility of feature objects.
*
* Currently there are 3 general feature classes, which are CDenseFeatures
* (dense matrices), CSparseFeatures (sparse matrices), CStringFeatures (a
* set of strings) from which all the specific features like CDenseFeatures<float64_t>
* (dense real valued feature matrices) are derived.
*
*
* (Multiple) Subsets (of subsets) are supported.
* Sub-classes may want to overwrite the subset_changed_post() method which is
* called automatically after each subset change. See method documentations to
* see how behaviour is changed when subsets are active.
* A subset is put onto a stack using the add_subset() method. The last added
* subset may be removed via remove_subset(). There is also the possibility to
* add subsets in place (this only stores one index vector in memory as opposed
* to many when add_subset() is called many times) with add_subset_in_place().
* The latter does not allow such modifications to be removed one-by-one.
*/
class CFeatures : public CSGObject
{
public:
/** constructor
*
* @param size cache size
*/
CFeatures(int32_t size=0);
/** copy constructor */
CFeatures(const CFeatures& orig);
/** constructor
*
* @param loader File object via which data shall be loaded
*/
CFeatures(CFile* loader);
/** duplicate feature object
*
* abstract base method
*
* @return feature object
*/
virtual CFeatures* duplicate() const=0;
virtual ~CFeatures();
/** get feature type
*
* abstract base method
*
* @return templated feature type
*/
virtual EFeatureType get_feature_type() const=0;
/** get feature class
*
* abstract base method
*
* @return feature class like STRING, SIMPLE, SPARSE...
*/
virtual EFeatureClass get_feature_class() const=0;
#ifndef SWIG
/** returns an iterator of indices
* from 0 to @ref CFeatures::get_num_vectors
*
* Should be used in algorithms in the following way:
* @code
* for (auto idx : features->index_iterator()) { ... }
* @endcode
*
*/
virtual Range<int32_t> index_iterator() const
{
return range(0, get_num_vectors());
}
#endif
/** add preprocessor
*
* @param p preprocessor to set
*/
virtual void add_preprocessor(CPreprocessor* p);
/** delete preprocessor from list
*
* @param num index of preprocessor in list
*/
virtual void del_preprocessor(int32_t num);
/** get specified preprocessor
*
* @param num index of preprocessor in list
*/
CPreprocessor* get_preprocessor(int32_t num) const;
/** get number of preprocessors
*
* @return number of preprocessors
*/
int32_t get_num_preprocessors() const;
/** clears all preprocs */
void clean_preprocessors();
/** print preprocessors */
void list_preprocessors();
/** get cache size
*
* @return cache size
*/
int32_t get_cache_size() const;
/** get number of examples/vectors, possibly corresponding to the current subset
*
* abstract base method
*
* @return number of examples/vectors (possibly of subset, if implemented)
*/
virtual int32_t get_num_vectors() const=0;
/** in case there is a feature matrix allow for reshaping
*
* NOT IMPLEMENTED!
*
* @param num_features new number of features
* @param num_vectors new number of vectors
* @return if reshaping was successful
*/
virtual bool reshape(int32_t num_features, int32_t num_vectors);
/** list feature object */
void list_feature_obj() const;
/** load features from file
*
* @param loader File object via which data shall be loaded
*/
virtual void load(CFile* loader);
/** save features to file
*
* @param writer File object via which data shall be saved
*/
virtual void save(CFile* writer);
/** check feature compatibility
*
* @param f features to check for compatibility
* @return if features are compatible
*/
bool check_feature_compatibility(CFeatures* f) const;
/** check if features have given property
*
* @param p feature property
* @return if features have given property
*/
bool has_property(EFeatureProperty p) const;
/** set property
*
* @param p kernel property to set
*/
void set_property(EFeatureProperty p);
/** unset property
*
* @param p kernel property to unset
*/
void unset_property(EFeatureProperty p);
/** Takes a list of feature instances and returns a new instance being
* a concatenation of a copy of this instace's data and the given
* instancess data. Note that the feature types have to be equal.
*
* NOT IMPLEMENTED!
*
* @param others list of feature objects to append
* @return new feature object which contains copy of data of this
* instance and given ones
*/
virtual CFeatures* create_merged_copy(CList* others)
{
SG_ERROR("%s::create_merged_copy() is not yet implemented!\n")
return NULL;
}
/** Convenience method for method with same name and list as parameter.
*
* NOT IMPLEMENTED!
*
* @param other feature object to append
* @return new feature object which contains copy of data of this
* instance and of given one
*/
virtual CFeatures* create_merged_copy(CFeatures* other)
{
SG_ERROR("%s::create_merged_copy() is not yet implemented!\n")
return NULL;
}
/** Adds a subset of indices on top of the current subsets (possibly
* subset of subset). Every call causes a new active index vector
* to be stored. Added subsets can be removed one-by-one. If this is not
* needed, add_subset_in_place() should be used (does not store
* intermediate index vectors)
*
* Calls subset_changed_post() afterwards
*
* @param subset subset of indices to add
* */
virtual void add_subset(SGVector<index_t> subset);
/** Sets/changes latest added subset. This allows to add multiple subsets
* with in-place memory requirements. They cannot be removed one-by-one
* afterwards, only the latest active can. If this is needed, use
* add_subset(). If no subset is active, this just adds.
*
* Calls subset_changed_post() afterwards
*
* @param subset subset of indices to replace the latest one with.
* */
virtual void add_subset_in_place(SGVector<index_t> subset);
/** removes that last added subset from subset stack, if existing
* Calls subset_changed_post() afterwards */
virtual void remove_subset();
/** removes all subsets
* Calls subset_changed_post() afterwards */
virtual void remove_all_subsets();
/** returns subset stack
*
* @return subset stack
*/
virtual CSubsetStack* get_subset_stack();
/** method may be overwritten to update things that depend on subset */
virtual void subset_changed_post() {}
/** Creates a new CFeatures instance containing copies of the elements
* which are specified by the provided indices.
*
* This method is needed for a KernelMachine to store its model data.
* NOT IMPLEMENTED!
*
* @param indices indices of feature elements to copy
* @return new CFeatures instance with copies of feature data
*/
virtual CFeatures* copy_subset(SGVector<index_t> indices);
/** Creates a new CFeatures instance containing only the dimensions
* of the feature vector which are specified by the provided indices.
*
* This method is needed for feature selection tasks
* NOT IMPLEMENTED!
*
* @param dims indices of feature dimensions to copy
* @return new CFeatures instance with copies of specified features
*/
virtual CFeatures* copy_dimension_subset(SGVector<index_t> dims);
/** does this class support compatible computation bewteen difference classes?
* for example, this->dot(rhs_prt),
* can rhs_prt be an instance of a difference class?
*
* @return whether this class supports compatible computation
*/
virtual bool support_compatible_class() const {return false;}
/** Given a class in right hand side, does this class support compatible computation?
*
* for example, is this->dot(rhs_prt) valid,
* where rhs_prt is the class in right hand side
*
* @param rhs the class in right hand side
* @return whether this class supports compatible computation
*/
virtual bool get_feature_class_compatibility (EFeatureClass rhs) const;
/** Throws an error, as features are immutable */
template <typename T>
void put(const Tag<T>& _tag, const T& value) throw(ShogunException)
{
SG_ERROR(
"Cannot put %s::%s, features are immutable.\n", get_name(),
_tag.name().c_str());
}
#ifndef SWIG // SWIG should skip this part
virtual CFeatures* shallow_subset_copy()
{
SG_SNOTIMPLEMENTED;
return NULL;
}
#endif
private:
void init();
private:
/** feature properties */
uint64_t properties;
/** size of cache in MB */
int32_t cache_size;
/** list of preprocessors */
CDynamicObjectArray* preproc;
protected:
/** subset used for index transformations */
CSubsetStack* m_subset_stack;
};
}
#endif