From 6d395aa1af961fb29af75fbf655885d971078bc4 Mon Sep 17 00:00:00 2001
From: van51
Date: Mon, 5 Aug 2013 13:52:11 +0300
Subject: [PATCH] Added HashedDocDotFeatures in modular interfaces

---
 .../features_hasheddocdot_modular.py         | 42 +++++++++++++++++++
 src/interfaces/modular/Features.i            |  2 +
 src/interfaces/modular/Features_includes.i   |  1 +
 src/interfaces/modular/Library.i             |  6 +++
 src/interfaces/modular/Library_includes.i    |  3 ++
 src/shogun/features/HashedDocDotFeatures.cpp |  5 +++
 6 files changed, 59 insertions(+)
 create mode 100644 examples/undocumented/python_modular/features_hasheddocdot_modular.py

diff --git a/examples/undocumented/python_modular/features_hasheddocdot_modular.py b/examples/undocumented/python_modular/features_hasheddocdot_modular.py
new file mode 100644
index 00000000000..def4aa5a22c
--- /dev/null
+++ b/examples/undocumented/python_modular/features_hasheddocdot_modular.py
@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+strings=['hey','guys','i','am','a','string']
+
+parameter_list=[[strings]]
+
+def features_hasheddocdot_modular(strings):
+    """Build HashedDocDotFeatures on top of a list of strings.
+
+    strings -- list of documents; wrapped as RAWBYTE StringCharFeatures
+
+    Returns the constructed HashedDocDotFeatures object.
+    """
+    from shogun.Features import StringCharFeatures, RAWBYTE
+    from shogun.Features import HashedDocDotFeatures
+    from shogun.Library import NGramTokenizer
+
+    #create string features
+    f=StringCharFeatures(strings, RAWBYTE)
+
+    #set the number of bits of the target dimension
+    #means a dim of size 2^5=32
+    num_bits=5
+
+    #create the ngram tokenizer of size 8 to parse the strings
+    tokenizer=NGramTokenizer(8)
+
+    #normalize results
+    normalize=True
+
+    #create HashedDocDot features
+    hddf=HashedDocDotFeatures(num_bits, f, tokenizer, normalize)
+
+    #should expect 32
+    #print('Feature space dimensionality is', hddf.get_dim_feature_space())
+
+    #print('Self dot product of string 0', hddf.dot(0, hddf, 0))
+
+    return hddf
+
+if __name__=='__main__':
+    print('HashedDocDotFeatures')
+    features_hasheddocdot_modular(*parameter_list[0])
diff --git a/src/interfaces/modular/Features.i
b/src/interfaces/modular/Features.i index 3ae144b7ccc..0ff0c3418be 100644 --- a/src/interfaces/modular/Features.i +++ b/src/interfaces/modular/Features.i @@ -50,6 +50,7 @@ %rename(AttributeFeatures) CAttributeFeatures; %rename(CombinedFeatures) CCombinedFeatures; %rename(CombinedDotFeatures) CCombinedDotFeatures; +%rename(HashedDocDotFeatures) CHashedDocDotFeatures; %rename(Labels) CLabels; %rename(LabelsFactory) CLabelsFactory; @@ -477,6 +478,7 @@ namespace shogun %include %include %include +%include %include %include diff --git a/src/interfaces/modular/Features_includes.i b/src/interfaces/modular/Features_includes.i index 3f15c489d67..82421076574 100644 --- a/src/interfaces/modular/Features_includes.i +++ b/src/interfaces/modular/Features_includes.i @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/src/interfaces/modular/Library.i b/src/interfaces/modular/Library.i index 90b6b7eec8c..9e860d941c1 100644 --- a/src/interfaces/modular/Library.i +++ b/src/interfaces/modular/Library.i @@ -21,6 +21,9 @@ %rename(Hash) CHash; %rename(StructuredData) CStructuredData; %rename(DynamicObjectArray) CDynamicObjectArray; +%rename(Tokenizer) CTokenizer; +%rename(DelimiterTokenizer) CDelimiterTokenizer; +%rename(NGramTokenizer) CNGramTokenizer; %rename(IndexBlock) CIndexBlock; %rename(IndexBlockRelation) CIndexBlockRelation; @@ -458,6 +461,9 @@ namespace shogun /* Hash */ %include +%include +%include +%include %include %include %include diff --git a/src/interfaces/modular/Library_includes.i b/src/interfaces/modular/Library_includes.i index 77ae0c463be..7304e586384 100644 --- a/src/interfaces/modular/Library_includes.i +++ b/src/interfaces/modular/Library_includes.i @@ -26,4 +26,7 @@ #include #include #include +#include +#include +#include %} diff --git a/src/shogun/features/HashedDocDotFeatures.cpp b/src/shogun/features/HashedDocDotFeatures.cpp index 607ed24555a..6b42130b117 100644 --- a/src/shogun/features/HashedDocDotFeatures.cpp +++ 
b/src/shogun/features/HashedDocDotFeatures.cpp @@ -27,6 +27,11 @@ CHashedDocDotFeatures::CHashedDocDotFeatures(const CHashedDocDotFeatures& orig) init(orig.num_bits, orig.doc_collection, orig.tokenizer, orig.should_normalize); } +CHashedDocDotFeatures::CHashedDocDotFeatures(CFile* loader) +{ + SG_NOTIMPLEMENTED; /* Stub constructor: `loader` is never read and the body consists solely of this macro. NOTE(review): SG_NOTIMPLEMENTED is defined outside this hunk - presumably it reports a "not implemented" error; confirm before relying on it. */ +} + void CHashedDocDotFeatures::init(int32_t hash_bits, CStringFeatures* docs, CTokenizer* tzer, bool normalize) {