Skip to content

Commit 33957ed

Browse files
committed
implemented pruning of subsumed ngrams
1 parent 317537b commit 33957ed

File tree

6 files changed

+65
-5
lines changed

6 files changed

+65
-5
lines changed

colibricore_classes.in.pxd

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,7 @@ cdef extern from "patternmodel.h":
331331
bool DOREMOVEFLEXGRAMS
332332
bool DORESET
333333
int PRUNENONSUBSUMED
334+
int PRUNESUBSUMED
334335

335336
cdef cppclass IndexedDataHandler:
336337
unsigned int count(IndexedData &)

colibricore_wrapper.in.pyx

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -899,7 +899,8 @@ cdef class PatternModelOptions:
899899
* DOREMOVESKIPGRAMS - Remove skipgrams from the model
900900
* DOREMOVEFLEXGRAMS - Remove flexgrams from the model
901901
* DORESET - Reset all counts before training
902-
* PRUNENONSUBSUMED - Prune all n-grams up to this length that are not subsumed by higher-order ngrams
902+
* PRUNENONSUBSUMED - Prune all n-grams up to this length that are *NOT* subsumed by higher-order ngrams
903+
* PRUNESUBSUMED - Prune all n-grams below this order that are subsumed by higher-order n-grams (default: 0, disabled)
903904
* DEBUG
904905
* QUIET (default: False)
905906
@@ -950,6 +951,8 @@ cdef class PatternModelOptions:
950951
self.coptions.QUIET = value
951952
elif key == 'PRUNENONSUBSUMED':
952953
self.coptions.PRUNENONSUBSUMED = value
954+
elif key == 'PRUNESUBSUMED':
955+
self.coptions.PRUNESUBSUMED = value
953956
else:
954957
raise KeyError
955958

@@ -990,6 +993,8 @@ cdef class PatternModelOptions:
990993
return self.coptions.QUIET
991994
elif key == 'PRUNENONSUBSUMED':
992995
return self.coptions.PRUNENONSUBSUMED
996+
elif key == 'PRUNESUBSUMED':
997+
return self.coptions.PRUNESUBSUMED
993998
else:
994999
raise KeyError
9951000

configure.ac

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
# $URL: https://ilk.uvt.nl/svn/sources/ucto/trunk/configure.ac $
55

66
AC_PREREQ([2.67])
7-
AC_INIT([colibri-core],[2.5.3],[proycon@anaproy.nl])
7+
AC_INIT([colibri-core],[2.5.4],[proycon@anaproy.nl])
88
AC_CONFIG_SRCDIR([configure.ac])
99
AC_CONFIG_MACRO_DIR([m4])
1010
AC_CONFIG_HEADER([config.h])

include/patternmodel.h

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,8 @@ class PatternModelOptions {
141141
bool DOREVERSEINDEX; ///< Obsolete now, only here for backward-compatibility with v1
142142
bool DOPATTERNPERLINE; ///< Assume each line contains one integral pattern, rather than actively extracting all subpatterns on a line (default: false)
143143

144-
int PRUNENONSUBSUMED; //< Prune all n-grams that are not subsumed by higher-order ngrams
144+
int PRUNENONSUBSUMED; ///< Prune all lower-order n-grams below this order that are **NOT** subsumed by higher-order n-grams (0 = disabled)
145+
int PRUNESUBSUMED; ///< Prune all lower-order n-grams below this order that are subsumed by higher-order n-grams (0 = disabled)
145146

146147
bool DOREMOVEINDEX; ///< Do not load index information (for indexed models), loads just the patterns without any counts
147148
bool DOREMOVENGRAMS; ///< Remove n-grams from the model upon loading it
@@ -178,7 +179,8 @@ class PatternModelOptions {
178179
DOREMOVESKIPGRAMS = false;
179180
DOREMOVEFLEXGRAMS = false;
180181

181-
PRUNENONSUBSUMED = false;
182+
PRUNENONSUBSUMED = 0;
183+
PRUNESUBSUMED = 0;
182184

183185
DEBUG = false;
184186
QUIET = false;
@@ -210,6 +212,7 @@ class PatternModelOptions {
210212
DOREMOVEFLEXGRAMS = ref.DOREMOVEFLEXGRAMS;
211213

212214
PRUNENONSUBSUMED = ref.PRUNENONSUBSUMED;
215+
PRUNESUBSUMED = ref.PRUNESUBSUMED;
213216

214217
DEBUG = ref.DEBUG;
215218
QUIET = ref.QUIET;
@@ -1209,6 +1212,28 @@ class PatternModel: public MapType, public PatternModelInterface {
12091212
if (!options.QUIET) std::cerr << " pruned " << prunednonsubsumed << " non-subsumed " << (n-1) << "-grams" << std::endl;
12101213
}
12111214
}
1215+
if (options.PRUNESUBSUMED) {
1216+
if (!options.QUIET) std::cerr << "Pruning subsumed n-grams" << std::endl;
1217+
int end_n = options.PRUNESUBSUMED;
1218+
if ((end_n > options.MAXLENGTH)) end_n = options.MAXLENGTH;
1219+
for (int n = 2; n <= end_n; n++) {
1220+
std::unordered_set<Pattern> subsumed;
1221+
unsigned int prunedsubsumed = 0;
1222+
PatternModel::iterator iter = this->begin();
1223+
while (iter != this->end()) {
1224+
const unsigned int pattern_n = iter->first.n();
1225+
if (pattern_n == (unsigned int) n) {
1226+
subngrams.clear();
1227+
iter->first.ngrams(subngrams, n-1);
1228+
for (std::vector<PatternPointer>::iterator iter2 = subngrams.begin(); iter2 != subngrams.end(); iter2++) subsumed.insert(Pattern(*iter2));
1229+
}
1230+
iter++;
1231+
};
1232+
prunedsubsumed += this->pruneinset(subsumed, n-1);
1233+
if (!options.QUIET) std::cerr << " pruned " << prunedsubsumed << " subsumed " << (n-1) << "-grams" << std::endl;
1234+
}
1235+
1236+
}
12121237
if ((options.MINLENGTH > 1) && (options.DOSKIPGRAMS || options.DOSKIPGRAMS_EXHAUSTIVE)) {
12131238
unsigned int pruned = this->prunebylength(options.MINLENGTH-1);
12141239
if (!options.QUIET) std::cerr << " pruned " << pruned << " patterns below minimum length (" << options.MINLENGTH << ")" << std::endl;
@@ -2081,6 +2106,34 @@ class PatternModel: public MapType, public PatternModelInterface {
20812106
return pruned;
20822107
}
20832108

2109+
/**
2110+
* Prune all patterns that are in the specified set.
2111+
* @param s The set containing the patterns not to prune
2112+
* @param _n The size constraint, limit to patterns of this size only (set to 0 for no constraint, default)
2113+
* @return the number of distinct patterns pruned
2114+
*/
2115+
unsigned int pruneinset(const std::unordered_set<Pattern> & s, int _n) {
2116+
unsigned int pruned = 0;
2117+
if (s.empty()) {
2118+
return pruned;
2119+
}
2120+
PatternModel::iterator iter = this->begin();
2121+
while (iter != this->end()) {
2122+
const PatternType pattern = iter->first;
2123+
if ( (_n == 0) || (pattern.n() == (unsigned int) _n) ) {
2124+
if (s.find(pattern) != s.end()) {
2125+
//found in set
2126+
iter = this->erase(iter);
2127+
pruned++;
2128+
continue;
2129+
}
2130+
}
2131+
iter++;
2132+
};
2133+
2134+
return pruned;
2135+
}
2136+
20842137
/**
20852138
* Prune all patterns that are not in the second model
20862139
* @return the number of distinct patterns pruned

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def read(fname):
183183
license = "GPL",
184184
keywords = "nlp computational_linguistics frequency ngram skipgram pmi cooccurrence linguistics",
185185
long_description=read('README.rst'),
186-
version = '2.5.3',
186+
version = '2.5.4',
187187
ext_modules = extensions,
188188
cmdclass = {'build_ext': build_ext},
189189
classifiers=[

src/patternmodeller.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ void usage() {
4747
cerr << "\t-W|--wordthreshold <number> Word occurrence threshold (secondary threshold): only count patterns in which the words/unigrams occur at least this many times, only effective when the primary " << endl;
4848
cerr << "\t occurrence threshold (-t) is lower than this threshold (default: disabled)" << endl;
4949
cerr << "\t-p|--prune <number> Prune all lower-order n-grams below the specified order that are *NOT* subsumed by higher order n-grams (default: 0, disabled). Only effective when used with -l, usually set to equal values" << endl;
50+
cerr << "\t --prunesubsumed <number> Prune all lower-order n-grams below the specified order that are subsumed by higher order n-grams (default: 0, disabled). Only effective when used with -l, usually set to equal values" << endl;
5051
cerr << "\t-s|--skipgrams Compute skipgrams (costs extra memory and time)" << endl;
5152
cerr << "\t-y|--skipthreshold <number> Occurrence threshold for skipgrams (overrides -t for skipgrams, defaults to -t). Skipgrams occurring less than this will be pruned. Value must be equal to or higher than -t." << endl;
5253
cerr << "\t-T|--skiptypes <number> Skip type threshold (for use with -s): only skipgrams with at least x possible types for the skip will be considered, otherwise the skipgram " << endl;

0 commit comments

Comments
 (0)