Skip to content

Commit

Permalink
adapted naming to harry configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
rieck committed Mar 10, 2015
1 parent eb4203e commit a0bf2f1
Show file tree
Hide file tree
Showing 11 changed files with 37 additions and 37 deletions.
2 changes: 1 addition & 1 deletion doc/example.cfg
Expand Up @@ -37,7 +37,7 @@ features = {
ngram_len = 1;

# Granularity of n-grams: bytes or tokens
ngram_gran = "tokens";
granularity = "tokens";

# Delimiters for token n-grams, e.g. " %0a%0d"
ngram_delim = " %0a%0d";
Expand Down
14 changes: 7 additions & 7 deletions doc/sally.pod
Expand Up @@ -191,12 +191,12 @@ following parameters.

=item B<ngram_len = 2;>

=item B<ngram_gran = "bytes";>
=item B<granularity = "bytes";>

The parameter B<ngram_len> specifies the number of consecutive symbols that
are considered as one feature, while the parameter B<ngram_gran> defines the
are considered as one feature, while the parameter B<granularity> defines the
granularity of these symbols. If the granularity is set to I<bytes>,
B<sally> considers bytes as symbols, whereas if B<ngram_gran> is set to
B<sally> considers bytes as symbols, whereas if B<granularity> is set to
I<tokens>, all strings (tokens) separated by a set of delimiters are
considered as symbols. The following feature types can be extracted
using these parameters:
Expand All @@ -209,7 +209,7 @@ The strings are partitioned into substrings (tokens) using a set of
delimiter characters. Such partitioning is typical for natural language
processing, where the delimiters are usually defined as white-space and
punctuation symbols. An embedding using tokens is selected by choosing
I<tokens> as granularity (B<ngram_gran>), defining a set of delimiter
I<tokens> as granularity (B<granularity>), defining a set of delimiter
characters (B<ngram_delim>) and setting the n-gram length to 1
(B<ngram_len>).

Expand All @@ -219,7 +219,7 @@ The strings are characterized by all possible byte sequences of a fixed
length n (byte n-grams). These features are frequently used if no
information about the structure of strings is available, such as in
bioinformatics or computer security. An embedding using byte n-grams is
selected by choosing I<bytes> as granularity (B<ngram_gran>) and defining
selected by choosing I<bytes> as granularity (B<granularity>) and defining
the n-gram length (B<ngram_len>).

=item I<token n-grams>
Expand All @@ -229,7 +229,7 @@ length n (token n-grams). These features require the definition of a set of
delimiters and a length n. They are often used in natural language
processing as a coarse way of capturing the structure of text. An embedding
using token n-grams is selected by choosing I<tokens> as granularity
(B<ngram_gran>), defining a set of delimiter characters (B<ngram_delim>) and
(B<granularity>), defining a set of delimiter characters (B<ngram_delim>) and
choosing an n-gram length (B<ngram_len>).

=back
Expand Down Expand Up @@ -555,7 +555,7 @@ can be changed by using the command-line option B<--xx

-n, --ngram_len <num> Set length of n-grams.
-d, --ngram_delim <delim> Set delimiters of tokens in n-grams.
-g --ngram_gran <type> Set granularity: bytes, tokens.
-g, --granularity <type> Set granularity: bytes, tokens.
-p, --ngram_pos Enable positional n-grams.
--pos_shift <num> Set shift of positional n-grams.
-B, --ngram_blend Enable blended n-grams.
Expand Down
2 changes: 1 addition & 1 deletion src/fvec/fvec.c
Expand Up @@ -127,7 +127,7 @@ fvec_t *fvec_extract_intern2(char *x, int l, int n)
}

/* Get configuration */
config_lookup_string(&cfg, "features.ngram_gran", &granu);
config_lookup_string(&cfg, "features.granularity", &granu);
config_lookup_bool(&cfg, "features.ngram_pos", &pos);
config_lookup_int(&cfg, "features.pos_shift", &shift);

Expand Down
10 changes: 5 additions & 5 deletions src/sally.c
Expand Up @@ -46,7 +46,7 @@ static struct option longopts[] = {
{"reverse_str", 0, NULL, 1007},
{"stoptoken_file", 1, NULL, 1008},
{"ngram_len", 1, NULL, 'n'},
{"ngram_gran", 1, NULL, 'g'},
{"granularity", 1, NULL, 'g'},
{"ngram_delim", 1, NULL, 'd'},
{"ngram_pos", 0, NULL, 'p'},
{"pos_shift", 1, NULL, 1012}, /* <- last entry */
Expand Down Expand Up @@ -114,7 +114,7 @@ static void print_usage(void)
" -k, --skip_null Skip null vectors in output.\n"
"\nFeature options:\n"
" -n, --ngram_len <num> Set length of n-grams.\n"
" -g --ngram_gran <type> Set granularity: bytes, tokens.\n"
" -g --granularity <type> Set granularity: bytes, tokens.\n"
" -d, --ngram_delim <delim> Set delimiters of tokens in n-grams.\n"
" -p, --ngram_pos Enable positional n-grams.\n"
" --pos_shift <num> Set shift of positional n-grams.\n"
Expand Down Expand Up @@ -215,7 +215,7 @@ static void sally_parse_options(int argc, char **argv)
config_set_int(&cfg, "features.ngram_len", atoi(optarg));
break;
case 'g':
config_set_string(&cfg, "features.ngram_gran", optarg);
config_set_string(&cfg, "features.granularity", optarg);
break;
case 'd':
config_set_string(&cfg, "features.ngram_delim", optarg);
Expand Down Expand Up @@ -343,9 +343,9 @@ static void sally_load_config(int argc, char **argv)
config_error_text(&cfg), config_error_line(&cfg));

/* Check for new granularity parameter */
ret = config_lookup_string(&cfg, "features.ngram_gran", &str);
ret = config_lookup_string(&cfg, "features.granularity", &str);
if (ret == CONFIG_FALSE)
fatal("Your configuration is missing the new 'ngram_gran' "
fatal("Your configuration is missing the new 'granularity' "
"parameter. Please consult the manual page and upgrade "
"your configuration.");
}
Expand Down
4 changes: 2 additions & 2 deletions src/sconfig.c
Expand Up @@ -36,7 +36,7 @@ static config_default_t defaults[] = {
{"input", "reverse_str", CONFIG_TYPE_BOOL, {.num = CONFIG_FALSE}},
{"input", "stoptoken_file", CONFIG_TYPE_STRING, {.str = ""}},
{"features", "ngram_len", CONFIG_TYPE_INT, {.num = 1}},
{"features", "ngram_gran", CONFIG_TYPE_STRING, {.str = "tokens"}},
{"features", "granularity", CONFIG_TYPE_STRING, {.str = "tokens"}},
{"features", "ngram_delim", CONFIG_TYPE_STRING, {.str = "%20%0a%0d"}},
{"features", "ngram_pos", CONFIG_TYPE_BOOL, {.num = CONFIG_FALSE}},
{"features", "pos_shift", CONFIG_TYPE_INT, {.num = 0}},
Expand Down Expand Up @@ -236,7 +236,7 @@ int config_check(config_t *cfg)
return 0;
}

config_lookup_string(cfg, "features.ngram_gran", &s1);
config_lookup_string(cfg, "features.granularity", &s1);
config_lookup_string(cfg, "features.ngram_delim", &s2);
if (!strcasecmp(s1, "tokens") && strlen(s2) == 0) {
error("Delimiters are required if the granularity is tokens.");
Expand Down
2 changes: 1 addition & 1 deletion tests/config1.cfg
Expand Up @@ -37,7 +37,7 @@ features = {
ngram_len = 2;

# Granularity of n-grams: bytes or tokens
ngram_gran = "tokens";
granularity = "tokens";

# Delimiters for n-grams, e.g. " %0a%0d"
ngram_delim = "%0a%0d ";
Expand Down
2 changes: 1 addition & 1 deletion tests/config2.cfg
Expand Up @@ -37,7 +37,7 @@ features = {
ngram_len = 2;

# Granularity of n-grams: bytes or tokens
ngram_gran = "bytes";
granularity = "bytes";

# Delimiters for n-grams, e.g. " %0a%0d"
ngram_delim = "";
Expand Down
2 changes: 1 addition & 1 deletion tests/config3.cfg
Expand Up @@ -37,7 +37,7 @@ features = {
ngram_len = 1;

# Granularity of n-grams: bytes or tokens
ngram_gran = "tokens";
granularity = "tokens";

# Delimiters for n-grams, e.g. " %0a%0d"
ngram_delim = " %0a%0d";
Expand Down
2 changes: 1 addition & 1 deletion tests/test_embed.c
Expand Up @@ -192,7 +192,7 @@ int main(int argc, char **argv)
config_init(&cfg);
config_check(&cfg);

config_set_string(&cfg, "features.ngram_gran", "tokens");
config_set_string(&cfg, "features.granularity", "tokens");
config_set_string(&cfg, "features.ngram_delim", " .,%0a%0d");
config_set_int(&cfg, "features.ngram_len", 1);
config_set_string(&cfg, "input.input_format", "lines");
Expand Down
4 changes: 2 additions & 2 deletions tests/test_fvec.c
Expand Up @@ -60,9 +60,9 @@ void init_sally(test_t t)

/* Set granularity depending on delimiters */
if (strlen(t.dlm) > 0)
config_set_string(&cfg, "features.ngram_gran", "tokens");
config_set_string(&cfg, "features.granularity", "tokens");
else
config_set_string(&cfg, "features.ngram_gran", "bytes");
config_set_string(&cfg, "features.granularity", "bytes");
}


Expand Down
30 changes: 15 additions & 15 deletions tests/test_ngrams.c
Expand Up @@ -7,7 +7,7 @@
* under the terms of the GNU General Public License as published by the
* Free Software Foundation; either version 3 of the License, or (at your
* option) any later version. This program is distributed without any
* warranty. See the GNU General Public License for more details.
* warranty. See the GNU General Public License for more details.
*/

#include "tests.h"
Expand All @@ -27,7 +27,7 @@ typedef struct
int nlen;
int flag;
/* Number of n-grams */
int len;
int len;
} test_t;

int test_sorted_ngrams()
Expand All @@ -47,13 +47,13 @@ int test_sorted_ngrams()
test_printf("Testing sorted n-grams");

/* Hack to set delimiters */
config_set_string(&cfg, "features.ngram_gran", "tokens");
config_set_string(&cfg, "features.granularity", "tokens");
config_set_string(&cfg, "features.ngram_delim", " ");
fvec_delim_set(" ");
fvec_delim_set(" ");

for (i = 0; t[i].str; i++) {
config_set_int(&cfg, "features.ngram_len", t[i].nlen);
config_set_bool(&cfg, "features.ngram_sort", t[i].flag);
config_set_bool(&cfg, "features.ngram_sort", t[i].flag);

/* Extract features */
f = fvec_extract(t[i].str, strlen(t[i].str));
Expand All @@ -67,7 +67,7 @@ int test_sorted_ngrams()
fvec_destroy(f);
}

config_set_bool(&cfg, "features.ngram_sort", 0);
config_set_bool(&cfg, "features.ngram_sort", 0);

test_return(err, i);
return err;
Expand All @@ -90,13 +90,13 @@ int test_blended_ngrams()
test_printf("Testing blended n-grams");

/* Hack to set delimiters */
config_set_string(&cfg, "features.ngram_gran", "tokens");
config_set_string(&cfg, "features.granularity", "tokens");
config_set_string(&cfg, "features.ngram_delim", " ");
fvec_delim_set(" ");
fvec_delim_set(" ");

for (i = 0; t[i].str; i++) {
config_set_int(&cfg, "features.ngram_len", t[i].nlen);
config_set_bool(&cfg, "features.ngram_blend", t[i].flag);
config_set_bool(&cfg, "features.ngram_blend", t[i].flag);

/* Extract features */
f = fvec_extract(t[i].str, strlen(t[i].str));
Expand All @@ -110,7 +110,7 @@ int test_blended_ngrams()
fvec_destroy(f);
}

config_set_bool(&cfg, "features.ngram_blend", 0);
config_set_bool(&cfg, "features.ngram_blend", 0);

test_return(err, i);
return err;
Expand All @@ -134,15 +134,15 @@ int test_pos_ngrams()
test_printf("Testing positional n-grams");

/* Hack to set delimiters */
config_set_string(&cfg, "features.ngram_gran", "tokens");
config_set_string(&cfg, "features.granularity", "tokens");
config_set_string(&cfg, "features.ngram_delim", " ");
fvec_delim_set(" ");
fvec_delim_set(" ");

for (i = 0; t[i].str; i++) {

config_set_int(&cfg, "features.ngram_len", t[i].nlen);
config_set_bool(&cfg, "features.ngram_pos", t[i].flag);
config_set_int(&cfg, "features.pos_shift", 0);
config_set_int(&cfg, "features.pos_shift", 0);

/* Extract features */
f = fvec_extract(t[i].str, strlen(t[i].str));
Expand Down Expand Up @@ -173,7 +173,7 @@ int main(int argc, char **argv)
/* Create config */
config_init(&cfg);
config_check(&cfg);

fhash_init();

err |= test_sorted_ngrams();
Expand Down

0 comments on commit a0bf2f1

Please sign in to comment.