Skip to content

Commit

Permalink
Improve tokenization method
Browse files Browse the repository at this point in the history
  • Loading branch information
sergioburdisso committed May 4, 2020
1 parent 82b9ba8 commit 26fff88
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 14 deletions.
27 changes: 15 additions & 12 deletions pyss3/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
PARA_DELTR = "\n"
SENT_DELTR = r"\."
WORD_DELTR = r"\s"
WORD_REGEX = r"\w+"

STR_UNKNOWN, STR_MOST_PROBABLE = "unknown", "most-probable"
STR_UNKNOWN_CATEGORY = "[unknown]"
Expand Down Expand Up @@ -111,6 +112,7 @@ class SS3:
__parag_delimiter__ = PARA_DELTR
__sent_delimiter__ = SENT_DELTR
__word_delimiter__ = WORD_DELTR
__word_regex__ = WORD_REGEX

def __init__(
self, s=None, l=None, p=None, a=None,
Expand Down Expand Up @@ -367,25 +369,28 @@ def __classify_sentence__(self, sent, prep, json=False):
cats = xrange(len(self.__categories__))
word_index = self.get_word_index
word_delimiter = self.__word_delimiter__
word_regex = self.__word_regex__

if not json:
regex = "%s|[^%s]+" % (word_regex, word_delimiter)
if prep:
sent = Pp.clean_and_ready(sent)
sent_words = [
(w, w)
for w in re.split(word_delimiter, sent)
for w in re.findall(regex, sent)
if w
]
else:
if prep:
sent_words = [
(w, Pp.clean_and_ready(w, dots=False))
for w in re_split_keep(word_delimiter, sent)
for w in re_split_keep(word_regex, sent)
if w
]
else:
sent_words = [
(w, w)
for w in re_split_keep(word_delimiter, sent)
for w in re_split_keep(word_regex, sent)
if w
]

Expand All @@ -409,13 +414,10 @@ def __classify_sentence__(self, sent, prep, json=False):
if iw == len(words) - 1:
word_iend = len(raw_seq)
else:
if not word.startswith("nnbrr"):
try:
word_iend = re.search(word, raw_seq, re.I).end()
except AttributeError:
word_iend = len(word)
else:
word_iend = re.search(r"\d+", raw_seq).end()
try:
word_iend = re.search(word, raw_seq, re.I).end()
except AttributeError:
word_iend = len(word)

flat_sent.append(wordi)
flat_raw_sent.append(raw_seq[:word_iend])
Expand Down Expand Up @@ -1908,13 +1910,14 @@ def learn(self, doc, cat, n_grams=1, prep=True, update=True):
icat = self.__get_category__(cat)
cat = self.__categories__[icat]
word_to_index = self.__word_to_index__
word_regex = self.__word_regex__

if prep:
Print.info("preprocessing document...", offset=1)
stime = time()
doc = Pp.clean_and_ready(doc)
Print.info("finished --time: %.1fs" % (time() - stime), offset=1)
doc = doc.replace("\n", "").split(" ")
doc = re.findall("%s|[^%s]+" % (word_regex, self.__word_delimiter__), doc)

text_len = len(doc)
Print.info(
Expand All @@ -1931,7 +1934,7 @@ def learn(self, doc, cat, n_grams=1, prep=True, update=True):
Print.info("learning...", offset=1)
tips = []
for word in doc:
if word and word != '.':
if re.match(word_regex, word):
self.__prun_counter__ += 1
# if word doesn't exist yet, then...
try:
Expand Down
16 changes: 14 additions & 2 deletions pyss3/resources/visual_classifier/js/app.js
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ app.controller("mainCtrl", function($scope) {
else
crow_pars = par.cv.clone();
$chart.pars.push(
[par.sents[0].words[0].lexeme + "..."].concat(crow_pars)
[__get_sent_beginning__(par.sents[0]) + "..."].concat(crow_pars)
);
$chart.parsi.push(par);
}
Expand All @@ -324,8 +324,9 @@ app.controller("mainCtrl", function($scope) {
crow_sents.add(sent.cv);
else
crow_sents = sent.cv.clone();

$chart.sents.push(
[sent.words[0].lexeme + "..."].concat(crow_sents)
[__get_sent_beginning__(sent) + "..."].concat(crow_sents)
);
$chart.sentsi.push(sent);
}
Expand Down Expand Up @@ -516,3 +517,14 @@ function __update_textarea__(){
$('#document').height("10px");
$('#document').focus();
}

/**
 * Build the short label used for a sentence in the charts: the lexeme of the
 * first word whose `token` is non-empty, followed (when there is one) by the
 * lexeme of the word right after it.
 *
 * Fallbacks:
 *   - if no word has a non-empty `token`, the first word's lexeme is returned
 *     (previously the function fell off the end of the loop and returned
 *     `undefined`, which callers rendered as "undefined...");
 *   - if the sentence has no words at all, an empty string is returned
 *     instead of throwing on `words[0]`.
 *
 * NOTE(review): presumably `token` is empty for words the classifier does not
 * treat as terms (delimiters, punctuation) -- confirm against the server-side
 * JSON produced for the visual classifier.
 *
 * @param {{words: Array<{token: string, lexeme: string}>}} sent
 *        sentence object holding its list of words.
 * @returns {string} the text the sentence's label should begin with.
 */
function __get_sent_beginning__(sent){
  var words = (sent && sent.words) || [];

  // Nothing to show for an empty sentence.
  if (words.length === 0)
    return "";

  for (var i = 0; i < words.length; i++){
    // Loose comparison kept on purpose: same matching rule as before.
    if (words[i].token != ""){
      var sent_beginning = words[i].lexeme;
      if (i + 1 < words.length)
        sent_beginning += " " + words[i + 1].lexeme;
      return sent_beginning;
    }
  }

  // No word carried a token: fall back to the very first lexeme.
  return words[0].lexeme;
}

0 comments on commit 26fff88

Please sign in to comment.