Change the default regex used for tokenizing words

Words containing an apostrophe between word characters (e.g. I've, don't, haven't) are now treated as single words.
sergioburdisso · May 5, 2020 · 4af8e80 · 4af8e80
1 parent 7966279
commit 4af8e80
Show file tree

Hide file tree

Showing 2 changed files with 3 additions and 4 deletions.
diff --git a/examples/extract_insight.ipynb b/examples/extract_insight.ipynb
@@ -287,7 +287,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "we can see that, unlike the previous ones, these fragments focus less on health-related aspects and much more on science/scientific ones, SS3 even gave us the Method and Objective well-known sections of research papers. For instance, if we read the first fragment without any context, \"Method: This study used a parallel randomized control group design to compare pre-test and post\", we as humans, can clearly see it is related to science."
+    "we can see that, unlike the previous ones, these fragments focus less on health-related aspects and much more on science/scientific ones, SS3 even gave us the Method and Objective well-known sections of research papers. When we, as humans, read any of these 3 fragments with no context, we can clearly see they are related to science."
    ]
   },
   {

diff --git a/pyss3/__init__.py b/pyss3/__init__.py
@@ -28,7 +28,7 @@
 PARA_DELTR = "\n"
 SENT_DELTR = r"\."
 WORD_DELTR = r"\s"
-WORD_REGEX = r"\w+"
+WORD_REGEX = r"\w+(?:'\w+)?"
 
 STR_UNKNOWN, STR_MOST_PROBABLE = "unknown", "most-probable"
 STR_UNKNOWN_CATEGORY = "[unknown]"
@@ -373,13 +373,12 @@ def __classify_sentence__(self, sent, prep, json=False, prep_func=None):
         word_regex = self.__word_regex__
 
         if not json:
-            regex = "%s|[^%s]+" % (word_regex, word_delimiter)
             if prep:
                 prep_func = prep_func or Pp.clean_and_ready
                 sent = prep_func(sent)
             sent_words = [
                 (w, w)
-                for w in re.findall(regex, sent)
+                for w in re_split_keep(word_regex, sent)
                 if w
             ]
         else: