Merge pull request #828 from snipsco/release/0.20.0
Release 0.20.0
adrienball committed Jul 16, 2019
2 parents d23ff8a + e08b5f0 commit 76a4f87
Showing 35 changed files with 1,833 additions and 90 deletions.
2 changes: 1 addition & 1 deletion .appveyor.yml
@@ -20,7 +20,7 @@ cache:

install:
- "%PYTHON%\\python.exe -m pip install wheel"
- "%PYTHON%\\python.exe -m pip install -e .[test] --verbose"
- "%PYTHON%\\python.exe -m pip install -e .[test]"

build: false

17 changes: 17 additions & 0 deletions CHANGELOG.md
@@ -1,6 +1,22 @@
# Changelog
All notable changes to this project will be documented in this file.

## [0.20.0]
### Added
- Add new intent parser: `LookupIntentParser` [#759](https://github.com/snipsco/snips-nlu/pull/759)

### Changed
- Replace `DeterministicIntentParser` by `LookupIntentParser` in default configs [#829](https://github.com/snipsco/snips-nlu/pull/829)
- Bump `snips-nlu-parsers` to `0.3.x`, introducing new builtin entities:
- `snips/time`
- `snips/timePeriod`
- `snips/date`
- `snips/datePeriod`
- `snips/city`
- `snips/country`
- `snips/region`
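
To illustrate the `LookupIntentParser` added above, here is a minimal usage sketch (not part of the changelog or this diff). The `ignore_stop_words` keyword of `LookupIntentParserConfig` and the `sample_dataset` variable are assumptions made for the example, mirroring the existing `DeterministicIntentParserConfig` API and the default configs changed later in this PR.

```python
# Hedged sketch: wiring the new LookupIntentParser into an engine config.
# `sample_dataset` and the `ignore_stop_words` keyword are assumptions here.
from snips_nlu import SnipsNLUEngine
from snips_nlu.pipeline.configs import (
    LookupIntentParserConfig, NLUEngineConfig, ProbabilisticIntentParserConfig)

config = NLUEngineConfig([
    LookupIntentParserConfig(ignore_stop_words=True),  # fast exact-match parser
    ProbabilisticIntentParserConfig(),                 # ML fallback
])
engine = SnipsNLUEngine(config=config)
engine.fit(sample_dataset)  # sample_dataset: a dataset dict in Snips NLU format
print(engine.parse("What will the weather be in three days?"))
```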


## [0.19.8]
### Added
- Add filter for entity match feature [#814](https://github.com/snipsco/snips-nlu/pull/814)
@@ -296,6 +312,7 @@ several commands.
- Fix compiling issue with `bindgen` dependency when installing from source
- Fix issue in `CRFSlotFiller` when handling builtin entities

[0.20.0]: https://github.com/snipsco/snips-nlu/compare/0.19.8...0.20.0
[0.19.8]: https://github.com/snipsco/snips-nlu/compare/0.19.7...0.19.8
[0.19.7]: https://github.com/snipsco/snips-nlu/compare/0.19.6...0.19.7
[0.19.6]: https://github.com/snipsco/snips-nlu/compare/0.19.5...0.19.6
9 changes: 9 additions & 0 deletions README.rst
@@ -256,6 +256,15 @@ Licence

This library is provided by `Snips <https://www.snips.ai>`_ as Open Source software. See `LICENSE <LICENSE>`_ for more information.


Geonames Licence
----------------

The `snips/city`, `snips/country` and `snips/region` builtin entities rely on
software from Geonames, which is made available under a Creative Commons
Attribution 4.0 International license. For the Geonames license and warranties,
please refer to: https://creativecommons.org/licenses/by/4.0/legalcode.


.. _external language resources: https://github.com/snipsco/snips-nlu-language-resources
.. _forum: https://forum.snips.ai/
.. _blog post: https://medium.com/snips-ai/an-introduction-to-snips-nlu-the-open-source-library-behind-snips-embedded-voice-platform-b12b1a60a41a
6 changes: 6 additions & 0 deletions docs/source/api.rst
@@ -27,6 +27,9 @@ Intent Parser
.. autoclass:: DeterministicIntentParser
:members:

.. autoclass:: LookupIntentParser
:members:

.. autoclass:: ProbabilisticIntentParser
:members:

@@ -89,6 +92,9 @@ Configurations
.. autoclass:: DeterministicIntentParserConfig
:members:

.. autoclass:: LookupIntentParserConfig
:members:

.. autoclass:: ProbabilisticIntentParserConfig
:members:

12 changes: 6 additions & 6 deletions docs/source/custom_processing_units.rst
@@ -4,7 +4,7 @@ Custom Processing Units
=======================

The Snips NLU library provides a default NLU pipeline containing built-in
processing units such as the :class:`.DeterministicIntentParser` or the
processing units such as the :class:`.LookupIntentParser` or the
:class:`.ProbabilisticIntentParser`.

However, it is possible to define custom processing units and use them in a
@@ -14,7 +14,7 @@ The main processing unit of the Snips NLU processing pipeline is the
:class:`.SnipsNLUEngine`. This engine relies on a list of :class:`.IntentParser`
that are called successively until one of them manages to extract an intent.
By default, two parsers are used by the engine: a
:class:`.DeterministicIntentParser` and a :class:`.ProbabilisticIntentParser`.
:class:`.LookupIntentParser` and a :class:`.ProbabilisticIntentParser`.

Let's focus on the probabilistic intent parser. This parser parses text using
two steps: first it classifies the intent using an
@@ -82,12 +82,12 @@ naive keyword matching logic:
"slots_keywords": self.slots_keywords,
"config": self.config.to_dict()
}
with path.open(mode="w") as f:
with path.open(mode="w", encoding="utf8") as f:
f.write(json_string(model))
@classmethod
def from_path(cls, path, **shared):
with path.open() as f:
with path.open(encoding="utf8") as f:
model = json.load(f)
slot_filler = cls()
slot_filler.language = model["language"]
@@ -188,12 +188,12 @@ this:
"slots_keywords": self.slots_keywords,
"config": self.config.to_dict()
}
with path.open(mode="w") as f:
with path.open(mode="w", encoding="utf8") as f:
f.write(json_string(model))
@classmethod
def from_path(cls, path, **shared):
with path.open() as f:
with path.open(encoding="utf8") as f:
model = json.load(f)
slot_filler = cls()
slot_filler.language = model["language"]
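
The prose above from `docs/source/custom_processing_units.rst` describes the engine as a list of intent parsers that are tried in order until one extracts an intent. The following is a rough, hypothetical sketch of that control flow, not the actual `SnipsNLUEngine.parse` implementation; the parser objects and result keys are simplified assumptions.

```python
# Hypothetical illustration of the "try parsers in order" behaviour described
# in the docs above; result keys are simplified assumptions.
def parse_with_parsers(parsers, text, intents=None):
    empty_result = {"input": text, "intent": None, "slots": []}
    for parser in parsers:  # e.g. a lookup parser first, then a probabilistic one
        result = parser.parse(text, intents=intents)
        if result.get("intent") is not None:
            return result  # the first parser that finds an intent wins
    return empty_result
```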
5 changes: 2 additions & 3 deletions setup.py
@@ -30,9 +30,8 @@
"scikit-learn>=0.21.1,<0.22; python_version>='3.5'",
"scipy>=1.0,<2.0",
"sklearn-crfsuite>=0.3.6,<0.4",
"snips-nlu-parsers>=0.2,<0.3",
"snips-nlu-utils>=0.8,<0.9",
"deprecation>=2,<3",
"snips-nlu-parsers>=0.3,<0.4",
"snips_nlu_utils>=0.9,<0.10",
]

extras_require = {
6 changes: 3 additions & 3 deletions snips_nlu/__about__.py
@@ -13,13 +13,13 @@
__email__ = "clement.doumouro@snips.ai, adrien.ball@snips.ai"
__license__ = "Apache License, Version 2.0"

__version__ = "0.19.8"
__model_version__ = "0.19.0"
__version__ = "0.20.0"
__model_version__ = "0.20.0"

__download_url__ = "https://github.com/snipsco/snips-nlu-language-resources/releases/download"
__compatibility__ = "https://raw.githubusercontent.com/snipsco/snips-nlu-language-resources/master/compatibility.json"
__shortcuts__ = "https://raw.githubusercontent.com/snipsco/snips-nlu-language-resources/master/shortcuts.json"

__entities_download_url__ = "https://resources.snips.ai/nlu-lm/gazetteer-entities"
__entities_download_url__ = "https://resources.snips.ai/nlu/gazetteer-entities"

# pylint:enable=line-too-long
9 changes: 9 additions & 0 deletions snips_nlu/common/utils.py
@@ -163,6 +163,10 @@ def get_package_path(name):


def deduplicate_overlapping_items(items, overlap_fn, sort_key_fn):
    """Deduplicates the items by looping over the items, sorted using
    sort_key_fn, and checking overlaps with previously seen items using
    overlap_fn
    """
    sorted_items = sorted(items, key=sort_key_fn)
    deduplicated_items = []
    for item in sorted_items:
@@ -173,6 +177,9 @@ def deduplicate_overlapping_items(items, overlap_fn, sort_key_fn):


def replace_entities_with_placeholders(text, entities, placeholder_fn):
    """Processes the text in order to replace entity values with placeholders
    as defined by the placeholder function
    """
    if not entities:
        return dict(), text

@@ -207,6 +214,8 @@ def replace_entities_with_placeholders(text, entities, placeholder_fn):


def deduplicate_overlapping_entities(entities):
    """Deduplicates entities based on overlapping ranges"""

    def overlap(lhs_entity, rhs_entity):
        return ranges_overlap(lhs_entity[RES_MATCH_RANGE],
                              rhs_entity[RES_MATCH_RANGE])
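
As a companion to the docstrings added above, here is a self-contained toy version of the deduplication scheme they describe: sort the items, then keep an item only if it overlaps nothing kept so far. The entity dicts, range layout, and sort key below are invented for illustration and are not the library's exact structures.

```python
# Toy illustration of deduplicate_overlapping_items-style logic; the entity
# format and sort key below are assumptions made for the example only.
def ranges_overlap(lhs, rhs):
    return lhs["start"] < rhs["end"] and rhs["start"] < lhs["end"]

def deduplicate_overlapping(items, overlap_fn, sort_key_fn):
    kept = []
    for item in sorted(items, key=sort_key_fn):
        if not any(overlap_fn(item, seen) for seen in kept):
            kept.append(item)
    return kept

entities = [
    {"value": "new york", "range": {"start": 10, "end": 18}},
    {"value": "york", "range": {"start": 14, "end": 18}},
]
deduped = deduplicate_overlapping(
    entities,
    overlap_fn=lambda a, b: ranges_overlap(a["range"], b["range"]),
    sort_key_fn=lambda e: -(e["range"]["end"] - e["range"]["start"]),  # longest first
)
print(deduped)  # the shorter, overlapping "york" entity is dropped
```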
13 changes: 13 additions & 0 deletions snips_nlu/dataset/utils.py
@@ -52,3 +52,16 @@ def get_dataset_gazetteer_entities(dataset, intent=None):
    if intent is not None:
        return extract_intent_entities(dataset, is_gazetteer_entity)[intent]
    return {e for e in dataset[ENTITIES] if is_gazetteer_entity(e)}


def get_stop_words_whitelist(dataset, stop_words):
    """Extracts stop words whitelists per intent consisting of entity values
    that appear in the stop_words list"""
    entity_values_per_intent = extract_entity_values(
        dataset, apply_normalization=True)
    stop_words_whitelist = dict()
    for intent, entity_values in iteritems(entity_values_per_intent):
        whitelist = stop_words.intersection(entity_values)
        if whitelist:
            stop_words_whitelist[intent] = whitelist
    return stop_words_whitelist
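
A small worked example of what `get_stop_words_whitelist` computes, using the same set-intersection logic; the intents, entity values, and stop words are invented for illustration.

```python
# Illustration only: per-intent stop-word whitelists are the stop words that
# also occur as entity values for that intent, so they are not stripped away.
entity_values_per_intent = {
    "turnLightOn": {"this", "kitchen"},
    "getWeather": {"paris", "tomorrow"},
}
stop_words = {"a", "the", "this", "in"}

stop_words_whitelist = {}
for intent, entity_values in entity_values_per_intent.items():
    whitelist = stop_words.intersection(entity_values)
    if whitelist:
        stop_words_whitelist[intent] = whitelist

print(stop_words_whitelist)  # {'turnLightOn': {'this'}}
```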
4 changes: 1 addition & 3 deletions snips_nlu/default_configs/config_de.py
@@ -4,9 +4,7 @@
"unit_name": "nlu_engine",
"intent_parsers_configs": [
{
"unit_name": "deterministic_intent_parser",
"max_queries": 500,
"max_pattern_length": 1000,
"unit_name": "lookup_intent_parser",
"ignore_stop_words": True
},
{
4 changes: 1 addition & 3 deletions snips_nlu/default_configs/config_en.py
@@ -4,9 +4,7 @@
"unit_name": "nlu_engine",
"intent_parsers_configs": [
{
"unit_name": "deterministic_intent_parser",
"max_queries": 500,
"max_pattern_length": 1000,
"unit_name": "lookup_intent_parser",
"ignore_stop_words": True
},
{
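
The same swap from `deterministic_intent_parser` to `lookup_intent_parser` is applied to each language config in this release. A short sketch of loading one of these bundled configs; the `CONFIG_EN` import follows the documented `snips_nlu.default_configs` module, and `dataset` is a placeholder for a Snips NLU dataset dict.

```python
# Loading the bundled English config, which now lists lookup_intent_parser
# as the first parser; `dataset` stands in for a dataset dict.
from snips_nlu import SnipsNLUEngine
from snips_nlu.default_configs import CONFIG_EN

engine = SnipsNLUEngine(config=CONFIG_EN)
engine.fit(dataset)
```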
4 changes: 1 addition & 3 deletions snips_nlu/default_configs/config_es.py
@@ -4,9 +4,7 @@
"unit_name": "nlu_engine",
"intent_parsers_configs": [
{
"unit_name": "deterministic_intent_parser",
"max_queries": 500,
"max_pattern_length": 1000,
"unit_name": "lookup_intent_parser",
"ignore_stop_words": True
},
{
4 changes: 1 addition & 3 deletions snips_nlu/default_configs/config_fr.py
@@ -4,9 +4,7 @@
"unit_name": "nlu_engine",
"intent_parsers_configs": [
{
"unit_name": "deterministic_intent_parser",
"max_queries": 500,
"max_pattern_length": 1000,
"unit_name": "lookup_intent_parser",
"ignore_stop_words": True
},
{
4 changes: 1 addition & 3 deletions snips_nlu/default_configs/config_it.py
@@ -4,9 +4,7 @@
"unit_name": "nlu_engine",
"intent_parsers_configs": [
{
"unit_name": "deterministic_intent_parser",
"max_queries": 500,
"max_pattern_length": 1000,
"unit_name": "lookup_intent_parser",
"ignore_stop_words": True
},
{
4 changes: 1 addition & 3 deletions snips_nlu/default_configs/config_ja.py
@@ -4,9 +4,7 @@
"unit_name": "nlu_engine",
"intent_parsers_configs": [
{
"unit_name": "deterministic_intent_parser",
"max_queries": 500,
"max_pattern_length": 1000,
"unit_name": "lookup_intent_parser",
"ignore_stop_words": False
},
{
4 changes: 1 addition & 3 deletions snips_nlu/default_configs/config_ko.py
@@ -4,9 +4,7 @@
"unit_name": "nlu_engine",
"intent_parsers_configs": [
{
"unit_name": "deterministic_intent_parser",
"max_queries": 500,
"max_pattern_length": 1000,
"unit_name": "lookup_intent_parser",
"ignore_stop_words": False
},
{
4 changes: 1 addition & 3 deletions snips_nlu/default_configs/config_pt_br.py
@@ -4,9 +4,7 @@
"unit_name": "nlu_engine",
"intent_parsers_configs": [
{
"unit_name": "deterministic_intent_parser",
"max_queries": 500,
"max_pattern_length": 1000,
"unit_name": "lookup_intent_parser",
"ignore_stop_words": True
},
{
4 changes: 1 addition & 3 deletions snips_nlu/default_configs/config_pt_pt.py
@@ -4,9 +4,7 @@
"unit_name": "nlu_engine",
"intent_parsers_configs": [
{
"unit_name": "deterministic_intent_parser",
"max_queries": 500,
"max_pattern_length": 1000,
"unit_name": "lookup_intent_parser",
"ignore_stop_words": True
},
{
2 changes: 1 addition & 1 deletion snips_nlu/intent_classifier/featurizer.py
@@ -758,7 +758,7 @@ def persist(self, path):
}
vectorizer_json = json_string(self_as_dict)
vectorizer_path = path / "vectorizer.json"
with vectorizer_path.open(mode="w") as f:
with vectorizer_path.open(mode="w", encoding="utf8") as f:
f.write(vectorizer_json)
self.persist_metadata(path)

3 changes: 2 additions & 1 deletion snips_nlu/intent_classifier/log_reg_classifier.py
@@ -222,7 +222,8 @@ def persist(self, path):
}

classifier_json = json_string(self_as_dict)
with (path / "intent_classifier.json").open(mode="w") as f:
with (path / "intent_classifier.json").open(mode="w",
encoding="utf8") as f:
f.write(classifier_json)
self.persist_metadata(path)

1 change: 1 addition & 0 deletions snips_nlu/intent_parser/__init__.py
@@ -1,3 +1,4 @@
from .deterministic_intent_parser import DeterministicIntentParser
from .intent_parser import IntentParser
from .lookup_intent_parser import LookupIntentParser
from .probabilistic_intent_parser import ProbabilisticIntentParser
27 changes: 9 additions & 18 deletions snips_nlu/intent_parser/deterministic_intent_parser.py
@@ -21,7 +21,7 @@
RES_MATCH_RANGE, RES_SLOTS, RES_VALUE, SLOT_NAME, START, TEXT, UTTERANCES,
RES_PROBA)
from snips_nlu.dataset import validate_and_format_dataset
from snips_nlu.dataset.utils import extract_entity_values
from snips_nlu.dataset.utils import get_stop_words_whitelist
from snips_nlu.entity_parser.builtin_entity_parser import is_builtin_entity
from snips_nlu.exceptions import IntentNotFoundError, LoadingError
from snips_nlu.intent_parser.intent_parser import IntentParser
@@ -143,7 +143,7 @@ def fit(self, dataset, force_retrain=True):
self.slot_names_to_entities = get_slot_name_mappings(dataset)
self.group_names_to_slot_names = _get_group_names_to_slot_names(
self.slot_names_to_entities)
self._stop_words_whitelist = _get_stop_words_whitelist(
self._stop_words_whitelist = get_stop_words_whitelist(
dataset, self._stop_words)

# Do not use ambiguous patterns that appear in more than one intent
@@ -239,11 +239,12 @@ def placeholder_fn(entity_name):
cleaned_processed_text = self._preprocess_text(processed_text,
intent)
for regex in self.regexes_per_intent[intent]:
res = self._get_matching_result(text, cleaned_processed_text,
regex, intent, mapping)
res = self._get_matching_result(text, cleaned_text, regex,
intent)
if res is None and cleaned_text != cleaned_processed_text:
res = self._get_matching_result(text, cleaned_text, regex,
intent)
res = self._get_matching_result(
text, cleaned_processed_text, regex, intent, mapping)

if res is not None:
results.append(res)
break
@@ -300,6 +301,7 @@ def get_slots(self, text, intent):

if intent not in self.regexes_per_intent:
raise IntentNotFoundError(intent)

slots = self.parse(text, intents=[intent])[RES_SLOTS]
if slots is None:
slots = []
@@ -408,7 +410,7 @@ def persist(self, path):
parser_json = json_string(self.to_dict())
parser_path = path / "intent_parser.json"

with parser_path.open(mode="w") as f:
with parser_path.open(mode="w", encoding="utf8") as f:
f.write(parser_json)
self.persist_metadata(path)

@@ -514,14 +516,3 @@ def sort_key_fn(slot):
def _get_entity_name_placeholder(entity_label, language):
    return "%%%s%%" % "".join(
        tokenize_light(entity_label, language)).upper()


def _get_stop_words_whitelist(dataset, stop_words):
    entity_values_per_intent = extract_entity_values(
        dataset, apply_normalization=True)
    stop_words_whitelist = dict()
    for intent, entity_values in iteritems(entity_values_per_intent):
        whitelist = stop_words.intersection(entity_values)
        if whitelist:
            stop_words_whitelist[intent] = whitelist
    return stop_words_whitelist