feat: get synsets by domains of interest. Update README and tests
dvsrepo authored and frascuchon committed Dec 17, 2018
1 parent e3bf746 commit ebda285
Showing 6 changed files with 99 additions and 39 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -48,5 +48,9 @@ MANIFEST
# Per-project virtualenvs
.venv*/

# vscode
.history/
.vscode/


wordnet_domains/
55 changes: 44 additions & 11 deletions README.md
@@ -1,27 +1,34 @@
# Spacy wordnet annotator
# spaCy WordNet

`spacy-wordnet` creates annotations that easily allow the use of wordnet
and [wordnet domains](http://wndomains.fbk.eu/) by using
the [nltk wordnet interface](http://www.nltk.org/howto/wordnet.html)

spaCy WordNet is a simple custom component for using [WordNet](https://wordnet.princeton.edu/), [MultiWordnet](http://multiwordnet.fbk.eu/english/home.php) and [WordNet domains](http://wndomains.fbk.eu/) with [spaCy](http://spacy.io).

The component combines the [NLTK wordnet interface](http://www.nltk.org/howto/wordnet.html) with WordNet domains to allow users to:

* Get all synsets for a processed token. For example, getting all the synsets (word senses) of the word ``bank``.
* Get and filter synsets by domain. For example, getting synonyms of the verb ``withdraw`` in the financial domain (see the sketch below).
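
For instance, here is a minimal sketch of both calls (assuming the English model and the NLTK data from the sections below are already installed; single-word inputs keep the example short):

````python
import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator

nlp = spacy.load('en')
nlp.add_pipe(WordnetAnnotator(nlp.lang))

# All synsets (word senses) for 'bank'
token = nlp('bank')[0]
print(token._.wordnet.synsets())

# Only the senses of 'withdraw' tagged with the 'finance' domain
token = nlp('withdraw')[0]
print(token._.wordnet.wordnet_synsets_for_domain(['finance']))
````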

## Install

## Getting started
The spaCy WordNet component can be easily integrated into spaCy pipelines. You just need the following:
### Prerequisites

````bash
pip install spacy-wordnet
````
* Python 3.X
* spaCy

## Requirements
Some nltk data must be installed before using this package
You also need to install the following NLTK wordnet data:

````bash
python -m nltk.downloader wordnet
python -m nltk.downloader omw
````
### Install

````bash
pip install spacy-wordnet
````



## Usage

````python
@@ -42,6 +49,32 @@ token._.wordnet.lemmas()

# And automatically tags with wordnet domains
token._.wordnet.wordnet_domains()

# Imagine we want to enrich the following sentence with synonyms
sentence = nlp('I want to withdraw 5,000 euros')

# spaCy WordNet lets you find synonyms by domain of interest
# for example economy
economy_domains = ['finance', 'banking']
enriched_sentence = []

# For each token in the sentence
for token in sentence:
# We get those synsets within the desired domains
synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains)
if synsets:
lemmas_for_synset = []
for s in synsets:
# If we found a synset in the economy domains
# we get the variants and add them to the enriched sentence
lemmas_for_synset.extend(s.lemma_names())
enriched_sentence.append('({})'.format('|'.join(set(lemmas_for_synset))))
else:
enriched_sentence.append(token.text)

# Let's see our enriched sentence
print(' '.join(enriched_sentence))
# >> I (need|want|require) to (draw|withdraw|draw_off|take_out) 5,000 euros

````

3 changes: 2 additions & 1 deletion setup.cfg
@@ -25,7 +25,8 @@ include_package_data = True
# DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD!
setup_requires = pyscaffold>=3.1a0,<3.2a0
# Add here dependencies of your project (semicolon/line-separated), e.g.
# install_requires = numpy; scipy
install_requires =
nltk>=3.3,<3.4
# The usage of test_requires is discouraged, see `Dependency Management` docs
# tests_require = pytest; pytest-cov
# Require a specific Python version, e.g. Python 2.7 or >= 3.4
6 changes: 3 additions & 3 deletions spacy_wordnet/wordnet_annotator.py
@@ -1,20 +1,20 @@
from spacy.tokens.doc import Doc
from spacy.tokens.token import Token

from spacy_wordnet.wordnet_domains import Wordnet, _WordnetDomains
from spacy_wordnet.wordnet_domains import Wordnet, load_wordnet_domains


class WordnetAnnotator(object):
__FIELD = 'wordnet'

def __init__(self, lang: str = 'es'):
Token.set_extension(WordnetAnnotator.__FIELD, default=None, force=True)
self.__wn_domains = _WordnetDomains()
load_wordnet_domains()
self.__lang = lang

def __call__(self, doc: Doc):
for token in doc:
wordnet = Wordnet(token=token, wn_domains=self.__wn_domains, lang=self.__lang)
wordnet = Wordnet(token=token, lang=self.__lang)
token._.set(WordnetAnnotator.__FIELD, wordnet)

return doc
24 changes: 10 additions & 14 deletions spacy_wordnet/wordnet_domains.py
@@ -11,8 +11,7 @@
def wordnet_domains_path() -> str:
return __WN_DOMAINS_PATH


def load_wordnet_domains(path: str):
def load_wordnet_domains(path: Optional[str] = wordnet_domains_path()):
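    # The domains file is parsed only once; once the module-level cache is populated this returns immediately.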
if __WN_DOMAINS_BY_SSID:
return

@@ -25,19 +24,9 @@ def get_domains_for_synset(synset: Synset) -> List[str]:
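    # WordNet Domains keys each synset by '<zero-padded 8-digit offset>-<POS>'.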
ssid = '{}-{}'.format(str(synset.offset()).zfill(8), synset.pos())
return __WN_DOMAINS_BY_SSID.get(ssid, [])


class _WordnetDomains(object):
def __init__(self):
load_wordnet_domains(wordnet_domains_path())

def domains_for_synset(self, synset: Synset) -> List[str]:
return get_domains_for_synset(synset)


class Wordnet(object):

def __init__(self, token: Token, wn_domains: _WordnetDomains, lang: str = 'es'):
self.__wn_domains = wn_domains
def __init__(self, token: Token, lang: str = 'es'):
self.__token = token
self.__lang = fetch_wordnet_lang(lang)
self.__synsets = self.__find_synsets(token, self.__lang)
@@ -53,9 +42,12 @@ def lemmas(self):
def wordnet_domains(self):
return self.__wordnet_domains

def wordnet_domains_for_synset(self, synset):
def wordnet_domains_for_synset(self, synset: Synset):
return get_domains_for_synset(synset)

def wordnet_synsets_for_domain(self, domains: List[str]):
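        # Keep only this token's synsets tagged with at least one of the requested domains.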
return [synset for synset in self.synsets() if self.__has_domains(synset, domains)]

@staticmethod
def __find_synsets(token: Token, lang: str):
word_variants = [token.text]
@@ -70,6 +62,10 @@ def __find_synsets(token: Token, lang: str):

return []

@staticmethod
def __has_domains(synset: Synset, domains: List[str]) -> bool:
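        # True when the synset's domains overlap the requested domains.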
return not set(domains).isdisjoint(get_domains_for_synset(synset))

def __find_wordnet_domains(self):
return [domain for synset in self.synsets() for domain in get_domains_for_synset(synset)]

46 changes: 36 additions & 10 deletions tests/test_wordnet_annotator.py
@@ -1,32 +1,58 @@
import unittest

import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator


class WordnetAnnotatorTest(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

self.nlp_en = spacy.load('en')
self.nlp_es = spacy.load('es')

# Add wordnet component
self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en.lang))
self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es.lang))

def test_english_annotations(self):
nlp = spacy.load('en')
nlp.add_pipe(WordnetAnnotator(nlp.lang))

token = nlp('contracts')[0]
token = self.nlp_en('contracts')[0]

assert token._.wordnet.synsets()
assert token._.wordnet.lemmas()
assert token._.wordnet.wordnet_domains()

del nlp
def test_generate_variants_from_domain_list(self):

def test_spanish_annotations(self):
nlp = spacy.load('es')
nlp.add_pipe(WordnetAnnotator(nlp.lang))
economy_domains = ['finance', 'banking']
enriched_sentence = []

sentence = self.nlp_en('I want to withdraw 5,000 euros')

for token in sentence:
synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains)

token = nlp('contratos')[0]
if synsets:
lemmas_for_synset = []
for s in synsets:
lemmas_for_synset.extend(s.lemma_names())
enriched_sentence.append('({})'.format('|'.join(set(lemmas_for_synset))))
else:
enriched_sentence.append(token.text)
print(' '.join(enriched_sentence))

def test_spanish_annotations(self):
token = self.nlp_es('contratos')[0]

assert token._.wordnet.synsets()
assert token._.wordnet.lemmas()
assert token._.wordnet.wordnet_domains()

del nlp
