feat: get synsets by domains of interest. Update README and tests
dvsrepo authored and frascuchon committed Dec 17, 2018
1 parent e3bf746 commit ebda285
Showing 6 changed files with 99 additions and 39 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -48,5 +48,9 @@ MANIFEST
# Per-project virtualenvs
.venv*/

# vscode
.history/
.vscode/


wordnet_domains/
55 changes: 44 additions & 11 deletions README.md
@@ -1,27 +1,34 @@
# Spacy wordnet annotator
# spaCy WordNet

`spacy-wordnet` creates annotations that easily allow the use of wordnet
and [wordnet domains](http://wndomains.fbk.eu/) by using
the [nltk wordnet interface](http://www.nltk.org/howto/wordnet.html)

spaCy WordNet is a simple custom component for using [WordNet](https://wordnet.princeton.edu/), [MultiWordnet](http://multiwordnet.fbk.eu/english/home.php) and [WordNet domains](http://wndomains.fbk.eu/) with [spaCy](http://spacy.io).

The component combines the [NLTK wordnet interface](http://www.nltk.org/howto/wordnet.html) with WordNet domains to allow users to:

* Get all synsets for a processed token. For example, getting all the synsets (word senses) of the word ``bank``.
* Get and filter synsets by domain. For example, getting synonyms of the verb ``withdraw`` in the financial domain (see the sketch below).
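
For instance, here is a minimal sketch of both calls (assuming the English model and the NLTK data from the sections below are already installed; single-word inputs keep the example short):

````python
import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator

nlp = spacy.load('en')
nlp.add_pipe(WordnetAnnotator(nlp.lang))

# All synsets (word senses) for 'bank'
token = nlp('bank')[0]
print(token._.wordnet.synsets())

# Only the senses of 'withdraw' tagged with the 'finance' domain
token = nlp('withdraw')[0]
print(token._.wordnet.wordnet_synsets_for_domain(['finance']))
````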

## Install

## Getting started
The spaCy WordNet component can be easily integrated into spaCy pipelines. You just need the following:
### Prerequisites

````bash
pip install spacy-wordnet
````
* Python 3.X
* spaCy

## Requirements
Some nltk data must be installed before using this package
You also need to install the following NLTK wordnet data:

````bash
python -m nltk.downloader wordnet
python -m nltk.downloader omw
````
### Install

````bash
pip install spacy-wordnet
````



## Usage

````python
@@ -42,6 +49,32 @@ token._.wordnet.lemmas()

# And automatically tags with wordnet domains
token._.wordnet.wordnet_domains()

# Imagine we want to enrich the following sentence with synonyms
sentence = nlp('I want to withdraw 5,000 euros')

# spaCy WordNet lets you find synonyms by domain of interest
# for example economy
economy_domains = ['finance', 'banking']
enriched_sentence = []

# For each token in the sentence
for token in sentence:
# We get those synsets within the desired domains
synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains)
if synsets:
lemmas_for_synset = []
for s in synsets:
# If we found a synset in the economy domains
# we get the variants and add them to the enriched sentence
lemmas_for_synset.extend(s.lemma_names())
enriched_sentence.append('({})'.format('|'.join(set(lemmas_for_synset))))
else:
enriched_sentence.append(token.text)

# Let's see our enriched sentence
print(' '.join(enriched_sentence))
# >> I (need|want|require) to (draw|withdraw|draw_off|take_out) 5,000 euros

````

3 changes: 2 additions & 1 deletion setup.cfg
@@ -25,7 +25,8 @@ include_package_data = True
# DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD!
setup_requires = pyscaffold>=3.1a0,<3.2a0
# Add here dependencies of your project (semicolon/line-separated), e.g.
# install_requires = numpy; scipy
install_requires =
nltk>=3.3,<3.4
# The usage of test_requires is discouraged, see `Dependency Management` docs
# tests_require = pytest; pytest-cov
# Require a specific Python version, e.g. Python 2.7 or >= 3.4
6 changes: 3 additions & 3 deletions spacy_wordnet/wordnet_annotator.py
@@ -1,20 +1,20 @@
from spacy.tokens.doc import Doc
from spacy.tokens.token import Token

from spacy_wordnet.wordnet_domains import Wordnet, _WordnetDomains
from spacy_wordnet.wordnet_domains import Wordnet, load_wordnet_domains


class WordnetAnnotator(object):
__FIELD = 'wordnet'

def __init__(self, lang: str = 'es'):
Token.set_extension(WordnetAnnotator.__FIELD, default=None, force=True)
self.__wn_domains = _WordnetDomains()
load_wordnet_domains()
self.__lang = lang

def __call__(self, doc: Doc):
for token in doc:
wordnet = Wordnet(token=token, wn_domains=self.__wn_domains, lang=self.__lang)
wordnet = Wordnet(token=token, lang=self.__lang)
token._.set(WordnetAnnotator.__FIELD, wordnet)

return doc
24 changes: 10 additions & 14 deletions spacy_wordnet/wordnet_domains.py
@@ -11,8 +11,7 @@
def wordnet_domains_path() -> str:
return __WN_DOMAINS_PATH


def load_wordnet_domains(path: str):
def load_wordnet_domains(path: Optional[str] = wordnet_domains_path()):
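    # The domains file is parsed only once; once the module-level cache is populated this returns immediately.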
if __WN_DOMAINS_BY_SSID:
return

@@ -25,19 +24,9 @@ def get_domains_for_synset(synset: Synset) -> List[str]:
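    # WordNet Domains keys each synset by '<zero-padded 8-digit offset>-<POS>'.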
ssid = '{}-{}'.format(str(synset.offset()).zfill(8), synset.pos())
return __WN_DOMAINS_BY_SSID.get(ssid, [])


class _WordnetDomains(object):
def __init__(self):
load_wordnet_domains(wordnet_domains_path())

def domains_for_synset(self, synset: Synset) -> List[str]:
return get_domains_for_synset(synset)


class Wordnet(object):

def __init__(self, token: Token, wn_domains: _WordnetDomains, lang: str = 'es'):
self.__wn_domains = wn_domains
def __init__(self, token: Token, lang: str = 'es'):
self.__token = token
self.__lang = fetch_wordnet_lang(lang)
self.__synsets = self.__find_synsets(token, self.__lang)
@@ -53,9 +42,12 @@ def lemmas(self):
def wordnet_domains(self):
return self.__wordnet_domains

def wordnet_domains_for_synset(self, synset):
def wordnet_domains_for_synset(self, synset: Synset):
return get_domains_for_synset(synset)

def wordnet_synsets_for_domain(self, domains: List[str]):
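        # Keep only this token's synsets tagged with at least one of the requested domains.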
return [synset for synset in self.synsets() if self.__has_domains(synset, domains)]

@staticmethod
def __find_synsets(token: Token, lang: str):
word_variants = [token.text]
@@ -70,6 +62,10 @@ def __find_synsets(token: Token, lang: str):

return []

@staticmethod
def __has_domains(synset: Synset, domains: List[str]) -> bool:
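        # True when the synset's domains overlap the requested domains.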
return not set(domains).isdisjoint(get_domains_for_synset(synset))

def __find_wordnet_domains(self):
return [domain for synset in self.synsets() for domain in get_domains_for_synset(synset)]

46 changes: 36 additions & 10 deletions tests/test_wordnet_annotator.py
@@ -1,32 +1,58 @@
import unittest

import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator


class WordnetAnnotatorTest(unittest.TestCase):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

self.nlp_en = spacy.load('en')
self.nlp_es = spacy.load('es')

# Add wordnet component
self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en.lang))
self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es.lang))

def test_english_annotations(self):
nlp = spacy.load('en')
nlp.add_pipe(WordnetAnnotator(nlp.lang))

token = nlp('contracts')[0]
token = self.nlp_en('contracts')[0]

assert token._.wordnet.synsets()
assert token._.wordnet.lemmas()
assert token._.wordnet.wordnet_domains()

del nlp
def test_generate_variants_from_domain_list(self):

def test_spanish_annotations(self):
nlp = spacy.load('es')
nlp.add_pipe(WordnetAnnotator(nlp.lang))
economy_domains = ['finance', 'banking']
enriched_sentence = []

sentence = self.nlp_en('I want to withdraw 5,000 euros')

for token in sentence:
synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains)

token = nlp('contratos')[0]
if synsets:
lemmas_for_synset = []
for s in synsets:
lemmas_for_synset.extend(s.lemma_names())
enriched_sentence.append('({})'.format('|'.join(set(lemmas_for_synset))))
else:
enriched_sentence.append(token.text)
print(' '.join(enriched_sentence))

def test_spanish_annotations(self):
token = self.nlp_es('contratos')[0]

assert token._.wordnet.synsets()
assert token._.wordnet.lemmas()
assert token._.wordnet.wordnet_domains()

del nlp
