
feat: get synsets by domains of interest. Update README and tests

dvsrepo authored and Francisco Aranda committed Dec 14, 2018
1 parent e3bf746 commit ebda2859cc8b589f24b2c11ce536b0443c862a60
Showing with 99 additions and 39 deletions.
  1. +4 −0 .gitignore
  2. +44 −11 README.md
  3. +2 −1 setup.cfg
  4. +3 −3 spacy_wordnet/wordnet_annotator.py
  5. +10 −14 spacy_wordnet/wordnet_domains.py
  6. +36 −10 tests/test_wordnet_annotator.py
.gitignore
@@ -48,5 +48,9 @@ MANIFEST
 # Per-project virtualenvs
 .venv*/
 
+# vscode
+.history/
+.vscode/
+
 
 wordnet_domains/
README.md
@@ -1,27 +1,34 @@
-# Spacy wordnet annotator
+# spaCy WordNet
 
-`spacy-wordnet` creates annotations that easily allow the use of wordnet
-and [wordnet domains](http://wndomains.fbk.eu/) by using
-the [nltk wordnet interface](http://www.nltk.org/howto/wordnet.html)
+spaCy WordNet is a simple custom component for using [WordNet](https://wordnet.princeton.edu/), [MultiWordnet](http://multiwordnet.fbk.eu/english/home.php) and [WordNet domains](http://wndomains.fbk.eu/) with [spaCy](http://spacy.io).
+
+The component combines the [NLTK wordnet interface](http://www.nltk.org/howto/wordnet.html) with WordNet domains to allow users to:
+
+* Get all synsets for a processed token. For example, getting all the synsets (word senses) of the word ``bank`` (see the sketch after this list).
+* Get and filter synsets by domain. For example, getting synonyms of the verb ``withdraw`` in the financial domain.
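
A minimal sketch of the first bullet, assuming the component is wired into an English pipeline the same way the tests in this commit do it:

````python
import spacy

from spacy_wordnet.wordnet_annotator import WordnetAnnotator

nlp = spacy.load('en')
nlp.add_pipe(WordnetAnnotator(nlp.lang))

# 'bank' is ambiguous: each synset returned is one word sense
token = nlp('bank')[0]
print(token._.wordnet.synsets())
````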

-## Install
+## Getting started
+The spaCy WordNet component can be easily integrated into spaCy pipelines. You just need the following:
+
+### Prerequisites
 
-````bash
-pip install spacy-wordnet
-````
+* Python 3.x
+* spaCy
 
-## Requirements
-Some nltk data must be installed before using this package
+You also need to install the following NLTK wordnet data:
 
 ````bash
 python -m nltk.downloader wordnet
 python -m nltk.downloader omw
 ````
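
The same data can also be fetched from Python code; a minimal equivalent of the two commands above, using NLTK's standard downloader API:

````python
import nltk

# Same corpora as the command-line downloads above
nltk.download('wordnet')
nltk.download('omw')
````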
+### Install
+
+````bash
+pip install spacy-wordnet
+````
 
 
 ## Usage
 
 ````python
@@ -42,6 +49,32 @@ token._.wordnet.lemmas()
 
 # And automatically tags with wordnet domains
 token._.wordnet.wordnet_domains()
+
+# Imagine we want to enrich the following sentence with synonyms
+sentence = nlp('I want to withdraw 5,000 euros')
+
+# spaCy WordNet lets you find synonyms by domain of interest
+# for example economy
+economy_domains = ['finance', 'banking']
+enriched_sentence = []
+
+# For each token in the sentence
+for token in sentence:
+    # We get those synsets within the desired domains
+    synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains)
+    if synsets:
+        lemmas_for_synset = []
+        for s in synsets:
+            # If we found a synset in the economy domains
+            # we get the variants and add them to the enriched sentence
+            lemmas_for_synset.extend(s.lemma_names())
+        enriched_sentence.append('({})'.format('|'.join(set(lemmas_for_synset))))
+    else:
+        enriched_sentence.append(token.text)
+
+# Let's see our enriched sentence
+print(' '.join(enriched_sentence))
+# >> I (need|want|require) to (draw|withdraw|draw_off|take_out) 5,000 euros
 ````

setup.cfg
@@ -25,7 +25,8 @@ include_package_data = True
 # DON'T CHANGE THE FOLLOWING LINE! IT WILL BE UPDATED BY PYSCAFFOLD!
 setup_requires = pyscaffold>=3.1a0,<3.2a0
 # Add here dependencies of your project (semicolon/line-separated), e.g.
-# install_requires = numpy; scipy
+install_requires =
+    nltk>=3.3,<3.4
 # The usage of test_requires is discouraged, see `Dependency Management` docs
 # tests_require = pytest; pytest-cov
 # Require a specific Python version, e.g. Python 2.7 or >= 3.4
spacy_wordnet/wordnet_annotator.py
@@ -1,20 +1,20 @@
 from spacy.tokens.doc import Doc
 from spacy.tokens.token import Token
 
-from spacy_wordnet.wordnet_domains import Wordnet, _WordnetDomains
+from spacy_wordnet.wordnet_domains import Wordnet, load_wordnet_domains
 
 
 class WordnetAnnotator(object):
     __FIELD = 'wordnet'
 
     def __init__(self, lang: str = 'es'):
         Token.set_extension(WordnetAnnotator.__FIELD, default=None, force=True)
-        self.__wn_domains = _WordnetDomains()
+        load_wordnet_domains()
         self.__lang = lang
 
     def __call__(self, doc: Doc):
         for token in doc:
-            wordnet = Wordnet(token=token, wn_domains=self.__wn_domains, lang=self.__lang)
+            wordnet = Wordnet(token=token, lang=self.__lang)
             token._.set(WordnetAnnotator.__FIELD, wordnet)
 
         return doc
spacy_wordnet/wordnet_domains.py
@@ -11,8 +11,7 @@
 def wordnet_domains_path() -> str:
     return __WN_DOMAINS_PATH
 
 
-def load_wordnet_domains(path: str):
+def load_wordnet_domains(path: Optional[str] = wordnet_domains_path()):
     if __WN_DOMAINS_BY_SSID:
         return
 
@@ -25,19 +24,9 @@ def get_domains_for_synset(synset: Synset) -> List[str]:
     ssid = '{}-{}'.format(str(synset.offset()).zfill(8), synset.pos())
     return __WN_DOMAINS_BY_SSID.get(ssid, [])
 
 
-class _WordnetDomains(object):
-    def __init__(self):
-        load_wordnet_domains(wordnet_domains_path())
-
-    def domains_for_synset(self, synset: Synset) -> List[str]:
-        return get_domains_for_synset(synset)
-
-
 class Wordnet(object):
 
-    def __init__(self, token: Token, wn_domains: _WordnetDomains, lang: str = 'es'):
-        self.__wn_domains = wn_domains
+    def __init__(self, token: Token, lang: str = 'es'):
         self.__token = token
         self.__lang = fetch_wordnet_lang(lang)
         self.__synsets = self.__find_synsets(token, self.__lang)
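
For reference, the `ssid` built by `get_domains_for_synset` above is the synset's zero-padded WordNet offset joined with its part-of-speech tag; that string is the key into the WordNet Domains mapping. A small sketch (the synset and printed value are examples against WordNet 3.0 data):

````python
from nltk.corpus import wordnet as wn

synset = wn.synset('bank.n.01')
ssid = '{}-{}'.format(str(synset.offset()).zfill(8), synset.pos())
print(ssid)  # e.g. '09213565-n': 8-digit offset, then POS
````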
@@ -53,9 +42,12 @@ def lemmas(self):
     def wordnet_domains(self):
         return self.__wordnet_domains
 
-    def wordnet_domains_for_synset(self, synset):
+    def wordnet_domains_for_synset(self, synset: Synset):
         return get_domains_for_synset(synset)
 
+    def wordnet_synsets_for_domain(self, domains: List[str]):
+        return [synset for synset in self.synsets() if self.__has_domains(synset, domains)]
+
     @staticmethod
     def __find_synsets(token: Token, lang: str):
         word_variants = [token.text]
@@ -70,6 +62,10 @@ def __find_synsets(token: Token, lang: str):
 
         return []
 
+    @staticmethod
+    def __has_domains(synset: Synset, domains: List[str]) -> bool:
+        return not set(domains).isdisjoint(get_domains_for_synset(synset))
+
     def __find_wordnet_domains(self):
         return [domain for synset in self.synsets() for domain in get_domains_for_synset(synset)]
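
The new `__has_domains` helper is a plain set-overlap test: it returns True when the synset shares at least one domain with the requested list. A tiny illustration with made-up domain lists:

````python
requested = {'finance', 'banking'}
synset_domains = {'banking', 'money'}  # hypothetical domains for a synset

# True because 'banking' appears in both sets
print(not requested.isdisjoint(synset_domains))
````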

tests/test_wordnet_annotator.py
@@ -1,32 +1,58 @@
 import unittest
+from collections import defaultdict
 
 import spacy
 
+import numpy as np
+
+from itertools import product
+
 from spacy_wordnet.wordnet_annotator import WordnetAnnotator
 
 
 class WordnetAnnotatorTest(unittest.TestCase):
 
+    def __init__(self, *args, **kwargs):
+
+        super().__init__(*args, **kwargs)
+
+        self.nlp_en = spacy.load('en')
+        self.nlp_es = spacy.load('es')
+
+        # Add wordnet component
+        self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en.lang))
+        self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es.lang))
+
     def test_english_annotations(self):
-        nlp = spacy.load('en')
-        nlp.add_pipe(WordnetAnnotator(nlp.lang))
-
-        token = nlp('contracts')[0]
+        token = self.nlp_en('contracts')[0]
 
         assert token._.wordnet.synsets()
         assert token._.wordnet.lemmas()
         assert token._.wordnet.wordnet_domains()
 
-        del nlp
+    def test_generate_variants_from_domain_list(self):
 
-    def test_spanish_annotations(self):
-        nlp = spacy.load('es')
-        nlp.add_pipe(WordnetAnnotator(nlp.lang))
+        economy_domains = ['finance', 'banking']
+        enriched_sentence = []
 
-        token = nlp('contratos')[0]
+        sentence = self.nlp_en('I want to withdraw 5,000 euros')
+
+        for token in sentence:
+            synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains)
+
+            if synsets:
+                lemmas_for_synset = []
+                for s in synsets:
+                    lemmas_for_synset.extend(s.lemma_names())
+                enriched_sentence.append('({})'.format('|'.join(set(lemmas_for_synset))))
+            else:
+                enriched_sentence.append(token.text)
+
+        print(' '.join(enriched_sentence))
+
+    def test_spanish_annotations(self):
+        token = self.nlp_es('contratos')[0]
 
         assert token._.wordnet.synsets()
         assert token._.wordnet.lemmas()
         assert token._.wordnet.wordnet_domains()
-
-        del nlp
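
The new `test_generate_variants_from_domain_list` prints its result rather than asserting on it. A hedged sketch of a loose assertion that could replace the print (exact lemma sets vary with the installed WordNet data, so only the original verb is checked):

````python
# Inside the test, after the for-loop:
enriched = ' '.join(enriched_sentence)
assert 'withdraw' in enriched  # the verb survives inside its synonym group
````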
