Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: get synsets by domains of interest. Update README and tests
- Loading branch information
1 parent
e3bf746
commit ebda285
Showing 6 changed files with 99 additions and 39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,5 +48,9 @@ MANIFEST | |
# Per-project virtualenvs | ||
.venv*/ | ||
|
||
# vscode | ||
.history/ | ||
.vscode/ | ||
|
||
|
||
wordnet_domains/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,20 @@ | ||
from spacy.tokens.doc import Doc | ||
from spacy.tokens.token import Token | ||
|
||
from spacy_wordnet.wordnet_domains import Wordnet, _WordnetDomains | ||
from spacy_wordnet.wordnet_domains import Wordnet, load_wordnet_domains | ||
|
||
|
||
class WordnetAnnotator(object): | ||
__FIELD = 'wordnet' | ||
|
||
def __init__(self, lang: str = 'es'): | ||
Token.set_extension(WordnetAnnotator.__FIELD, default=None, force=True) | ||
self.__wn_domains = _WordnetDomains() | ||
load_wordnet_domains() | ||
self.__lang = lang | ||
|
||
def __call__(self, doc: Doc): | ||
for token in doc: | ||
wordnet = Wordnet(token=token, wn_domains=self.__wn_domains, lang=self.__lang) | ||
wordnet = Wordnet(token=token, lang=self.__lang) | ||
token._.set(WordnetAnnotator.__FIELD, wordnet) | ||
|
||
return doc |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,58 @@ | ||
import unittest | ||
from collections import defaultdict | ||
|
||
import spacy | ||
|
||
import numpy as np | ||
|
||
from itertools import product | ||
|
||
from spacy_wordnet.wordnet_annotator import WordnetAnnotator | ||
|
||
|
||
class WordnetAnnotatorTest(unittest.TestCase): | ||
|
||
def __init__(self, *args, **kwargs): | ||
|
||
super().__init__(*args, **kwargs) | ||
|
||
self.nlp_en = spacy.load('en') | ||
self.nlp_es = spacy.load('es') | ||
|
||
# Add wordnet component | ||
self.nlp_en.add_pipe(WordnetAnnotator(self.nlp_en.lang)) | ||
self.nlp_es.add_pipe(WordnetAnnotator(self.nlp_es.lang)) | ||
|
||
def test_english_annotations(self): | ||
nlp = spacy.load('en') | ||
nlp.add_pipe(WordnetAnnotator(nlp.lang)) | ||
|
||
token = nlp('contracts')[0] | ||
token = self.nlp_en('contracts')[0] | ||
|
||
assert token._.wordnet.synsets() | ||
assert token._.wordnet.lemmas() | ||
assert token._.wordnet.wordnet_domains() | ||
|
||
del nlp | ||
def test_generate_variants_from_domain_list(self): | ||
|
||
def test_spanish_annotations(self): | ||
nlp = spacy.load('es') | ||
nlp.add_pipe(WordnetAnnotator(nlp.lang)) | ||
economy_domains = ['finance', 'banking'] | ||
enriched_sentence = [] | ||
|
||
sentence = self.nlp_en('I want to withdraw 5,000 euros') | ||
|
||
for token in sentence: | ||
synsets = token._.wordnet.wordnet_synsets_for_domain(economy_domains) | ||
|
||
token = nlp('contratos')[0] | ||
if synsets: | ||
lemmas_for_synset = [] | ||
for s in synsets: | ||
lemmas_for_synset.extend(s.lemma_names()) | ||
enriched_sentence.append('({})'.format('|'.join(set(lemmas_for_synset)))) | ||
else: | ||
enriched_sentence.append(token.text) | ||
print(' '.join(enriched_sentence)) | ||
|
||
def test_spanish_annotations(self): | ||
token = self.nlp_es('contratos')[0] | ||
|
||
assert token._.wordnet.synsets() | ||
assert token._.wordnet.lemmas() | ||
assert token._.wordnet.wordnet_domains() | ||
|
||
del nlp |