Skip to content

Commit

Permalink
Fix output on missing fields
Browse files Browse the repository at this point in the history
  • Loading branch information
rominf committed May 18, 2024
1 parent 83466dd commit 6a4a215
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 4 deletions.
8 changes: 4 additions & 4 deletions spacy_conll/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,12 +200,12 @@ def _set_token_conll(self, token: Token, token_idx: int = 1) -> Token:
token_conll = (
token_idx,
token.text,
token.lemma_,
token.pos_,
token.tag_,
token.lemma_ if token.lemma_ else "_",
token.pos_ if token.pos_ else "_",
token.tag_ if token.tag_ else "_",
str(token.morph) if token.has_morph and str(token.morph) else "_",
head_idx,
token.dep_,
token.dep_ if token.dep_ else "_",
token._.conll_deps_graphs_field,
token._.conll_misc_field,
)
Expand Down
25 changes: 25 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from pathlib import Path

import pytest
from spacy.tokens import Doc, Token
from spacy.tokens.underscore import Underscore
from spacy.vocab import Vocab

from spacy_conll import init_parser

Expand Down Expand Up @@ -153,3 +155,26 @@ def pretokenized_conllparser_conllstr(pretokenized_conllparser):
def conllparser_parse_conllfile(spacy_vanila):
    """Parse the CoNLL-U sample file that sits next to this module.

    A ``ConllParser`` is built around ``spacy_vanila`` and asked to read
    ``en_ewt-ud-dev.conllu-sample.txt`` (decoded as UTF-8); whatever
    ``parse_conll_as_spacy`` returns is handed back unchanged.
    """
    parser = ConllParser(spacy_vanila)
    sample_file = Path(__file__).parent / "en_ewt-ud-dev.conllu-sample.txt"
    return parser.parse_conll_as_spacy(sample_file, input_encoding="utf-8")


@pytest.fixture
def spacy_vocab():
    """Provide a ``Vocab`` whose string store is seeded with "hello" and "world"."""
    seed_strings = ["hello", "world"]
    return Vocab(strings=seed_strings)


@pytest.fixture
def spacy_doc(spacy_vocab):
    """Build a three-token ``Doc`` ("hello", "world", "!") as a single sentence.

    Only words, trailing-space flags and sentence starts are passed to the
    ``Doc`` constructor; no lemma, POS, tag, morphology or dependency
    arguments are supplied.
    """
    return Doc(
        spacy_vocab,
        words=["hello", "world", "!"],
        # Only "hello" is followed by a space.
        spaces=[True, False, False],
        # The first token opens the one and only sentence.
        sent_starts=[True, False, False],
    )


@pytest.fixture
def spacy_token(spacy_vocab, spacy_doc):
    """Construct a ``Token`` at offset 1 of ``spacy_doc`` using ``spacy_vocab``."""
    token_offset = 1
    return Token(spacy_vocab, spacy_doc, token_offset)
24 changes: 24 additions & 0 deletions tests/test_formatter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from collections import OrderedDict

from spacy_conll.formatter import ConllFormatter
from spacy.tokens import Token


def test_set_token_conll(spacy_token: Token):
    """Test for https://github.com/BramVanroy/spacy_conll/issues/29

    Checks that the LEMMA, UPOS, XPOS, FEATS, DEPREL and DEPS columns of the
    fixture token are rendered as "_" in both the ``conll`` mapping and the
    tab-separated ``conll_str`` line.
    """
    expected_fields = OrderedDict(
        [
            ("ID", 1),
            ("FORM", "world"),
            ("LEMMA", "_"),
            ("UPOS", "_"),
            ("XPOS", "_"),
            ("FEATS", "_"),
            ("HEAD", 2),
            ("DEPREL", "_"),
            ("DEPS", "_"),
            ("MISC", "SpaceAfter=No"),
        ]
    )
    expected_line = "1\tworld\t_\t_\t_\t_\t2\t_\t_\tSpaceAfter=No\n"

    formatter = ConllFormatter()

    # The formatter is invoked separately for each check.
    conll_fields = formatter._set_token_conll(spacy_token)._.get("conll")
    assert conll_fields == expected_fields

    conll_line = formatter._set_token_conll(spacy_token)._.get("conll_str")
    assert conll_line == expected_line

0 comments on commit 6a4a215

Please sign in to comment.