# Chapter 12: Words, Parts of Speech, and Morphology
## CoNLL file readers and writers.
Use a class modeled as a vectorizer


Programs from the book: [_Python for Natural Language Processing_](https://link.springer.com/book/9783031575488)

__Author__: Pierre Nugues

## Modules

In [1]:
import regex as re
from urllib.request import urlopen
import json

In [2]:
CORPUS = 'English'

In [3]:
prefix = 'https://raw.githubusercontent.com/UniversalDependencies/'
corpus_suffixes = ['train.conllu', 'dev.conllu', 'test.conllu']
if CORPUS == 'English':
    url = prefix + 'UD_English-EWT/master/en_ewt-ud-'
elif CORPUS == 'French':
    url = prefix + 'UD_French-GSD/master/fr_gsd-ud-'
elif CORPUS == 'Spanish':
    url = prefix + 'UD_Spanish-AnCora/master/es_ancora-ud-'
else:
    pass
[train_file, val_file, test_file] = map(lambda x: url + x, corpus_suffixes)

In [4]:
[train_file, val_file, test_file]

['https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-train.conllu',
 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-dev.conllu',
 'https://raw.githubusercontent.com/UniversalDependencies/UD_English-EWT/master/en_ewt-ud-test.conllu']

In [5]:
train_sentences = urlopen(train_file).read().decode('utf-8').strip()
val_sentences = urlopen(val_file).read().decode('utf-8').strip()
test_sentences = urlopen(test_file).read().decode('utf-8').strip()

In [6]:

class Token(dict):
    pass

In [7]:
class CoNLLDictorizer:

    def __init__(self, column_names,
                 sent_sep='\n\n',
                 col_sep='\t+'):
        self.column_names = column_names
        self.sent_sep = sent_sep
        self.col_sep = col_sep

    def fit(self):
        pass

    def transform(self, corpus):
        corpus = corpus.strip()
        sentences = re.split(self.sent_sep, corpus)
        return list(map(self._split_in_words, sentences))

    def fit_transform(self, corpus):
        return self.transform(corpus)

    def _split_in_words(self, sentence):
        rows = re.split('\n', sentence)
        rows = [row for row in rows if row[0] != '#']
        return [Token(dict(zip(self.column_names,
                               re.split(self.col_sep, row))))
                for row in rows]

In [8]:
col_names = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS',
             'HEAD', 'DEPREL', 'HEAD', 'DEPS', 'MISC']

# column_names = list(map(str.lower, column_names))

In [9]:
conll_dict = CoNLLDictorizer(col_names)
train_dict = conll_dict.transform(train_sentences)

Align the form with the lemma and features

In [10]:
train_dict[8131][:4]

[{'ID': '1',
  'FORM': 'Or',
  'LEMMA': 'or',
  'UPOS': 'CCONJ',
  'XPOS': 'CC',
  'FEATS': '_',
  'HEAD': '4:cc',
  'DEPREL': 'cc',
  'DEPS': '_'},
 {'ID': '2',
  'FORM': 'you',
  'LEMMA': 'you',
  'UPOS': 'PRON',
  'XPOS': 'PRP',
  'FEATS': 'Case=Nom|Person=2|PronType=Prs',
  'HEAD': '4:nsubj',
  'DEPREL': 'nsubj',
  'DEPS': '_'},
 {'ID': '3',
  'FORM': 'can',
  'LEMMA': 'can',
  'UPOS': 'AUX',
  'XPOS': 'MD',
  'FEATS': 'VerbForm=Fin',
  'HEAD': '4:aux',
  'DEPREL': 'aux',
  'DEPS': '_'},
 {'ID': '4',
  'FORM': 'visit',
  'LEMMA': 'visit',
  'UPOS': 'VERB',
  'XPOS': 'VB',
  'FEATS': 'VerbForm=Inf',
  'HEAD': '0:root',
  'DEPREL': 'root',
  'DEPS': '_'}]

In [11]:
for word in train_dict[8131]:
    print('FORM: {}'.format(word['FORM']))
    print('LEMMA: {} + {}'.format(word['LEMMA'], word['FEATS']))

FORM: Or
LEMMA: or + _
FORM: you
LEMMA: you + Case=Nom|Person=2|PronType=Prs
FORM: can
LEMMA: can + VerbForm=Fin
FORM: visit
LEMMA: visit + VerbForm=Inf
FORM: temples
LEMMA: temple + Number=Plur
FORM: or
LEMMA: or + _
FORM: shrines
LEMMA: shrine + Number=Plur
FORM: in
LEMMA: in + _
FORM: Okinawa
LEMMA: Okinawa + Number=Sing
FORM: .
LEMMA: . + _


In [12]:
with open('train.json', 'w') as f:
    json.dump(train_dict, f)

## Creating a token

In [13]:
# column_names = ['id', 'form', 'lemma', 'cpos', 'pos', 'feats']
tok = Token({'ID': '1', 'FORM': 'La', 'LEMMA': 'el',
            'CPOS': 'd', 'POS': 'da', 'FEATS': 'num=s|gen=f'})
print('Keys:', tok.keys())
print('The form:', tok['FORM'])
print('Is key form in token?', 'form' in tok)

Keys: dict_keys(['ID', 'FORM', 'LEMMA', 'CPOS', 'POS', 'FEATS'])
The form: La
Is key form in token? False


In [14]:
tok_dict = {'id': '1', 'form': 'La', 'lemma': 'el',
            'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'}
tok_dict2 = {'id': '1', 'form': 'La', 'lemma': 'el',
             'cpos': 'd', 'pos': 'da', 'feats': 'num=s|gen=f'}

tok_set = set(tok_dict)
print('Keys:', tok_set)
tok_set = tok_set.union(tok_dict2)
print(tok_set)

Keys: {'feats', 'cpos', 'pos', 'form', 'lemma', 'id'}
{'lemma', 'feats', 'cpos', 'pos', 'form', 'id'}


In [15]:
word_set = set(tok_dict.values())
print('Values:', list(word_set))

word_set = set(tok.values())
print(list(word_set))

word_set = set()
word_set.update(tok.values())
print('Values:', list(word_set))

word_set = set()
print("Token values:", tok.values())
word_set = word_set.union(set(tok.values()))
print('Values:', list(word_set))

Values: ['num=s|gen=f', 'La', 'da', 'd', '1', 'el']
['num=s|gen=f', 'La', 'da', 'd', '1', 'el']
Values: ['num=s|gen=f', 'La', 'da', 'd', '1', 'el']
Token values: dict_values(['1', 'La', 'el', 'd', 'da', 'num=s|gen=f'])
Values: ['num=s|gen=f', 'La', 'da', 'd', '1', 'el']


This function saves a file in the CoNLL format

In [16]:
def save_conll(file, corpus_dict, column_names):
    """
    Saves the corpus in a file
    :param file:
    :param corpus_dict:
    :param column_names:
    :return:
    """
    with open(file, 'w') as f_out:
        for sentence in corpus_dict:
            sentence_lst = []
            for row in sentence:
                items = map(lambda x: row.get(x, '_'), column_names)
                sentence_lst += '\t'.join(items) + '\n'
            sentence_lst += '\n'
            f_out.write(''.join(sentence_lst))

In [17]:
save_conll('out', train_dict, col_names)