/
FinnishPOSTagger.py
139 lines (97 loc) · 3.93 KB
/
FinnishPOSTagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
# coding: utf-8
"""
POS-tagging for Finnish Helsinki corpora
http://www.ling.helsinki.fi/kieliteknologia/tutkimus/treebank/index.shtml
"""
import re
from string import punctuation
import pandas as pd
from seqlearn.datasets import load_conll
from seqlearn.evaluation import whole_sequence_accuracy
from seqlearn.perceptron import StructuredPerceptron
from sklearn.metrics.classification import accuracy_score, f1_score, classification_report
# A token (or a single character) made up solely of the letters a-z, any case.
eng_pattern = re.compile(r'^[a-z]+$', re.IGNORECASE)

# Two or more consecutive capitals ("NATO") or dotted capitals ("U.S.").
abbr_pattern = re.compile(r'([A-Z]{2,}|([A-Z]\.)+)')

# A string ending in ll/lt/ss/st followed by a/ä, any case
# (presumably the Finnish locative case endings - confirm with the author).
case_endings = re.compile(r'(l[lt]|s[st])[aä]$', re.IGNORECASE)

# Characters treated as Finnish vowels by is_vowel().
FI_VOWELS = "aeäöiouy"
def is_vowel(symbol):
    """
    Tell whether ``symbol`` occurs in ``FI_VOWELS``.

    :param symbol: string to look up (normally a single lowercase character;
        the check is a plain substring test, so it is case-sensitive)
    :return: True if ``symbol`` is found in ``FI_VOWELS``, False otherwise
    """
    return FI_VOWELS.find(symbol) != -1
def get_word_len(seq):
    """
    Return the length of ``seq`` rendered as a decimal string.

    :param seq: any sized object, normally a word form
    :return: ``len(seq)`` converted to ``str`` so it can be concatenated
        into a feature name
    """
    length = len(seq)
    return str(length)
def digits_count(seq):
    """
    Count the digit characters in ``seq``.

    :param seq: word form to inspect
    :return: number of characters for which ``str.isdigit()`` is true
    """
    # Only characters that pass isdigit() are counted; taking the length of
    # the list of isdigit() results would count every character instead.
    return sum(1 for ch in seq if ch.isdigit())
def non_alphabet_count(seq):
    """
    Count the characters of ``seq`` that lie outside the English alphabet.

    Punctuation (``string.punctuation``) and digits are skipped, so what
    remains are characters such as "ä" or "ö" that ``eng_pattern`` rejects.

    :param seq: word form to inspect
    :return: number of characters that are not punctuation, not digits and
        not matched by ``eng_pattern``
    """
    return sum(
        1
        for ch in seq
        if ch not in punctuation
        and not ch.isdigit()
        # Negated on purpose: the feature counts NON-English characters.
        and not eng_pattern.match(ch)
    )
def get_word_shape(seq):
    """
    Map every character of ``seq`` to a shape symbol and join the result.

    Mapping, checked in this order:

    * one of ``BCGFWXZ`` -> ``C``; one of ``bcgfwxz`` -> ``c``
    * ``A``, ``D``-``V``, ``Y``, ``Ä``, ``Ö`` -> ``X``; the lowercase
      counterparts -> ``x``
    * a digit -> ``d``
    * anything else is kept unchanged

    :param seq: word form to convert
    :return: shape string with one symbol per input character
    """

    def shape_symbol(ch):
        # The two special letter groups are tested before the general
        # letter classes because F and G also fall inside the D-V range.
        if ch in 'BCGFWXZ':
            return 'C'
        if ch in 'bcgfwxz':
            return 'c'
        if re.match(r'^[AD-VYÄÖ]$', ch):
            return 'X'
        if re.match(r'^[ad-vyäö]$', ch):
            return 'x'
        if ch.isdigit():
            return 'd'
        return ch

    return ''.join(map(shape_symbol, seq))
def get_short_word_shape(seq):
    """
    Return the word shape of ``seq`` with repeated symbols squeezed.

    The shape is produced by ``get_word_shape``; every run of two or more
    identical word characters in it is then replaced by a single one.

    :param seq: word form to convert
    :return: compressed shape string
    """
    full_shape = get_word_shape(seq)
    return re.sub(r"(\w)(\1)+", r"\1", full_shape)
def features(sequence, i):
    """
    Generate string features for the token at position ``i`` of a sentence.

    The word form is the text before the first tab of ``sequence[i]``; the
    immediate neighbours contribute context features when they exist.

    :param sequence: list of token strings for one sentence
        (tab-separated columns, word form first)
    :param i: index of the token to describe
    :return: generator yielding feature strings
    """
    seq = sequence[i].split("\t")[0]
    last_index = len(sequence) - 1

    # position in the sentence
    if i == 0:
        yield "first"
    if i == last_index:
        yield "last"

    # word's length
    yield "len=" + get_word_len(seq)

    # Up to four leading / trailing letters. Slicing already returns the
    # whole word when it is shorter, and the prefix is attached
    # unconditionally so short words never leak out as bare, unnamed
    # features.
    yield "first_letters=" + seq[:4]
    yield "last_letters=" + seq[-4:]

    # word shape
    yield "word_shape=" + get_word_shape(seq)
    yield "short_word_shape=" + get_short_word_shape(seq)

    yield "non_en_alphabet_count=" + str(non_alphabet_count(seq))
    yield "digits_count=" + str(digits_count(seq))

    if abbr_pattern.search(seq):
        yield "abbr"
    if seq.endswith('en'):
        yield "has_adj_ending"
    # search(), not match(): the pattern is anchored at the END of the word
    # only, and match() would additionally pin it to the start.
    if case_endings.search(seq):
        yield "ends_with_case"
    # endswith() with a tuple is also safe for an empty word form, where
    # indexing the last character would raise IndexError.
    if seq.endswith(('es', 'ed', 'p', 'r', 't')):
        yield "ends_with_foreign_consonants"

    if i > 0:
        prev = sequence[i - 1].split("\t")[0]
        # previous word's length
        yield "prev_len=" + get_word_len(prev)
        # last letters of the previous word
        yield "prev_last_letters=" + prev[-4:]
        yield "prev_word_shape=" + get_word_shape(prev)
        yield "prev_short_word_shape=" + get_short_word_shape(prev)

    if i < last_index:
        next_ = sequence[i + 1].split("\t")[0]
        # next word's length
        yield "next_len=" + get_word_len(next_)
        # last letters of the next word
        yield "next_last_letters=" + next_[-4:]
        yield "next_word_shape=" + get_word_shape(next_)
        yield "next_short_word_shape=" + get_short_word_shape(next_)
# Read the training set; the context manager guarantees the file is closed
# even if parsing fails.
with open("ftb1u-v1/ftb1u_train.tsv", "r") as train_file:
    X_train, y_train, lengths_train = load_conll(train_file, features)

clf = StructuredPerceptron(decode="viterbi", verbose=1)
print("Fitting model " + str(clf))
clf.fit(X_train, y_train, lengths_train)

print("\nPredictions on test set")
# Read the test set
with open("ftb1u-v1/ftb1u_test.tsv", "r") as test_file:
    X_test, y_test, lengths_test = load_conll(test_file, features)
y_pred = clf.predict(X_test, lengths_test)

# Sentence-level and token-level quality of the predictions
print("Whole seq accuracy ", whole_sequence_accuracy(y_test, y_pred, lengths_test))
print("Element-wise accuracy ", accuracy_score(y_test, y_pred))
print("Mean F1-score macro ", f1_score(y_test, y_pred, average="macro"))
print(classification_report(y_test, y_pred))
# Distribution of the predicted tags
print(pd.Series(y_pred).value_counts())