-
Notifications
You must be signed in to change notification settings - Fork 223
/
_dumpparser.py
88 lines (68 loc) · 2.62 KB
/
_dumpparser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import re
class ParsedDump(object):
    """
    Container for the parameters of a CRFsuite model.

    :meth:`pycrfsuite.Tagger.info()` returns objects of this type.
    Every attribute starts out as an empty dict and is filled in by
    whoever builds the object.

    Attributes
    ----------
    transitions : dict
        Learned transition weights, ``{(from_label, to_label): weight}``
    state_features : dict
        Learned ``(attribute, label)`` weights,
        ``{(attribute, label): weight}``
    header : dict
        Metadata taken from the file header
    labels : dict
        Model labels, ``{name: internal_id}``; the dump parser in this
        module stores the ids as the digit strings read from the dump
    attributes : dict
        Known attributes, ``{name: internal_id}``; ids are stored the
        same way as for ``labels``
    """

    def __init__(self):
        # Each attribute gets its own fresh mapping, so instances never
        # share state with one another.
        self.header = dict()
        self.labels = dict()
        self.attributes = dict()
        self.transitions = dict()
        self.state_features = dict()
class CRFsuiteDumpParser(object):
    """
    A hack: parser for `crfsuite dump` results.

    Obtaining coefficients "the proper way" is quite hard otherwise
    because in CRFsuite they are hidden in private structures.

    The dump is consumed one line at a time through :meth:`feed`; parsed
    values accumulate in :attr:`result`. Input that does not follow the
    expected layout raises :class:`ValueError` naming the offending line.

    Attributes
    ----------
    state : str or None
        Name of the section currently being read (``FILEHEADER``,
        ``LABELS``, ``ATTRIBUTES``, ``TRANSITIONS`` or ``STATE_FEATURES``),
        or ``None`` when the parser is between sections.
    result : ParsedDump
        Object collecting everything parsed so far.
    """

    # Patterns are compiled once here because ``feed`` runs for every line
    # of the dump.
    _SECTION_RE = re.compile(
        r"(FILEHEADER|LABELS|ATTRIBUTES|TRANSITIONS|STATE_FEATURES) = {"
    )
    _HEADER_RE = re.compile(r"(\w+): (.*)")
    _ID_NAME_RE = re.compile(r"(\d+): (.*)")
    # Besides plain decimals, accept "inf"/"nan" so that non-finite weights
    # do not abort parsing.
    # NOTE(review): assumes crfsuite prints weights with C's "%f" -- confirm.
    _WEIGHT_RE = re.compile(
        r"\(\d+\) (.+) --> (.+): ([+-]?(?:\d+\.\d+|inf|nan))"
    )

    def __init__(self):
        self.state = None
        self.result = ParsedDump()

    def feed(self, line):
        """
        Consume a single line of dump output.

        Blank lines are ignored. A ``NAME = {`` line opens a section, a
        lone ``}`` closes it, and every other line is handed to the
        ``parse_<section>`` method of the currently open section.

        Parameters
        ----------
        line : str
            One line of the dump, with or without its line terminator.

        Raises
        ------
        ValueError
            If a content line shows up while no section is open, or if
            the section's parser rejects the line.
        """
        # Strip initial ws and line terminator, but allow for ws at the end
        # of feature names.
        line = line.lstrip().rstrip('\r\n')
        if not line:
            return

        m = self._SECTION_RE.match(line)
        if m:
            self.state = m.group(1)
        elif line == '}':
            self.state = None
        elif self.state is None:
            # Without this guard the dispatch below would look up a
            # non-existent ``parse_None`` method.
            raise ValueError(
                "Unexpected line outside of any section: %r" % (line,)
            )
        else:
            getattr(self, 'parse_%s' % self.state)(line)

    @staticmethod
    def _match(pattern, line, section):
        """Match `line` against `pattern` or raise ValueError naming `section`."""
        m = pattern.match(line)
        if m is None:
            raise ValueError(
                "Malformed %s line in CRFsuite dump: %r" % (section, line)
            )
        return m

    @staticmethod
    def _require_known(name, known, kind, line):
        """Raise ValueError unless `name` is a key of the `known` mapping."""
        # An explicit check rather than ``assert`` so that validation is
        # still performed when Python runs with ``-O``.
        if name not in known:
            raise ValueError(
                "Unknown %s %r in CRFsuite dump line: %r" % (kind, name, line)
            )

    def parse_FILEHEADER(self, line):
        """Store a ``key: value`` header line in ``result.header``."""
        m = self._match(self._HEADER_RE, line, 'FILEHEADER')
        self.result.header[m.group(1)] = m.group(2)

    def parse_LABELS(self, line):
        """Store an ``id: name`` line in ``result.labels`` as ``{name: id}``."""
        m = self._match(self._ID_NAME_RE, line, 'LABELS')
        # The id is kept as the digit string captured from the line.
        self.result.labels[m.group(2)] = m.group(1)

    def parse_ATTRIBUTES(self, line):
        """Store an ``id: name`` line in ``result.attributes`` as ``{name: id}``."""
        m = self._match(self._ID_NAME_RE, line, 'ATTRIBUTES')
        # The id is kept as the digit string captured from the line.
        self.result.attributes[m.group(2)] = m.group(1)

    def parse_TRANSITIONS(self, line):
        """
        Store a ``(n) from --> to: weight`` line in ``result.transitions``.

        Raises
        ------
        ValueError
            If the line is malformed or names a label that has not been
            seen in the ``LABELS`` section.
        """
        m = self._match(self._WEIGHT_RE, line, 'TRANSITIONS')
        from_, to_ = m.group(1), m.group(2)
        self._require_known(from_, self.result.labels, 'label', line)
        self._require_known(to_, self.result.labels, 'label', line)
        self.result.transitions[(from_, to_)] = float(m.group(3))

    def parse_STATE_FEATURES(self, line):
        """
        Store a ``(n) attribute --> label: weight`` line in
        ``result.state_features``.

        Raises
        ------
        ValueError
            If the line is malformed, or names an attribute or label that
            has not been seen in the ``ATTRIBUTES`` / ``LABELS`` sections.
        """
        m = self._match(self._WEIGHT_RE, line, 'STATE_FEATURES')
        attr, label = m.group(1), m.group(2)
        self._require_known(attr, self.result.attributes, 'attribute', line)
        self._require_known(label, self.result.labels, 'label', line)
        self.result.state_features[(attr, label)] = float(m.group(3))