In [1]:
from kleis.resources import dataset as kd
import pycrfsuite

In [2]:
# Obtain the corpus object from kleis, load its training split and run kleis'
# own training step with the "simple" feature method.
# NOTE(review): this training presumably backs `default_corpus.label_text`,
# which later cells use for the KLEIS comparison — confirm.
default_corpus = kd.load_corpus()
default_corpus.load_train()
default_corpus.training(features_method="simple")

In [3]:
def _bilou_rows(token_tags, keyphrases):
    """Return one ``[t0, t1, t2, label]`` row per token.

    Every token starts with the label 'O'. For each keyphrase, the tokens
    listed under its 'tokens-indices' entry are relabelled: a single-token
    keyphrase gets 'U-KEYPHRASE'; a longer one gets 'B-KEYPHRASE' on its first
    token, 'L-KEYPHRASE' on its last and 'I-KEYPHRASE' on those in between.
    Keyphrases are applied in iteration order, so a later one overwrites the
    labels of an earlier one on shared tokens.
    """
    rows = [[tok[0], tok[1], tok[2], 'O'] for tok in token_tags]
    for annotation in keyphrases.values():
        span = annotation['tokens-indices']
        span_length = len(span)
        if span_length == 1:
            rows[span[0]][3] = 'U-KEYPHRASE'
        elif span_length > 1:
            rows[span[0]][3] = 'B-KEYPHRASE'
            rows[span[-1]][3] = 'L-KEYPHRASE'
            for inner in span[1:-1]:
                rows[inner][3] = 'I-KEYPHRASE'
    return rows


# Keep only the two fields of each training document that are needed here.
train = [
    {'tags': doc['tags'], 'keyphrases': doc['keyphrases']}
    for _, doc in default_corpus.train.items()
]

# One feature sequence and one label sequence per training document.
X_train, y_train = [], []
for element in train:
    tags = _bilou_rows(element['tags'], element['keyphrases'])
    n_tokens = len(tags)
    X_train.append([kd.simple_features(row, pos, n_tokens, [], []) for pos, row in enumerate(tags)])
    y_train.append([row[3] for row in tags])
print(len(X_train), len(y_train))

350 350


In [4]:
# Feed every (feature sequence, label sequence) pair to a quiet CRF trainer.
trainer = pycrfsuite.Trainer(verbose=False)
for sequence_pair in zip(X_train, y_train):
    trainer.append(*sequence_pair)

In [5]:
# Training hyper-parameters:
#   c1 / c2 - coefficients for the L1 and L2 penalties
#   feature.possible_transitions - include transitions that are possible,
#       but not observed
# 'max_iterations' is deliberately left at its default (set e.g. 50 to stop earlier).
crf_params = {
    'c1': 1.0,
    'c2': 1e-3,
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

# Last expression of the cell: display the trainer's parameter names.
trainer.params()

['feature.minfreq',
 'feature.possible_states',
 'feature.possible_transitions',
 'c1',
 'c2',
 'max_iterations',
 'num_memories',
 'epsilon',
 'period',
 'delta',
 'linesearch',
 'max_linesearch']

In [6]:
# Path of the serialized CRF model. Training is skipped when the file already
# exists, so delete it to force a retrain after changing features or parameters.
model = 'baseline.semeval2017-task10.pycrfsuite'
model_already_trained = kd.path_exists(model)
if not model_already_trained:
    trainer.train(model)

In [7]:
# Open the model file produced (or reused) by the training cell. The path is
# taken from `model` instead of being repeated as a literal, so the tagger can
# never load a different file than the one that was trained.
tagger = pycrfsuite.Tagger()
tagger.open(model)

<contextlib.closing at 0x7fa9de618780>

In [8]:
# Load the development split; the decoding cell below iterates over
# `default_corpus.dev` (presumably populated by this call — confirm).
default_corpus.load_dev()

In [9]:
import os

# Directory that receives one brat .ann/.txt pair per development document.
output = 'output-baseline/dev/'
os.makedirs(output, exist_ok=True)

for doc_number, (key, tmp_dataset) in enumerate(default_corpus.dev.items(), start=1):
    text = tmp_dataset["raw"]["txt"]
    tags = tmp_dataset["tags"]
    # `pos` (not `i`) so the comprehension index cannot be confused with the document counter.
    X_test = [kd.simple_features(tag, pos, len(tags), [], []) for pos, tag in enumerate(tags)]
    keyphrases_kleis = default_corpus.label_text(text)
    keyphrases_labels = tagger.tag(X_test)

    # Decode the predicted BILOU labels into (id, (label, (start, end)), text) tuples.
    keyphrases = []
    tid = 0
    # (id, start offset) of the B- span currently being built, or None. Reset for
    # every document so an unterminated span can never leak into the next one.
    open_span = None
    for token_index, label in enumerate(keyphrases_labels):
        if label == 'O':
            continue
        prefix = label[:2]
        # NOTE(review): element 2 of a tag is presumably the token's character
        # offsets in `text`; min/max are used as its start/end — confirm.
        token_offsets = tags[token_index][2]
        if prefix == 'U-':
            tid += 1
            token_start, token_end = min(token_offsets), max(token_offsets)
            keyphrases.append(
                ("T%d" % tid, ("KEYPHRASE", (token_start, token_end)), text[token_start:token_end])
            )
            # A unit-length keyphrase is complete; nothing may continue it.
            open_span = None
        elif prefix == 'B-':
            tid += 1
            open_span = ("T%d" % tid, min(token_offsets))
        elif prefix in ('I-', 'L-'):
            if open_span is None:
                # The tagger may emit a continuation label without a preceding
                # B-; skip it instead of failing while unpacking a missing span.
                continue
            if prefix == 'L-':
                kp_id, kp_start = open_span
                token_end = max(token_offsets)
                keyphrases.append(
                    (kp_id, ("KEYPHRASE", (kp_start, token_end)), text[kp_start:token_end])
                )
                open_span = None
            # An I- label only keeps the span open; its end offset is fixed by L-.

    # Serialize once; the same string is printed and written to disk.
    brat_crf = kd.keyphrases2brat(keyphrases)
    print("\n%d ----" % doc_number)
    print("\n  +++ CRF +++ \n")
    print(brat_crf)
    print("\n  +++ KLEIS +++ \n")
    print(kd.keyphrases2brat(keyphrases_kleis))
    print("\n  +++ RAW +++ \n")
    print(tmp_dataset["raw"]["ann"])
    with open(output + key + ".ann", "w", encoding="utf-8") as fout:
        fout.write(brat_crf)
    with open(output + key + ".txt", "w", encoding="utf-8") as fout:
        fout.write(text)


1 ----

  +++ CRF +++ 

T1	KEYPHRASE 18 21	PCM
T2	KEYPHRASE 117 135	Mock up” PCM drums
T3	KEYPHRASE 183 192	PCM drums
T4	KEYPHRASE 214 244	mild steel paint cans and lids
T5	KEYPHRASE 246 266	Fenton Packaging Ltd
T6	KEYPHRASE 270 278	PVC bags
T7	KEYPHRASE 301 323	identical PVC sheeting
T8	KEYPHRASE 325 343	Romar Workwear Ltd
T9	KEYPHRASE 351 379	metallic waste was simulated
T10	KEYPHRASE 386 423	commercial grade 18/8 stainless steel
T11	KEYPHRASE 425 473	aluminium and copper (Avus Metals & Plastics Ltd
T12	KEYPHRASE 506 535	inorganic waste was simulated
T13	KEYPHRASE 607 611	CeO2
T14	KEYPHRASE 618 632	Acros Organics
T15	KEYPHRASE 676 690	PuO2 surrogate
T16	KEYPHRASE 692 721	Commercially available ground
T17	KEYPHRASE 734 763	blast-furnace slag “Calumite”
T18	KEYPHRASE 798 827	analysed chemical composition
T19	KEYPHRASE 863 880	powdered material
T20	KEYPHRASE 897 923	particle size distribution

  +++ KLEIS +++ 

T10	KEYPHRASE 863 880	powdered material
T26	KEYPHRASE 418 423	steel
T27	KEY

In [10]:
# Load the test split; the decoding cell below iterates over
# `default_corpus.test` (presumably populated by this call — confirm).
default_corpus.load_test()

In [11]:
import os

# Directory that receives one brat .ann/.txt pair per test document.
output = 'output-baseline/test/'
os.makedirs(output, exist_ok=True)

for doc_number, (key, tmp_dataset) in enumerate(default_corpus.test.items(), start=1):
    text = tmp_dataset["raw"]["txt"]
    tags = tmp_dataset["tags"]
    # `pos` (not `i`) so the comprehension index cannot be confused with the document counter.
    X_test = [kd.simple_features(tag, pos, len(tags), [], []) for pos, tag in enumerate(tags)]
    keyphrases_kleis = default_corpus.label_text(text)
    keyphrases_labels = tagger.tag(X_test)

    # Decode the predicted BILOU labels into (id, (label, (start, end)), text) tuples.
    keyphrases = []
    tid = 0
    # (id, start offset) of the B- span currently being built, or None. Reset for
    # every document so an unterminated span can never leak into the next one.
    open_span = None
    for token_index, label in enumerate(keyphrases_labels):
        if label == 'O':
            continue
        prefix = label[:2]
        # NOTE(review): element 2 of a tag is presumably the token's character
        # offsets in `text`; min/max are used as its start/end — confirm.
        token_offsets = tags[token_index][2]
        if prefix == 'U-':
            tid += 1
            token_start, token_end = min(token_offsets), max(token_offsets)
            keyphrases.append(
                ("T%d" % tid, ("KEYPHRASE", (token_start, token_end)), text[token_start:token_end])
            )
            # A unit-length keyphrase is complete; nothing may continue it.
            open_span = None
        elif prefix == 'B-':
            tid += 1
            open_span = ("T%d" % tid, min(token_offsets))
        elif prefix in ('I-', 'L-'):
            if open_span is None:
                # The tagger may emit a continuation label without a preceding
                # B-; skip it instead of failing while unpacking a missing span.
                continue
            if prefix == 'L-':
                kp_id, kp_start = open_span
                token_end = max(token_offsets)
                keyphrases.append(
                    (kp_id, ("KEYPHRASE", (kp_start, token_end)), text[kp_start:token_end])
                )
                open_span = None
            # An I- label only keeps the span open; its end offset is fixed by L-.

    # Serialize once; the same string is printed and written to disk.
    brat_crf = kd.keyphrases2brat(keyphrases)
    print("\n%d ----" % doc_number)
    print("\n  +++ CRF +++ \n")
    print(brat_crf)
    print("\n  +++ KLEIS +++ \n")
    print(kd.keyphrases2brat(keyphrases_kleis))
    print("\n  +++ RAW +++ \n")
    print(tmp_dataset["raw"]["ann"])
    with open(output + key + ".ann", "w", encoding="utf-8") as fout:
        fout.write(brat_crf)
    with open(output + key + ".txt", "w", encoding="utf-8") as fout:
        fout.write(text)


1 ----

  +++ CRF +++ 

T1	KEYPHRASE 87 115	nx unknown nodal coordinates
T2	KEYPHRASE 133 173	nb unknown discrete Lagrange multipliers
T3	KEYPHRASE 179 193	linear systems
T4	KEYPHRASE 228 249	Newton-based solution
T5	KEYPHRASE 359 425	tangent stiffness matrix of the unconstrained pseudo-solid problem
T6	KEYPHRASE 439 458	off-diagonal blocks
T7	KEYPHRASE 603 639	LBB stability of this discretisation
T8	KEYPHRASE 682 772	LBB stability of the Lagrange-multiplier-based imposition of Dirichlet boundary conditions
T9	KEYPHRASE 836 852	Newton iteration
T10	KEYPHRASE 859 886	symmetric positive definite
T11	KEYPHRASE 911 958	tangent stiffness matrix relative to the system
T12	KEYPHRASE 961 986	equilibrium configuration

  +++ KLEIS +++ 

T23	KEYPHRASE 350 351	E
T33	KEYPHRASE 625 639	discretisation
T49	KEYPHRASE 836 852	Newton iteration
T53	KEYPHRASE 70 71	L
T64	KEYPHRASE 854 855	E
T71	KEYPHRASE 53 61	unknowns
T109	KEYPHRASE 179 193	linear systems

  +++ RAW +++ 

T1	Material 43 61	vector of unk