In [1]:
import kleis.resources.dataset as kl
# Load the corpus object that kl.load_corpus() returns when called without arguments.
# The following cells call its training() and label_text() methods.
default_corpus = kl.load_corpus()

In [2]:
# Train on the loaded corpus with the "simple-posseq" feature method.
# filter_min_count is set to 10 here, overriding its default of 3.
# tagging_notation is not passed, so its default ("BILOU") applies.
default_corpus.training(features_method="simple-posseq", filter_min_count=10)

In [3]:
# Sample input for the labelling step; any text should work here.
# The pieces below are concatenated into one single-line string that ends
# with exactly one newline, so character offsets run over a flat string.
text = (
    "Information extraction is the process of extracting structured data from unstructured text, "
    "which is relevant for several end-to-end tasks, including question answering. "
    "This paper addresses the tasks of named entity recognition (NER), "
    "a subtask of information extraction, using conditional random fields (CRF). "
    "Our method is evaluated on the ConLL-2003 NER corpus.\n"
)

In [4]:
# Label the sample text. default_corpus.training(...) must have been called
# first, as done in the training cell above.
# Each returned keyphrase unpacks as (id, (label, (start, end)), text).
keyphrases = default_corpus.label_text(text)

In [5]:
# Print how many keyphrases were found
print("Keyphrases:", len(keyphrases))

# Each keyphrase has the fields needed for the brat format
print("\n".join(str(k) for k in keyphrases))

# The field-by-field demo below uses the first keyphrase. Guard it, because an
# arbitrary input text may yield no keyphrases and keyphrases[0] would then
# raise IndexError.
if keyphrases:
    # The fields are: id, (label, (start, end)), text
    keyphrase_id, (keyphrase_label, (keyphrase_start, keyphrase_end)), keyphrase_text = keyphrases[0]

    print("\n - Fields: ", keyphrase_id, (keyphrase_label, (keyphrase_start, keyphrase_end)), keyphrase_text)

    # (keyphrase_start, keyphrase_end) are the span in the original text,
    # so slicing `text` with them gives back the keyphrase string
    print("\n Segment of text: '%s'" % text[keyphrase_start:keyphrase_end])
    print(text[keyphrase_start:keyphrase_end] == keyphrase_text)
else:
    print("\n - No keyphrases were extracted from the text; nothing to unpack.")

Keyphrases: 7
('T3', ('KEYPHRASE', (0, 22)), 'Information extraction')
('T55', ('KEYPHRASE', (140, 168)), 'including question answering')
('T5', ('KEYPHRASE', (210, 228)), 'entity recognition')
('T23', ('KEYPHRASE', (230, 233)), 'NER')
('T6', ('KEYPHRASE', (249, 271)), 'information extraction')
('T27', ('KEYPHRASE', (279, 304)), 'conditional random fields')
('T24', ('KEYPHRASE', (306, 309)), 'CRF')

 - Fields:  T3 ('KEYPHRASE', (0, 22)) Information extraction

 Segment of text: 'Information extraction'
True


In [6]:
# Print the keyphrases in brat format: one line per keyphrase holding its id,
# its label followed by the start and end offsets, and the keyphrase text.
print(kl.keyphrases2brat(keyphrases))

T3	KEYPHRASE 0 22	Information extraction
T55	KEYPHRASE 140 168	including question answering
T5	KEYPHRASE 210 228	entity recognition
T23	KEYPHRASE 230 233	NER
T6	KEYPHRASE 249 271	information extraction
T27	KEYPHRASE 279 304	conditional random fields
T24	KEYPHRASE 306 309	CRF


In [7]:
# Print the label, start and end of every keyphrase, reading the fields
# first through the kl helper functions and then by plain tuple unpacking.
for keyphrase in keyphrases:
    print("- - - - - KEYPHRASE - - - - -")
    # Helper accessors: the label, and the span as a (start, end) pair
    print("Label: ", kl.keyphrase_label(keyphrase))
    print("Span: ", kl.keyphrase_span(keyphrase))
    # Another example with span: unpack the pair into its two offsets
    start, end = kl.keyphrase_span(keyphrase)
    print("Start: ", start)
    print("End: ", end)
    # Without a function: unpack the nested keyphrase tuple directly
    # (this rebinds start and end to the same span values)
    keyphrase_id, (keyphrase_label, (start, end)), keyphrase_str = keyphrase
    print("All fields: ", keyphrase_id, keyphrase_label, start, end, keyphrase_str)

- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (0, 22)
Start:  0
End:  22
All fields:  T3 KEYPHRASE 0 22 Information extraction
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (140, 168)
Start:  140
End:  168
All fields:  T55 KEYPHRASE 140 168 including question answering
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (210, 228)
Start:  210
End:  228
All fields:  T5 KEYPHRASE 210 228 entity recognition
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (230, 233)
Start:  230
End:  233
All fields:  T23 KEYPHRASE 230 233 NER
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (249, 271)
Start:  249
End:  271
All fields:  T6 KEYPHRASE 249 271 information extraction
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (279, 304)
Start:  279
End:  304
All fields:  T27 KEYPHRASE 279 304 conditional random fields
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (306, 309)
Start:  306
End:  309
All fields:  T24 KEYPHRASE 306 309 CRF
