In [1]:
import kleis.resources.dataset as kl
default_corpus = kl.load_corpus()

In [2]:
# Default: filter_min_count = 3
# Default: tagging_notation="BILOU"
default_corpus.training(features_method="simple-posseq", filter_min_count=13)

In [3]:
# Any text should work
text = """Information extraction is the process of extracting structured data from unstructured text, \
which is relevant for several end-to-end tasks, including question answering. \
This paper addresses the tasks of named entity recognition (NER), \
a subtask of information extraction, using conditional random fields (CRF). \
Our method is evaluated on the ConLL-2003 NER corpus.
"""

In [4]:
# First call default_corpus.training as above 
keyphrases = default_corpus.label_text(text)

In [5]:
# Print result
print("Keyphrases:", len(keyphrases))

# Each keyphrase has the fields needed for the brat format
print("\n".join([str(k) for k in keyphrases]))

# The fields are 
keyphrase_id, (keyphrase_label, (keyphrase_start, keyphrase_end)), keyphrase_text = keyphrases[0]

print("\n - Fields: ", keyphrase_id, (keyphrase_label, (keyphrase_start, keyphrase_end)), keyphrase_text)

# (keyphrase_start, keyphrase_end) are the span in the original text
print("\n Segment of text: '%s'" % text[keyphrase_start:keyphrase_end])
print(text[keyphrase_start:keyphrase_end] == keyphrase_text)

Keyphrases: 13
('T1', ('KEYPHRASE', (73, 90)), 'unstructured text')
('T3', ('KEYPHRASE', (0, 22)), 'Information extraction')
('T4', ('KEYPHRASE', (150, 168)), 'question answering')
('T5', ('KEYPHRASE', (210, 228)), 'entity recognition')
('T6', ('KEYPHRASE', (249, 271)), 'information extraction')
('T11', ('KEYPHRASE', (150, 158)), 'question')
('T12', ('KEYPHRASE', (159, 168)), 'answering')
('T15', ('KEYPHRASE', (217, 228)), 'recognition')
('T23', ('KEYPHRASE', (230, 233)), 'NER')
('T24', ('KEYPHRASE', (306, 309)), 'CRF')
('T27', ('KEYPHRASE', (279, 304)), 'conditional random fields')
('T31', ('KEYPHRASE', (298, 304)), 'fields')
('T49', ('KEYPHRASE', (140, 168)), 'including question answering')

 - Fields:  T1 ('KEYPHRASE', (73, 90)) unstructured text

 Segment of text: 'unstructured text'
True


In [6]:
# Print keyphrases in brat format 
print(kl.keyphrases2brat(keyphrases))

T1	KEYPHRASE 73 90	unstructured text
T3	KEYPHRASE 0 22	Information extraction
T4	KEYPHRASE 150 168	question answering
T5	KEYPHRASE 210 228	entity recognition
T6	KEYPHRASE 249 271	information extraction
T11	KEYPHRASE 150 158	question
T12	KEYPHRASE 159 168	answering
T15	KEYPHRASE 217 228	recognition
T23	KEYPHRASE 230 233	NER
T24	KEYPHRASE 306 309	CRF
T27	KEYPHRASE 279 304	conditional random fields
T31	KEYPHRASE 298 304	fields
T49	KEYPHRASE 140 168	including question answering


In [7]:
# print label, start and end
for keyphrase in keyphrases:
    print("- - - - - KEYPHRASE - - - - -")
    print("Label: ", kl.keyphrase_label(keyphrase))
    print("Span: ", kl.keyphrase_span(keyphrase))
    # Another example with span
    start, end = kl.keyphrase_span(keyphrase)
    print("Start: ", start)
    print("End: ", end)
    # Without a function
    keyphrase_id, (keyphrase_label, (start, end)), keyphrase_str = keyphrase
    print("All fields: ", keyphrase_id, keyphrase_label, start, end, keyphrase_str)

- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (73, 90)
Start:  73
End:  90
All fields:  T1 KEYPHRASE 73 90 unstructured text
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (0, 22)
Start:  0
End:  22
All fields:  T3 KEYPHRASE 0 22 Information extraction
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (150, 168)
Start:  150
End:  168
All fields:  T4 KEYPHRASE 150 168 question answering
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (210, 228)
Start:  210
End:  228
All fields:  T5 KEYPHRASE 210 228 entity recognition
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (249, 271)
Start:  249
End:  271
All fields:  T6 KEYPHRASE 249 271 information extraction
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (150, 158)
Start:  150
End:  158
All fields:  T11 KEYPHRASE 150 158 question
- - - - - KEYPHRASE - - - - -
Label:  KEYPHRASE
Span:  (159, 168)
Start:  159
End:  168
All fields:  T12 KEYPHRASE 159 168 answering
- - - - - KEYPHRASE - - - - -
Label:  

In [38]:
def largest_keyphrases(keyphrases):
    # kps = sorted(keyphrases, key=lambda kp: (kp[1][1][1], kp[1][1][0]))
    keyphrases = sorted(keyphrases, key=lambda kp: kp[1][1])
    kps = [True]
    prev_start, prev_end = float("inf"), -1
    for i, keyphrase in enumerate(keyphrases):
        if not i < len(keyphrases) - 1:
            break
        print(keyphrase)
        start, end = keyphrases[i+1][1][1]
        if prev_start <= start and prev_end >= end:
            kps.append(False)
        else:
            kps.append(True)
    return kps
    
[kp for kp in largest_keyphrases(keyphrases)]

('T3', ('KEYPHRASE', (0, 22)), 'Information extraction')
('T1', ('KEYPHRASE', (73, 90)), 'unstructured text')
('T49', ('KEYPHRASE', (140, 168)), 'including question answering')
('T11', ('KEYPHRASE', (150, 158)), 'question')
('T4', ('KEYPHRASE', (150, 168)), 'question answering')
('T12', ('KEYPHRASE', (159, 168)), 'answering')
('T5', ('KEYPHRASE', (210, 228)), 'entity recognition')
('T15', ('KEYPHRASE', (217, 228)), 'recognition')
('T23', ('KEYPHRASE', (230, 233)), 'NER')
('T6', ('KEYPHRASE', (249, 271)), 'information extraction')
('T27', ('KEYPHRASE', (279, 304)), 'conditional random fields')
('T31', ('KEYPHRASE', (298, 304)), 'fields')


[True,
 True,
 True,
 False,
 True,
 False,
 True,
 False,
 True,
 True,
 True,
 False,
 True]