In [2]:
import spacy
import string
import re
import nltk.data 
import nltk.tokenize.punkt 
import pickle 
import pprint

In [3]:
nlp = spacy.load('en')

In [4]:
with open('dataset/genia-pos/GENIAtrain.pos', 'r') as f:
    train_contents = f.readlines()
with open('dataset/genia-pos/GENIAtest.pos', 'r') as f:
    test_contents = f.readlines()

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/genia-pos/GENIAtrain.pos'

## Iterate over the abstracts and yiels list of sentences for each abstract

In [79]:
abstracts = []
sents = []
def iter_abs(contents):
    sents = []
    for line in contents:
        parts = line.split('-/:')
        if parts[0] == 'TI/LS ':
            continue
        if len(parts) == 2:
            sent_with_ann = parts[1].strip()
        else:
            sent_with_ann = parts[0].strip()
        tokens_with_ann = sent_with_ann.split()
        tokens_without_ann = [x.split('/')[0] for x in tokens_with_ann]
        sent = ''.join([('' if c in string.punctuation else ' ')+c for c in tokens_without_ann]).strip()
        if re.match(r'[\d]+',sent) and len(sents) != 0:
            yield sents
            sents = []
        sents.append(sent)

## Splitter

In [80]:
def splitter(func, contents, trained=False, debug=False, print_wrong=False):
    correct = 0
    total = 0
    for abs_i, abstract in enumerate(iter_abs(contents)):
        text = ' '.join(abstract[1:])
        
        if func == 'spacy':
            sents = nlp(text).sents
        elif func == 'nltk':
            if trained:
                nltk_sentence_detector = nltk.data.load('tokenizers/punkt/genia.pickle')
            else:
                nltk_sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')
            sents = nltk_sentence_detector.tokenize(text)
            if debug:
                for decision in tokenizer.debug_decisions(text):
                    print(decision)
                    print ('=' * 30)
        else:
            print('Not supported splitter')
            return -1
        
        for i, sent in enumerate(sents):
            if func == 'spacy':
                sent_text = sent.text
            else:
                sent_text = sent
            if i < len(abstract) - 1:
                if sent_text == abstract[i+1]:
                    correct += 1
                    total += 1
                else:
                    if print_wrong:
                        print('PREDICTED: {}'.format(sent_text))
                        print('TRUE: {}\n'.format(abstract[i+1]))
                    total += 1
                    
    return correct, total

In [81]:
correct_train, total_train = splitter('spacy', train_contents)
correct_test, total_test = splitter('spacy', test_contents)
print('Training accuracy: {}%'.format(correct_train/total_train*100))
print('Test accuracy: {}%'.format(correct_test/total_test*100))

Training accuracy: 88.63886595100468%
Test accuracy: 84.25449871465295%


In [82]:
correct_train, total_train = splitter('nltk', train_contents)
correct_test, total_test = splitter('nltk', test_contents)
print('Training accuracy: {}%'.format(correct_train/total_train*100))
print('Test accuracy: {}%'.format(correct_test/total_test*100))

Training accuracy: 94.777130470126%
Test accuracy: 89.96188055908515%


## Train punkt

In [51]:
tokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer() 

# Training text
abs_content = []
with open('dataset/pubmed/abstract.txt') as f:
    for i, line in enumerate(f):
        abs_content.append(line[24:-16])
train_text = '\n'.join(abs_content)

# Train tokenizer 
tokenizer.train(train_text) 

with open("/home/rohit/nltk_data/tokenizers/punkt/PY3/genia.pickle","wb") as f: 
    pickle.dump(tokenizer, f) 

30918


In [88]:
correct_train, total_train = splitter('nltk', train_contents, trained=True, print_wrong=False)
correct_test, total_test = splitter('nltk', test_contents, trained=True, print_wrong=False)
print('Training accuracy: {}%'.format(correct_train/total_train*100))
print('Test accuracy: {}%'.format(correct_test/total_test*100))

Training accuracy: 97.10027648526535%
Test accuracy: 93.5969868173258%
