In [None]:
from pathlib import Path
from nltk import ngrams
import sys
sys.path.append('/home/gpuadmin/Candice/scripts/data-extraction')
sys.path.append('/home/gpuadmin/projects/candice/AGEL/agel_backend/agel_v15/copy_editing/get_grammar_suggestions')
root_path = Path('/home/gpuadmin/Candice/Data/extracted/raw_edit_pairs')
from __parameters__ import word_tokenizer
import re
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.util import regexp_span_tokenize
bracket_regex_pattern = r"""[\(\[].*?[\)\]]"""

In [None]:
import json
def load_json(path):
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)
    
all_preps = set(['across', 'near', 'along', 'on', 'down', 'till', 'underneath', 'beneath', 'against', 'to', 'under', 'for',
             'through', 'towards', 'above', 'of', 'until', 'between', 'by', 'among', 'during', 'after', 'over',
             'toward', 'in', 'before', 'around', 'inside', 'up', 'from', 'into', 'outside', 'below', 'onto', 'beside',
             'behind', 'at'] + ['aboard', 'about', 'above', 'across', 'after', 'against', 'along', 'amid', 'among', 'around', 'as', 'at', 'before', 'behind', 'below', 'beneath', 'beside', 'between', 'beyond', 'but', 'by', 'concerning', 'considering', 'despite', 'down', 'during', 'except', 'following', 'for', 'from', 'in', 'inside', 'into', 'like', 'minus', 'near', 'next', 'of', 'off', 'on', 'onto', 'opposite', 'out', 'outside', 'over', 'past', 'per', 'plus', 'regarding', 'round', 'save', 'since', 'than', 'through', 'till', 'to', 'toward', 'under', 'underneath', 'unlike', 'until', 'up', 'upon', 'versus', 'via', 'with', 'within', 'without'])


In [None]:
# Loading all data
%time all_data = [(load_json(path), path.name) for path in root_path.rglob('*.json') if path.parent.name in ['2FA-2T-1358', '2F', '2T','2FA', 'CE ONLY']]

In [125]:
# Splitting into train and test
len_all_data = len(all_data)
train_frac = 0.7
train_files = all_data[:int(train_frac*len_all_data)]
test_files = all_data[int(train_frac*len_all_data):]
len(train_files), len(test_files)

(6346, 2721)

In [119]:
test_file_names = [name for _, name in test_files]
train_file_names = [name for _, name in train_files]

In [120]:
class BracketTokenizer(RegexpTokenizer):
    def tokenize(self, text):
        self._check_regexp()
        # If our regexp matches gaps, use re.split:
        if self._gaps:
            if self._discard_empty:
                return [tok for tok in self._regexp.split(text) if tok]
            else:
                return self._regexp.split(text)

        # If our regexp matches tokens, use re.findall:
        else:
            return list(self._regexp.finditer(text))

    def span_tokenize(self, text):
        self._check_regexp()

        if self._gaps:
            for left, right in regexp_span_tokenize(text, self._regexp):
                if not (self._discard_empty and left == right):
                    yield left, right
        else:
            for m in re.finditer(self._regexp, text):
                if re.match(bracket_regex_pattern, m.group(0)) is not None:
                    yield {'span': m.span(), 'text': m.group(0), 'bracket_content': True}
                else:
                    yield {'span': m.span(), 'text': m.group(0), 'bracket_content': False}


bracket_tokenizer = BracketTokenizer(r"""[\(\[].*?[\)\]]|[^\(\[\)\]]+""")

def remove_brackets(text):
    new_text = ''
    for item in bracket_tokenizer.span_tokenize(text):
        new_text += ' ' * len(item['text']) if item['bracket_content'] else item['text']
    return new_text


In [None]:
# 9 gram approach
n = 9
all_trigrams = []
count = 0
for sent_list, _ in train_files:
    print(count)
    count += 1
    for dic in sent_list:
        sentence = remove_brackets(dic['edit']['text'])
        trigrams = ngrams(word_tokenizer.tokenize(sentence), n)
        for gram in trigrams:
            all_trigrams.append(gram)

In [122]:
#filtering n-grams that have a preposition in the middle.
prep_trigrams = []
for trigram in all_trigrams:
    if trigram[len(trigram)//2] in all_preps:
        prep_trigrams.append(trigram)

In [123]:
%%time
from nltk.lm import MLE
lm = MLE(n)
lm.fit([prep_trigrams], vocabulary_text=set([tok for item in prep_trigrams for tok in item]))

CPU times: user 53.6 s, sys: 199 ms, total: 53.8 s
Wall time: 53.8 s


In [124]:
import pickle
with open('all_prep_lm.pickle', 'wb') as f:
    pickle.dump(lm,f)

In [86]:
#possible candidate substitutions as observed in our data.
candidates = {'following' : ['after'], 'to': ['–', 'and'], '–': ['to'], 'as':['because'], 'among':['in'], 'since':['because'], 'upon': ['on'],
'between': ['among'], 'of':['in']}

#getting score for all possible candidates
def get_best_score(trigram):
    all_suggestions = [(trigram, lm.score(trigram[-1], trigram[:-1]))]
    for cand in candidates[trigram[len(trigram)//2]]:
        _temp = list(trigram)
        _temp[len(trigram)//2] = cand
        _temp = tuple(_temp)
        all_suggestions.append((_temp, lm.score(_temp[-1], _temp[:-1])))
    return max(all_suggestions, key=lambda x: x[1])


def update(item):
    text = item['raw_text']
    clean_text = remove_brackets(text)
    tokens = word_tokenizer.tokenize(clean_text)
    spans = list(word_tokenizer.span_tokenize(clean_text))
    
    if len(set(tokens).intersection(set(candidates.keys()))) == 0:
        return 0, 0, 0
    
    trigram_toks = list(ngrams(tokens, n))
    trigram_spans = list(ngrams(spans, n))
    
    #finding n-grams in sentence containing relevant preposition
    possible_suggestions = [idx for idx, gram in enumerate(trigram_toks) if gram[len(gram)//2] in candidates.keys()]
    final_suggestions = []
    
    for idx in possible_suggestions:
        gram = trigram_toks[idx]
        best_sugg, score = get_best_score(gram)
        if best_sugg != gram: #if suggestion is different than the original preposition
            final_suggestions.append((idx, best_sugg, score))

    fp, tp, fn = 0, 0, 0
#     Checking if the best suggestion is correct by comparing it to actual track changes made by editors
    rel_mis = [mis for mis in item['track_changes'] if mis['insertion'] and mis['deletion']]
    rel_mis = [mis for mis in rel_mis if len(mis['insertion']['tokens']) == 1 and len(clean_text[mis['deletion'][0]: mis['deletion'][1]].split(' ')) == 1]
    rel_mis = [mis for mis in rel_mis if clean_text[mis['deletion'][0]: mis['deletion'][1]] in candidates.keys()]

    for mis in rel_mis:
        flag = False
        for idx, sugg, _ in final_suggestions:
            # If the deleted word is same
            if tuple(mis['deletion']) == trigram_spans[idx][len(trigram_spans[idx])//2]:
                flag = True
                #if the suggestion is the same as the one made by editor
                if sugg[len(sugg)//2] == mis['insertion']['tokens'][0]:
                    tp += 1
                else:
                    fp += 1

        fn += 1 if not flag else 0
    return tp, fp, fn

In [126]:
all_mistakes[2937]

{'raw_text': 'The KHSC Palliative Care Consult Service was involved in the care of 14.2%        of patients in this study. All of them had a documented DNR and underwent device deactivation prior to death as part of their end-of-life care plan. The mean time to device deactivation for patients with Palliative Care consultation following establishment of DNR status was 7 days. Six of those seven patients had their device deactivated within one day of a DNR order being put in place, and the seventh patient had device deactivation 43 days after their DNR order was instituted. Patients without palliative care involvement had their device deactivated a mean of 79 days after their DNR order was instituted.',
 'track_changes': [{'deletion': [176, 184],
   'insertion': {'tokens': ['before'], 'position': 184}},
  {'deletion': [315, 324],
   'insertion': {'tokens': ['after'], 'position': 324}},
  {'deletion': [378, 383], 'insertion': {'tokens': ['7'], 'position': 383}},
  {'deletion': [429, 432]

In [89]:
from relevant_mistake_loader import LoadRelevantMistakes

In [56]:
mistake_loader = LoadRelevantMistakes(path_to_mistakes='/home/gpuadmin/Candice/Data/extracted/ce_non_bracket_lcs')

In [57]:
%time all_mistakes = mistake_loader.extract_all_mistakes(file_types=['2FA-2T-1358', '2F', '2T','2FA', 'CE ONLY'])

2019-12-17 16:25:20,874 :: INFO :: relevant_mistake_loader :: 2FA-2T-1358 track change files = 16
2019-12-17 16:25:20,875 :: INFO :: relevant_mistake_loader :: 2F track change files = 41
2019-12-17 16:25:20,876 :: INFO :: relevant_mistake_loader :: 2T track change files = 6029
2019-12-17 16:25:20,877 :: INFO :: relevant_mistake_loader :: 2FA track change files = 1890
2019-12-17 16:25:20,878 :: INFO :: relevant_mistake_loader :: CE ONLY track change files = 452


CPU times: user 7.8 s, sys: 485 ms, total: 8.28 s
Wall time: 8.26 s


In [58]:
len(all_mistakes)

186461

In [None]:
rel_mistakes = [mis for mis in all_mistakes if Path(mis['path']).name in test_file_names]

In [None]:
len(rel_mistakes)

In [112]:
#Calculating accuracy on test files
%%time
tp, fp, fn = 0, 0, 0
for item in rel_mistakes:
    _tp, _fp, _fn = update(item)
#     if _fn>0:
#         print(item)
#         break
    tp += _tp
    fp += _fp
    fn += _fn

CPU times: user 25.2 s, sys: 0 ns, total: 25.2 s
Wall time: 25.2 s


In [113]:
"Conclusions: Health literacy-associated disparities in parent use of Internet and cell phone technologies exist, but parents’ desire for use of these technologies for provider communication was overall high and did not differ by health literacy"[28:]

'-associated disparities in parent use of Internet and cell phone technologies exist, but parents’ desire for use of these technologies for provider communication was overall high and did not differ by health literacy'

In [114]:
tp, fp, fn

(22, 2, 9247)

In [115]:
tp, fp, fn

(22, 2, 9247)

In [116]:
p = tp/(fp+tp)

0.9166666666666666

In [117]:
tp/(tp + fn)

0.002373503074765347