In [49]:
import pandas as pd
import numpy as np
import json
import pickle
import copy
import os
from tqdm import tqdm
import re
from transformers import BertTokenizer
import random
import torch
import torch.nn as nn
from fuzzywuzzy import fuzz
from nltk import sent_tokenize
from difflib import SequenceMatcher
# import stanza
# stanza_nlp = stanza.Pipeline('en')
import spacy
nlp = spacy.load("en_core_web_sm")

In [6]:
# Collect the extension-less path of every essay that has a .txt and/or .ann file.
# A set removes the duplicate produced by the two extensions; sorting makes the
# processing order reproducible (list(set(...)) ordering varies between runs).
path = 'ArgumentAnnotatedEssays-2.0 3/brat-project-final/'
all_filenames = sorted({
    path + entry.split(".")[0]
    for entry in os.listdir(path)
    if entry.startswith("essay") and entry.endswith(("ann", "txt"))
})
len(all_filenames)

402

In [7]:
# Lower-case connectives that split_conj_adv uses as split points.
conj_adv = [
    "therefore", "however", "moreover", "furthermore", "accordingly",
    "similarly", "also", "hence", "anyway", "nevertheless", "besides",
    "thereafter", "nonetheless", "consequently", "thus", "likewise",
    "further", "meanwhile", "due to",
]
# One capturing group of whole-word alternatives, so re.split keeps the matched connective.
conj_adv_pattern = "(%s)" % "|".join(r"\b{}\b".format(word) for word in conj_adv)

def spacy_text_process(t1):
    """Run ``t1`` through the module-level spaCy pipeline ``nlp`` and return
    the token texts joined by single spaces."""
    token_texts = (token.text for token in nlp(t1))
    return " ".join(token_texts)

def split_conj_adv(txt):
    """Split ``txt`` in front of every connective listed in ``conj_adv``.

    ``re.split`` with the capturing ``conj_adv_pattern`` yields the connectives
    as separate items; each one is glued to the front of the piece that follows
    it, so the returned pieces concatenate back to ``txt``. Matching is
    case-sensitive.
    """
    pieces = re.split(conj_adv_pattern, txt)
    connectives = set(conj_adv)
    last_pos = len(pieces) - 1
    kept = []
    for pos in range(len(pieces)):
        piece = pieces[pos]
        if pos < last_pos and piece in connectives:
            # Push the connective onto the following piece instead of emitting it alone.
            pieces[pos + 1] = piece + pieces[pos + 1]
            continue
        kept.append(piece)
    return kept

In [8]:
# para_dict[essay_path][line_index][sentence_index] -> spaCy-tokenized sentence.
# Every line of the essay file is treated as one paragraph.
para_dict = {}
for name in tqdm(all_filenames):
    with open(name + ".txt") as f:
        lines = f.readlines()
    txt = "".join(lines)
    para_dict[name] = {
        line_no: {
            sent_no: spacy_text_process(sentence.strip())
            for sent_no, sentence in enumerate(sent_tokenize(line.strip()))
        }
        for line_no, line in enumerate(lines)
    }

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 402/402 [00:32<00:00, 12.20it/s]


# Using NeuralEDUSeg & Rules for EDU segmentation

In [5]:
# Spot-check: sentences (sentence index -> tokenized text) of line/paragraph 5 of essay300.
para_dict["ArgumentAnnotatedEssays-2.0 3/brat-project-final/essay300"][5]

{0: 'Vacations are not only required by students but also by professionals and housekeepers .',
 1: 'As it not only provide break to them from their regular commitment but also allows them to spend some time on themselves , heel stressful mind and boost desire of resuming to the regular schedule .',
 2: 'Therefore schools should consider giving several small vocations to the students during the year .'}

In [427]:
# Create the two working folders (relative to the current directory).
# exist_ok=True keeps the cell re-runnable: os.mkdir raised FileExistsError
# whenever the folders were already there.
# NOTE(review): later cells read/write these folders through absolute paths;
# confirm the notebook is run from the directory those paths point to.
os.makedirs("PE_paragraphs", exist_ok=True)
os.makedirs("PE_paragraphs_discourse_units", exist_ok=True)

In [428]:
# Write every sentence with at least 5 whitespace-separated tokens to its own
# file named "<essay>_<paragraph index>_<sentence index>.txt".
tgt_path = "/home/csgrad/sougatas/research_work/argumentation/PE_paragraphs/"
for fname, v in para_dict.items():
    essay_stem = fname.split("/")[-1]
    for para_id, sent_dict in v.items():
        for sent_id, text in sent_dict.items():
            if len(text.split()) < 5:
                continue
            fn = "{}{}_{}_{}.txt".format(tgt_path, essay_stem, para_id, sent_id)
            with open(fn, 'w') as f:
                f.write(text)
## Next: run the EDU segmenter on the files in tgt_path and save its output in the folder PE_paragraphs_discourse_units

In [2]:
# Count the lines printed by `ls -l` for the sentence-file folder.
# NOTE(review): `ls -l` normally prints a leading "total" line, so this is presumably one more than the file count.
!ls -lr /home/csgrad/sougatas/research_work/argumentation/PE_paragraphs/ | wc -l

7101


In [3]:
# Same count for the segmenter output folder; it should equal the count of the input folder.
# NOTE(review): `ls -l` normally prints a leading "total" line, so this is presumably one more than the file count.
!ls -lr /home/csgrad/sougatas/research_work/argumentation/PE_paragraphs_discourse_units/ | wc -l

7101


In [10]:
def sentence2list(sentence):
    """Cut ``sentence`` right after each of ``. ? ! , ;``.

    A punctuation mark that sits directly between two digits (e.g. ``3.5`` or
    ``1,000``) does not cut. Each piece keeps its trailing punctuation and any
    leading whitespace, so the pieces concatenate back to ``sentence``. An
    empty input yields ``[""]``.
    """
    breakers = (".", "?", "!", ",", ";")
    final_pos = len(sentence) - 1
    pieces, start = [], 0
    for pos, char in enumerate(sentence):
        if char not in breakers:
            continue
        inside_number = (
            0 < pos < final_pos
            and sentence[pos - 1].isdigit()
            and sentence[pos + 1].isdigit()
        )
        if inside_number:
            continue
        pieces.append(sentence[start:pos + 1])
        start = pos + 1

    # Trailing text after the last cut point.
    if start != len(sentence):
        pieces.append(sentence[start:])

    return pieces or [sentence]

def punct_split(txt):
    """Split ``txt`` at punctuation (via ``sentence2list``) and merge short pieces.

    A piece with more than 3 whitespace-separated tokens starts a new entry.
    A shorter piece is appended to the previous entry; if there is no previous
    entry yet it is held back and prefixed to the first long piece (or emitted
    on its own if no long piece ever arrives).
    """
    min_thresh = 3
    merged, pending = [], ""
    for chunk in sentence2list(txt):
        if len(chunk.split()) > min_thresh:
            merged.append(pending + chunk)
            pending = ""
        elif merged:
            merged[-1] += chunk
        else:
            pending += chunk

    if pending != "":
        merged.append(pending)

    return merged
    

In [9]:
# Folder holding the EDU segmenter output, one file per sentence file written earlier.
processed_path = '/home/csgrad/sougatas/research_work/argumentation/PE_paragraphs_discourse_units/'
# NOTE(review): every entry is later parsed as "<essay>_<para>_<sent>.<ext>"; any other
# entry in this folder (e.g. a hidden directory) would make that parsing fail.
processed_filenames = os.listdir(processed_path)
len(processed_filenames)

7100

In [11]:
def process_text(text):
    """Normalize ``text`` for comparison: every character other than ASCII
    letters, digits and spaces becomes a space, the result is lower-cased and
    runs of spaces are collapsed to one (no leading/trailing space)."""
    alnum_only = re.sub(r"[^a-zA-Z0-9 ]", " ", text)
    return " ".join(alnum_only.lower().split())

def fix_processing(txt):
    """Apply a fixed sequence of literal replacements to ``txt``: four spaces
    become three, and the split contractions ``I ' m`` / ``' ve `` /
    ``I ’ m`` are re-attached to their apostrophe."""
    replacements = (
        ("    ", "   "),
        ("I ' m", "I 'm"),
        ("' ve ", "'ve "),
        ("I ’ m", "I ’m"),
    )
    for old, new in replacements:
        txt = txt.replace(old, new)
    return txt

def fix_short_lines(lst):
    """Merge fragments that normalize to fewer than two words into a neighbour.

    Word counts are taken after ``process_text`` normalization. Short fragments
    are buffered and prefixed (space-separated) to the next fragment that has
    at least two words. The one exception is a buffer that normalizes to
    exactly ``"etc"``: it is appended to the previous output line instead.
    A buffer left over at the end becomes its own line.

    Fix: when an ``"etc"`` buffer occurs before any line has been emitted there
    is no previous line to attach it to; the original indexed ``tmp_lines[-1]``
    on an empty list and raised ``IndexError``. That case now falls back to the
    regular prefixing behaviour.

    Args:
        lst: list of text fragments (strings).

    Returns:
        A new list of merged fragments.
    """
    tmp_lines, pre = [], ""
    for i in lst:
        if len(process_text(i).split()) >= 2:
            if process_text(pre) == "etc" and tmp_lines:
                # A dangling "etc" belongs to the end of the previous line.
                tmp_lines[-1] = tmp_lines[-1] + " " + pre
            elif pre != "":
                i = pre + " " + i
            tmp_lines.append(i)
            pre = ""
        elif pre != "":
            pre = pre + " " + i
        else:
            pre = i

    if pre != "":
        tmp_lines.append(pre)
    return tmp_lines

# processed_para_dict[essay][paragraph index][sentence index] -> list of EDU strings.
processed_para_dict = {}
for name in tqdm(processed_filenames):
    with open(processed_path + name) as f:
        lines = f.readlines()
    # File names have the form "<essay>_<paragraph>_<sentence>.<ext>".
    fname, para_id, sent_id = name.split("_")
    sent_id, _ = sent_id.split(".")
    para_id, sent_id = int(para_id), int(sent_id)

    # Clean each segmenter line, then refine it with the rule-based splitters.
    lines = [fix_processing(raw.strip()) for raw in lines if len(raw.strip()) > 0]
    lines = [part.strip() for line in lines for part in split_conj_adv(line) if len(part.strip()) > 0]
    lines = [part.strip() for line in lines for part in punct_split(line) if len(part.strip()) > 0]
    lines = fix_short_lines(lines)

    processed_para_dict.setdefault(fname, {}).setdefault(para_id, {})[sent_id] = lines

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7100/7100 [00:00<00:00, 14776.63it/s]


# Collating Results & Formatting EDU with PE Annotations

In [15]:
def get_sentence_boundary_dict(dct):
    """Locate every sentence of every paragraph inside the paragraph's processed text.

    Args:
        dct: ``{para_id: {"text": ..., "proc_text": ..., "proc_start": int,
            "proc_end": int, ...}}``.

    Returns:
        ``{para_id: {sent_idx: {"sent", "sent_st", "sent_en", "offset"}, ...,
        "raw_text": proc_text}}``. ``sent_st``/``sent_en`` are offsets into
        ``proc_text`` of the first occurrence of the tokenized sentence;
        ``offset`` is the paragraph's ``proc_start``.

    Raises:
        ValueError: if a tokenized sentence does not occur in ``proc_text``
            (the two-way unpacking of ``split`` fails).
    """
    boundary_dict = {}
    for para_id, para in dct.items():
        located = {}
        for sent_no, raw_sent in enumerate(sent_tokenize(para["text"])):
            sent = spacy_text_process(raw_sent)
            before, after = para["proc_text"].split(sent, 1)
            located[sent_no] = {
                "sent": sent,
                "sent_st": len(before),
                "sent_en": para["proc_end"] - para["proc_start"] - len(after),
                "offset": para["proc_start"],
            }
        located["raw_text"] = para["proc_text"]
        boundary_dict[para_id] = located
    return boundary_dict

In [16]:
def edu_boundaries(sent, edu_list, st, en):
    """Return ``{index: {"edu", "st", "en"}}`` with the character span of each
    EDU inside ``sent``.

    ``en - st`` must equal ``len(sent)``. Each EDU is located by its first
    occurrence; a ValueError (failed unpacking) is raised when an EDU is not a
    substring of ``sent``.
    """
    sent_len = en - st
    located = {}
    for pos, unit in enumerate(edu_list):
        before, after = sent.split(unit, 1)
        lo, hi = len(before), sent_len - len(after)
        assert sent[lo:hi] == unit

        located[pos] = {"edu": unit, "st": lo, "en": hi}
    return located

In [17]:
# Repeats the essay listing from the top of the notebook so this section can
# be run on its own. A set removes the duplicate produced by the .txt/.ann
# pair; sorting makes the processing order reproducible (list(set(...))
# ordering varies between runs).
path = 'ArgumentAnnotatedEssays-2.0 3/brat-project-final/'
all_filenames = sorted({
    path + entry.split(".")[0]
    for entry in os.listdir(path)
    if entry.startswith("essay") and entry.endswith(("ann", "txt"))
})
len(all_filenames)

402

In [327]:
from difflib import SequenceMatcher


def get_pre_post(txt, span):
    """Return ``(len_before, len_after)``: the stripped lengths of the text
    before and after ``span`` in ``txt``.

    * ``span`` absent: the whole text counts as "before", "after" is empty.
    * ``span`` occurs more than once: each side is reported as 2 (the length
      of the placeholder ``"NA"``) unless ``txt`` starts / ends with ``span``,
      in which case that side is 0.
    """
    parts = txt.split(span)
    n_parts = len(parts)
    if n_parts == 1:
        head, tail = parts[0], ""
    elif n_parts == 2:
        head, tail = parts
    else:
        head = "" if parts[0] == "" else "NA"
        tail = "" if parts[-1] == "" else "NA"
    return len(head.strip()), len(tail.strip())

def check_span(str_a, str_b, span):
    """Decide whether ``span`` is an acceptable overlap of ``str_a`` (reference)
    and ``str_b`` (EDU).

    Rejected up front when ``span`` equals neither string and none of the
    pieces left after removing it from either string starts or ends with a
    space. Otherwise accepted when nothing (but whitespace) surrounds it in one
    of the strings, or when it ends one string and begins the other.
    """
    def _has_space_edge(text):
        return any(piece.startswith(" ") or piece.endswith(" ") for piece in text.split(span))

    if not _has_space_edge(str_a) and not _has_space_edge(str_b) \
            and span != str_a and span != str_b:
        return False

    b_pre, b_post = get_pre_post(str_b, span)
    a_pre, a_post = get_pre_post(str_a, span)

    covers_a = a_pre == 0 and a_post == 0
    covers_b = b_pre == 0 and b_post == 0
    bridges = (a_post == 0 and b_pre == 0) or (a_pre == 0 and b_post == 0)
    return covers_a or covers_b or bridges

def match_sequence(str_a, str_b, add_lambda=False):
    """Find a common block of ``str_a`` and ``str_b`` that starts at index 0 of
    one of the two strings and passes ``check_span``.

    Args:
        str_a: first string (the "reference" in ``check_span``'s terms).
        str_b: second string (the "edu" in ``check_span``'s terms).
        add_lambda: when True, always use a matcher that treats spaces as junk.

    Returns:
        ``(span, st, en, continuing)``. ``st``/``en`` are offsets into
        ``str_b``; ``continuing`` is True when the accepted block ends exactly
        at the end of ``str_b``. ``(None, None, None, False)`` when no block
        was accepted.
    """
    span, st, en, continuing = None, None, None, False

    if not add_lambda:
        # Start with a plain matcher; if it yields no block anchored at the start of
        # either string, retry with spaces as junk, and if that also fails go back
        # to the plain matcher.
        s = SequenceMatcher(None, str_a, str_b)
        if not any([block.a == 0 or block.b == 0 for block in s.get_matching_blocks()]):
            s = SequenceMatcher(lambda x: x == " ", str_a, str_b)
            if not any([block.a == 0 or block.b == 0 for block in s.get_matching_blocks()]):
                s = SequenceMatcher(None, str_a, str_b)
    else:
        s = SequenceMatcher(lambda x: x == " ", str_a, str_b)
    for block in s.get_matching_blocks():
        # size > 0 skips empty blocks (difflib ends the list with a zero-size sentinel).
        if block.size > 0:
            if block.a == 0:
                span = str_a[block.a:(block.a + block.size)]
                # Precedence: check_span(...) or (startswith(span) and endswith(span)).
                if check_span(str_a, str_b, span) or str_a.startswith(span) and str_a.endswith(span):
                    st, en = block.b, block.b + block.size
                    continuing = en == len(str_b)

            elif block.b == 0:
                span = str_b[block.b:(block.b + block.size)]
                if check_span(str_a, str_b, span):
                    st, en = block.b, block.b + block.size
                    continuing = en == len(str_b)

    # NOTE(review): the loop never breaks, so the last accepted block wins, and `span`
    # is overwritten by every anchored block even when check_span rejects it - the
    # returned span may therefore not correspond to st/en. Confirm this is intended.
    if st is None or en is None:
        span = None
    return span, st, en, continuing


In [19]:
# Build, per essay:
#   ann_dict[essay]        -> {"segments": paragraph -> sentence -> EDU boundaries, "raw_text": full text}
#   simple_ann_dict[essay] -> {para_id: {"orig_text", "orig_st", "orig_en", "edu_list"}}
ann_dict, simple_ann_dict = {}, {}
for name in tqdm(all_filenames):
    essay_id = name.split("/")[-1]
    with open(name+".txt") as f:
        paragraphs = f.readlines()
        txt = "".join(paragraphs)

    # Create segments: one row per paragraph (file line) with its character range in
    # the original text and in the spaCy-tokenized text.
    segments = []
    prev, prev_s = 0, 0
    for ix, paragraph in enumerate(paragraphs):
        spacy_proc_paragraph = " ".join([spacy_text_process(i) for i in sent_tokenize(paragraph)])
        orig_len, processed_len = len(paragraph), len(spacy_proc_paragraph)
        segments.append([ix, paragraph, orig_len, prev, prev+len(paragraph), spacy_proc_paragraph,
                         processed_len, prev_s, prev_s+len(spacy_proc_paragraph)])
        prev += len(paragraph)
        prev_s += len(spacy_proc_paragraph)
    segments_df = pd.DataFrame(segments,columns=["p_id", "text", "orig_length", "orig_start", "orig_end",
                                                 "proc_text", "proc_length", "proc_start", "proc_end"])
    segments_df = segments_df.set_index("p_id")
    segments_dict = segments_df.to_dict(orient="index")

    sentence_boundary_dict = get_sentence_boundary_dict(segments_dict)
    sentence_boundary_dict_cp = copy.deepcopy(sentence_boundary_dict)

    # Attach the EDU boundaries of every sentence to the copy.
    for para_id, v1 in sentence_boundary_dict.items():
        for sent_id, v2 in v1.items():
            if sent_id != "raw_text":
                try:
                    edu_list = processed_para_dict[essay_id][para_id][sent_id]
                except KeyError:
                    # No segmenter output for this sentence (sentences under 5 tokens were
                    # never written out): treat the whole sentence as a single EDU.
                    # Only the missing-key case is handled; the former bare `except:`
                    # would also have hidden unrelated errors and KeyboardInterrupt.
                    edu_list = [v2["sent"]]
                edu_boundary = edu_boundaries(v2["sent"], edu_list, v2["sent_st"], v2["sent_en"])
                sentence_boundary_dict_cp[para_id][sent_id]["edu_dict"] = edu_boundary
    ann_dict[essay_id] = {"segments": sentence_boundary_dict_cp,
                          "raw_text":txt}

    # Flatten to one EDU list per paragraph; paragraphs without EDUs are skipped.
    simple_ann_dict[essay_id] = {}
    for para_id, v2 in sentence_boundary_dict_cp.items():
        tmp = []
        for sent_id, v3 in v2.items():
            if sent_id != "raw_text":
                tmp.extend([v4["edu"] for edu_id, v4 in v3["edu_dict"].items()])
        if len(tmp) > 0:
            simple_ann_dict[essay_id][para_id] = {"orig_text": segments_dict[para_id]["text"],
                                                  "orig_st": segments_dict[para_id]["orig_start"],
                                                  "orig_en": segments_dict[para_id]["orig_end"],
                                                  "edu_list": tmp}

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 402/402 [01:05<00:00,  6.17it/s]


In [27]:
def get_annotations(dct):
    """Rebuild the text of every annotation from its per-EDU slices.

    Args:
        dct: ``{para_id: {"edu_list": [str, ...],
            "edu_annotation": [[tag, st, en], ...]}}``.

    Returns:
        ``{tag: text}`` where ``text`` is the space-joined ``edu[st:en]``
        slices of all EDUs carrying ``tag``, in iteration order. The ``"O"``
        tag is left out.
    """
    collected = {}
    for para in dct.values():
        for edu, label in zip(para["edu_list"], para["edu_annotation"]):
            tag, piece = label[0], edu[label[1]:label[2]]
            if tag in collected:
                collected[tag] += " " + piece
            else:
                collected[tag] = piece
    collected.pop("O", None)
    return collected

def match_annotations(name):
    """Compare the gold annotation texts of essay ``name`` with the texts
    rebuilt from the EDU annotations.

    Reads the module-level ``all_ann_dict`` and ``simple_ann_dict_bkp``.

    Returns:
        A list of ``[name, annotation_id, gold_text, rebuilt_text]`` for every
        annotation whose texts differ.

    Raises:
        AssertionError: if the two sides do not contain the same annotation ids.
    """
    golden_dct = {key: spacy_text_process(val["text"]) for key, val in all_ann_dict[name].items()}
    anno_dct = get_annotations(simple_ann_dict_bkp[name])

    not_in_anno = set(golden_dct) - set(anno_dct)
    not_in_gold = set(anno_dct) - set(golden_dct)
    if len(not_in_anno) != len(not_in_gold):
        print("not_in_anno", not_in_anno, "not_in_gold", not_in_gold)
    assert len(not_in_anno) == len(not_in_gold) == 0

    return [
        [name, key, gold_text, anno_dct[key]]
        for key, gold_text in golden_dct.items()
        if gold_text != anno_dct[key]
    ]



def merge_annotations(l1, l2, name):
    """Merge two per-EDU annotation lists position by position.

    Each entry is ``[tag, st, en]``. For every position:

    * existing is ``"O"``: take the new entry if it is tagged, else keep existing;
    * both tagged with different tags: take the new entry only when its span
      (``en - st``) is strictly longer, otherwise keep existing;
    * otherwise keep existing.

    The original had a third branch for conflicts ("ADDING Conflicting
    Annotations") that could never run because the ``>`` / ``<=`` tests are
    exhaustive; it has been removed.

    Args:
        l1: existing annotations.
        l2: new annotations (positions beyond the shorter list are ignored).
        name: unused; kept so existing callers keep working.

    Returns:
        A new list with the chosen entry for each position.
    """
    merged = []
    for existing, new in zip(l1, l2):
        old_tag, new_tag = existing[0], new[0]
        if old_tag == "O":
            chosen = new if new_tag != "O" else existing
        elif new_tag != "O" and new_tag != old_tag:
            new_is_longer = (new[2] - new[1]) > (existing[2] - existing[1])
            chosen = new if new_is_longer else existing
        else:
            chosen = existing
        merged.append(chosen)
    return merged

def default_annotation(lst):
    """Return one ``["O", 0, len(edu)]`` entry per element of ``lst``, i.e.
    every EDU starts out untagged with a span covering its full length."""
    untagged = []
    for edu in lst:
        untagged.append(["O", 0, len(edu)])
    return untagged

def modify_annotations(lst):
    """Relabel a tagged entry that is sandwiched between two entries of another tag.

    An inner entry (neither first nor last) is relabelled with its predecessor's
    tag when it is tagged, its successor is tagged with a different tag, and
    that successor tag equals the (already processed) predecessor's tag. Its
    ``st``/``en`` values are kept. First and last entries are never changed.

    Fix: an empty list now returns ``[]``; the original raised ``IndexError``
    on ``lst[0]``.

    Args:
        lst: list of ``[tag, st, en]`` lists.

    Returns:
        A new list of the same length.
    """
    if not lst:
        return []

    modified_anno = [lst[0]]
    for ix in range(1, len(lst) - 1):
        current, following = lst[ix], lst[ix + 1]
        sandwiched = (
            current[0] != "O"
            and following[0] != "O"
            and current[0] != following[0]
            and modified_anno[ix - 1][0] == following[0]
        )
        modified_anno.append([lst[ix - 1][0]] + current[1:] if sandwiched else current)
    if len(lst) > 1:
        modified_anno.append(lst[-1])
    assert len(modified_anno) == len(lst)
    return modified_anno

def modify_annotation_by_clusters(annotation):
    """Keep, for every tag, only its longest contiguous run of EDUs.

    A "run" is a maximal stretch of consecutive entries with the same tag; its
    length is the sum of ``en - st`` over its entries. For each tag the run(s)
    with the maximum length are kept (ties keep all of them); every other
    non-"O" entry is replaced by ``["O", 0, -1]``. "O" entries pass through.

    Args:
        annotation: non-empty list of ``[tag, st, en]`` (``iloc[0]`` fails on
            an empty list).

    Returns:
        A list of ``[tag, st, en]`` with one entry per input entry.
    """
    t_df = pd.DataFrame(annotation, columns=["tag", "st", "en"])
    t_df["leng"] = t_df["en"]-t_df["st"]
    
    # Assign sequence labels
    # (the run id increases by one every time the tag differs from the previous row)
    st = t_df.iloc[0]["tag"]
    seq = [1]
    for ix, row in t_df[1:].iterrows():
        if row["tag"] != st:
            seq.append(seq[-1] + 1)
        else:
            seq.append(seq[-1])
        st = row["tag"]
    t_df["seq"] = seq
    
    # Keep tags with largest total contiguous length
    # cum_leng = total length of each (tag, run); attached back to every row of that run.
    # NOTE(review): the result relies on both merges keeping the rows in their
    # original order - confirm for the pandas version in use.
    t_df_grp = t_df.groupby(["tag", "seq"]).agg({"leng":"sum"}).reset_index()
    t_df_grp.columns = ["tag", "seq", "cum_leng"]
    t_df = t_df.merge(t_df_grp, how="inner", on=["tag", "seq"])
    
    # Per tag, the maximum run length; rows whose run has that length get keep=1,
    # all others get NaN from the left merge, which fillna turns into 0.
    t_df_grp2 = t_df_grp.groupby(["tag"]).agg({"cum_leng":"max"}).reset_index()
    t_df_grp2["keep"] = 1
    t_df_merged = t_df.merge(t_df_grp2, how="left", on=["tag", "cum_leng"]).fillna(0)
    tag_lst = []
    for ix, row in t_df_merged.iterrows():
        if row["tag"] != "O" and row["keep"] == 0:
            # Dropped run: reset to untagged (note the end offset is -1, not the EDU length).
            tag_lst.append(["O", 0, -1])
        else:
            tag_lst.append([row["tag"], row["st"], row["en"]])
    return tag_lst

In [34]:
def get_intersection(a, b):
    """Return the distinct whitespace-separated words that occur in both
    ``a`` and ``b`` (as a list, in no particular order)."""
    words_a, words_b = set(a.split()), set(b.split())
    return list(words_a & words_b)

def get_phrases(lst):
    """Join consecutive flagged words into phrases.

    Args:
        lst: sequence of ``[word, flag]`` pairs.

    Returns:
        A list with one space-joined string per maximal run of pairs whose
        flag is truthy, in order of appearance.
    """
    phrases, run = [], ""
    for item in lst:
        word, flagged = item[0], item[1]
        if flagged:
            run = run + " " + word if run != "" else run + word
        elif run != "":
            phrases.append(run)
            run = ""
    if run != "":
        phrases.append(run)
    return phrases

def get_matched_phrases(a, b):
    """Return, for ``a`` and for ``b``, the distinct phrases made of
    consecutive words that also occur in the other string.

    Returns:
        ``(phrases_of_a, phrases_of_b)``, each a de-duplicated list in no
        particular order.
    """
    shared_words = get_intersection(a, b)

    def _distinct_runs(text):
        flagged = [[word, word in shared_words] for word in text.split()]
        return list(set(get_phrases(flagged)))

    return _distinct_runs(a), _distinct_runs(b)

def span_type(st, en, leng):
    """Classify the span ``[st, en)`` inside a text of length ``leng``:
    ``"full"`` (covers everything), ``"start"`` (begins at 0), ``"end"``
    (stops at ``leng``) or ``"mid"``."""
    if en - st == leng:
        return "full"
    if st == 0:
        return "start"
    return "end" if en == leng else "mid"
    
def get_edu_matching_span(phrase, edu):
    """Find every literal occurrence of ``phrase`` in ``edu``.

    Fix: the phrase is now escaped with ``re.escape``. The original escaped
    only ``(``, ``)`` and ``+`` by hand, so any other regex metacharacter in the
    phrase (``.``, ``?``, ``*``, ``[``, ``|`` ...) was interpreted as pattern
    syntax, giving wrong spans or a ``re.error``.

    Args:
        phrase: text to look for (taken literally).
        edu: text to search in.

    Returns:
        A list of ``[matched_text, start, end, span_type]`` for each
        non-overlapping occurrence, where ``span_type`` is the result of
        ``span_type(start, end, len(edu))``.
    """
    literal = re.compile(re.escape(phrase))
    edu_len = len(edu)
    return [
        [m.group(), m.start(), m.end(), span_type(m.start(), m.end(), edu_len)]
        for m in literal.finditer(edu)
    ]

def check_valid_edu_match(ongoing, current):
    """Tell whether a match of type ``current`` may follow the previous EDU's
    match of type ``ongoing``.

    With no ongoing match (``None``) any non-``None`` ``current`` is valid.
    Otherwise the previous match must have reached the end of its EDU
    (``"end"`` or ``"full"``) and the current one must begin at the start of
    its EDU (``"full"`` or ``"start"``).
    """
    if ongoing is None:
        return current is not None
    return ongoing in ("end", "full") and current in ("full", "start")

def get_unique_spans(lst_a, lst_b):
    """For every pair of phrases (one from each list) whose
    ``fuzz.token_set_ratio`` is 100, take their longest common substring;
    return the distinct substrings as a list (no particular order)."""
    shared = set()
    for left in lst_a:
        for right in lst_b:
            if fuzz.token_set_ratio(left, right) != 100:
                continue
            matcher = SequenceMatcher(None, left, right)
            best = matcher.find_longest_match(0, len(left), 0, len(right))
            shared.add(left[best.a:best.a + best.size])
    return list(shared)

def correct_spelling(txt):
    """Complete a fixed set of truncated words (e.g. ``governmen`` ->
    ``government``) wherever they appear as whole words in ``txt``.

    Fix: the replacement itself is now restricted to whole words. The original
    only *checked* for a whole-word occurrence and then used ``str.replace``,
    which also rewrote the prefix inside already-correct words
    (``government`` -> ``governmentt``).

    Args:
        txt: text to correct.

    Returns:
        The corrected text.
    """
    spelling_correction = {"responsibl":"responsible", "communicatio":"communication",
                          "educatio":"education","environmen":"environment", "governmen":"government"}
    for truncated, full in spelling_correction.items():
        txt = re.sub(r"\b" + truncated + r"\b", full, txt)
    return txt

In [22]:
def match_edu_list_new(ann_txt, edu_lst, dev=False):
    """Align the annotation text ``ann_txt`` with a sequence of EDUs.

    The EDUs are visited in order while ``completed`` accumulates the words of
    ``ann_txt`` matched so far. For each EDU the longest phrase shared with
    ``ann_txt`` that extends ``completed`` (or, failing that, restarts the
    annotation from its first word) is selected.

    Args:
        ann_txt: space-tokenized annotation text.
        edu_lst: list of EDU strings.
        dev: when True, print a trace of the decisions.

    Returns:
        A list with one entry per EDU: ``(None, None, None, False)`` when the
        EDU is not matched, otherwise ``[matched_text, st, en, reaches_end]``
        where ``st``/``en`` are character offsets in the EDU and
        ``reaches_end`` is True when the match type is ``"full"`` or ``"end"``.
    """
    completed, lst, ongoing_span = [], [], None
    for idx, edu in enumerate(edu_lst):
        # The whole annotation has been consumed: remaining EDUs are unmatched.
        if " ".join(completed) == ann_txt:
            lst.append((None, None, None, False))
            continue
        tmp_ongoing_span = None
        a_matched, b_matched = get_matched_phrases(ann_txt, edu)
        uniq_spans = get_unique_spans(a_matched, b_matched)
        if dev:
            print("For EDU:",edu,".\tUNIQ PHRASES:",uniq_spans)
        if len(uniq_spans) > 0 and len(completed) < len(ann_txt.split()):
            # tmp: words of the best phrase found for this EDU; flag: True when that
            # phrase restarts the annotation instead of continuing `completed`.
            tmp = []
            flag = False
            for phrase in uniq_spans:
                edu_matching_spans = get_edu_matching_span(phrase, edu) #span, st, en, type: (mid, start, end, full)
                for matching_span in edu_matching_spans:
                    if dev:
                        print("for phrase:",phrase)
                    # zip() stops at the shorter sequence, so this checks that
                    # completed + phrase is a word-level prefix of ann_txt.
                    if all([i[0] == i[1] for i in list(zip(ann_txt.split(), completed + phrase.split()))]) and \
                        len(phrase.split()) > len(tmp) and \
                        check_valid_edu_match(ongoing_span, matching_span[-1]): 
                    # The phrase aligns, is larger than existing phrase, and is valid
                        tmp = phrase.split()
                        tmp_ongoing_span = matching_span#matching_span[-1]
                        flag = False
                        if dev:
                            print("inside IF")
                    elif all([i[0] == i[1] for i in list(zip(ann_txt.split(), phrase.split()))]) and \
                        len(phrase.split()) > len(tmp): 
                    # The phrase aligns, is larger than existing phrase, and is valid. But the completed align is bad
                        tmp = phrase.split()
                        tmp_ongoing_span = matching_span#matching_span[-1]
                        flag = True
                        if dev:
                            print("inside ELIF")
                    else:
                        if dev:
                            print("inside ELSE")
                        pass
            if len(tmp) > 0:
                if flag:
                    # Restart: discard what was matched so far and un-match the previous EDU.
                    # NOTE(review): lst[-1] assumes at least one earlier EDU - confirm a
                    # restart can never be chosen for the first EDU.
                    completed = [] # Reset completed
                    lst[-1] = (None, None, None, False) #Reset last matched
                completed.extend(tmp)
                ongoing_span = tmp_ongoing_span[-1]
                # Replace the span type by a boolean "match reaches the end of the EDU".
                lst.append(tmp_ongoing_span[:-1]+[True if ongoing_span in ["full", "end"] else False])
                if dev:
                    print("TMP HAS VALUES:",tmp,".\tONGOING SPAN:",ongoing_span)
                
            else:
                lst.append((None, None, None, False))
                ongoing_span = None
                completed = []
                if dev:
                    print("TMP IS EMPTY:",tmp,".\tONGOING SPAN:",ongoing_span)
            
        else:
            lst.append((None, None, None, False))
            ongoing_span = None
            completed = []
            if dev:
                print("DIDNT ENTER IF-ELSE")
            
        # Only a match that reaches the end of its EDU can be continued by the next EDU.
        if ongoing_span is None or ongoing_span not in ["full", "end"]:
            ongoing_span = None
        if dev:
            print("COMPLETED=>",completed,"\n")

    return lst
                    

In [905]:
# Debugging example: one annotation text and the EDU list of the paragraph that contains it.
# NOTE: all_ann_dict and simple_ann_dict_bkp are created in cells further down, so this
# cell only works after those cells have been run.
# ann_txt = spacy_text_process(all_ann_dict["essay184"]["T9"]["text"])
# edu_list = simple_ann_dict_bkp["essay184"][3]["edu_list"]
ann_txt = spacy_text_process(all_ann_dict["essay165"]["T11"]["text"])
edu_list = simple_ann_dict_bkp["essay165"][4]["edu_list"]


In [906]:
# Show the example annotation text and EDU list.
ann_txt, edu_list

('I spent the time of biology class to do my Chemistry research and got A+ for both biology and chemistry',
 ['Secondly , students should not be required to attend classes',
  'because they can manage the time',
  'spending on classes',
  'to do other benefit things',
  'if they already understand topic in the class very well .',
  'In my experience ,',
  'many times I did not attend biology class',
  'because my university does not require the students to attend classes .',
  'Also , I had already studied all biology topics',
  'before attending university .',
  'As a result ,',
  'I spent the time of biology class to do my Chemistry research',
  'and got A+ for both biology and chemistry .'])

In [912]:
# Run the aligner on the example; the result has one entry per EDU.
match_edu_list_new(ann_txt, edu_list, dev=False)

[(None, None, None, False),
 (None, None, None, False),
 (None, None, None, False),
 (None, None, None, False),
 (None, None, None, False),
 (None, None, None, False),
 ['I', 11, 12, False],
 (None, None, None, False),
 ['I', 7, 8, False],
 (None, None, None, False),
 (None, None, None, False),
 ['I spent the time of biology class to do my Chemistry research',
  0,
  61,
  True],
 ['and got A+ for both biology and chemistry', 0, 41, False]]

In [35]:
# all_ann_dict[essay][annotation id] -> {"type", "start", "end", "text", "supports", "attacks"[, "stance"]}
# parsed from the tab-separated .ann files: "T" lines define text spans, "A" lines
# attach a stance to a span, "R" lines add a relation from one span to another.
all_ann_dict = {}

for name in tqdm(all_filenames):
    with open(name + ".ann") as f:
        ann = f.readlines()
    tmp = {}
    for raw_line in ann:
        ann_line = raw_line.strip().split("\t")
        record_kind = ann_line[0][:1]

        if record_kind == "T":
            typ, st, en = ann_line[1].split()
            st, en = int(st), int(en)
            tmp[ann_line[0]] = {
                "type": typ,
                "start": st,
                "end": en,
                "text": correct_spelling(ann_line[-1]),
                "supports": [],
                "attacks": [],
            }

        elif record_kind == "A":
            _, t_node, stance = ann_line[1].split()
            tmp[t_node]["stance"] = stance

        elif record_kind == "R":
            typ, a1, a2 = ann_line[1].split()
            a1, a2 = a1.split(":")[-1], a2.split(":")[-1]
            tmp[a1][typ].append(a2)

        else:
            print("\nNOT RECOGNIZED!!\n")

    all_ann_dict[name.split("/")[-1]] = tmp

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 402/402 [00:00<00:00, 1263.29it/s]


In [37]:
# Work on a deep copy so the annotation pass below leaves simple_ann_dict untouched
# and can be restarted by re-running this cell.
simple_ann_dict_bkp = copy.deepcopy(simple_ann_dict)

In [38]:
# Project every text-span ("T") annotation of every essay onto the EDUs of the
# paragraph(s) it overlaps, storing the result as "edu_annotation" (one
# [tag, st, en] per EDU) in simple_ann_dict_bkp. After each essay the rebuilt
# annotation texts are compared with the gold texts; mismatches are collected.
all_unmatched = []
for name in tqdm(all_filenames):
    with open(name+".ann") as f:
        ann = f.readlines()
    
    for i in ann:
        ann_line = i.strip().split("\t")
        
        if ann_line[0].startswith("T"):
            typ, st, en = ann_line[1].split()
            st, en = int(st), int(en)
            ann_txt = spacy_text_process(ann_line[-1])
            ann_txt = correct_spelling(ann_txt)
            
            for para_id, v1 in simple_ann_dict_bkp[name.split("/")[-1]].items():
                # First visit of a paragraph: start with every EDU untagged.
                if v1.get("edu_annotation", None) is None:
                    simple_ann_dict_bkp[name.split("/")[-1]][para_id]["edu_annotation"] = default_annotation(v1["edu_list"])
                    
                # Only paragraphs whose original character range overlaps the annotation's.
                if (v1["orig_en"] > st and v1["orig_st"] < en): 
                    edu_list = match_edu_list_new(ann_txt, v1["edu_list"])
                    annotation = []
                    prev = False
                    
                    # j = (match entry for the EDU, EDU text)
                    for ix2, j in enumerate(list(zip(edu_list, v1["edu_list"]))):
                        # NOTE(review): the matched text is taken from the EDU itself, so the
                        # second clause looks true for every matched EDU, making the first
                        # clause redundant - confirm.
                        if ((prev or j[0][-1]) and j[0][0] is not None and len(j[0][0].strip().split()) > 1) or\
                            (j[0][0] is not None and j[0][0] in j[-1]):
                            annotation.append([ann_line[0], j[0][1], j[0][2]])
                            prev = j[0][-1]
                        else:
                            annotation.append(["O", 0, len(j[1])])

                    # Fold this annotation into the paragraph's running labels, then keep only
                    # the longest contiguous run of each tag.
                    simple_ann_dict_bkp[name.split("/")[-1]][para_id]["edu_annotation"] = merge_annotations(simple_ann_dict_bkp[name.split("/")[-1]][para_id]["edu_annotation"], 
                                                                                                        annotation, name)
                    simple_ann_dict_bkp[name.split("/")[-1]][para_id]["edu_annotation"] = modify_annotation_by_clusters(simple_ann_dict_bkp[name.split("/")[-1]][para_id]["edu_annotation"])
                

    # Raises AssertionError if an annotation id is missing on either side.
    unm = match_annotations(name.split("/")[-1])
    if len(unm) > 0:
        print(len(unm), "Unmatched in",name.split("/")[-1],"!")
    all_unmatched.extend(unm)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 402/402 [01:38<00:00,  4.06it/s]


In [40]:
# Spot-check one annotated paragraph: "edu_annotation" holds one [tag, st, en] per entry of "edu_list".
simple_ann_dict_bkp["essay327"][5]

{'orig_text': 'Overall, it is easy to distinguish between a child having two languages and a child having just his native language while comparing their ability to communicate, understanding other studies, etc, to find that why children should start learning a foreign language in the beginning of the education.',
 'orig_st': 1574,
 'orig_en': 1871,
 'edu_list': ['Overall , it is easy to distinguish between a child',
  'having two languages and a child',
  'having just his native language',
  'while comparing their ability',
  'to communicate ,',
  'understanding other studies , etc ,',
  'to find that why children should start learning a foreign language in the beginning of the education .'],
 'edu_annotation': [['T4', 10, 51],
  ['T4', 0, 32],
  ['T4', 0, 31],
  ['T4', 0, 29],
  ['T4', 0, 16],
  ['T4', 0, 33],
  ['T3', 17, 100]]}

In [41]:
# Persist the EDU-level annotations. The `with` block closes (and flushes) the
# file deterministically; the original passed an anonymous open() handle that
# was never closed explicitly.
with open("./PE_essays_formatted_edu_segments_v1.pkl", "wb") as f:
    pickle.dump(simple_ann_dict_bkp, f)