# Download IDP4+ corpus

In [4]:
!wget https://s3.amazonaws.com/net.tagtog.public/resources/corpora/tagtog_IDP4%2B_anndoc.zip

--2021-08-03 12:05:33--  https://s3.amazonaws.com/net.tagtog.public/resources/corpora/tagtog_IDP4%2B_anndoc.zip
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.104.13
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.104.13|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3083160 (2.9M) [application/zip]
Saving to: ‘tagtog_IDP4+_anndoc.zip’


2021-08-03 12:05:56 (140 KB/s) - ‘tagtog_IDP4+_anndoc.zip’ saved [3083160/3083160]



In [2]:
!unzip -qq tagtog_IDP4+_anndoc.zip -d data/nala

# Prepare IDP4+ corpus to BIO2 format

In [1]:
from utils.nala.readers import HTMLReader
from utils.nala.annotation_readers import AnnJsonAnnotationReader, AnnJsonMergerAnnotationReader
from utils.nala.definers import ExclusiveNLDefiner
from utils.nala.tokenizers import TmVarTokenizer
from utils.nala.spliters import NLTK_SPLITTER
from utils.nala.labelers import BIOLabeler

import math
import pandas as pd
import os

[nltk_data] Downloading package punkt to /home/risubu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
MUT_CLASS_ID = 'e_2'
base_folder = 'data/nala/tagtog_IDP4+_anndoc/tagtog_IDP4'
html_folder = os.path.join(base_folder, 'html')
annjson_folder = os.path.join(base_folder, 'annjson')
dataset = HTMLReader(html_folder).read()
AnnJsonMergerAnnotationReader(
    os.path.join(annjson_folder, 'members'),
    read_only_class_id=MUT_CLASS_ID,
    strategy='union',
    entity_strategy='priority',
    priority=['Ectelion', 'abojchevski', 'sanjeevkrn', 'Shpendi'],
    delete_incomplete_docs=True).annotate(dataset)
dataset



Dataset(157 documents and 3632 entities (['e_2: Counter({False: 3632})']))

In [3]:
base_folder = 'data/nala/tagtog_IDP4+_anndoc/tagtog_nala_anndoc'
html_folder = os.path.join(base_folder, 'nala_plain_html', 'pool')
annjson_folder = os.path.join(base_folder, 'nala_members_json')
nala_anndoc_dataset = HTMLReader(html_folder).read()
AnnJsonMergerAnnotationReader(
    os.path.join(annjson_folder, 'pool'),
    read_only_class_id=MUT_CLASS_ID,
    strategy='union',
    entity_strategy='priority',
    priority=['abojchevski', 'cuhlig', 'jmcejuela'],
    delete_incomplete_docs=True).annotate(nala_anndoc_dataset)
dataset.extend_dataset(nala_anndoc_dataset)
nala_anndoc_dataset = None
dataset

Dataset(503 documents and 4897 entities (['e_2: Counter({False: 4897})']))

In [4]:
base_folder = 'data/nala/tagtog_IDP4+_anndoc/tagtog_nala_discoveries'
html_folder = os.path.join(base_folder, 'html')
annjson_folder = os.path.join(base_folder, 'annjson')
nala_dis_dataset = HTMLReader(html_folder).read()
AnnJsonAnnotationReader(
    annjson_folder,
    read_only_class_id=MUT_CLASS_ID,
    delete_incomplete_docs=True).annotate(nala_dis_dataset)
dataset.extend_dataset(nala_dis_dataset)
nala_dis_dataset = None
dataset



Dataset(552 documents and 5045 entities (['e_2: Counter({False: 5045})']))

In [5]:
definer = ExclusiveNLDefiner()
definer.define(dataset)
dataset

Dataset(552 documents and 5045 entities (['e_2: Counter({0: 4071, 1: 808, 2: 166})']))

In [6]:
NLTK_SPLITTER.split(dataset) 
tokenizer = TmVarTokenizer()
tokenizer.tokenize(dataset=dataset)
dataset

Dataset(552 documents and 5045 entities (['e_2: Counter({0: 4071, 1: 808, 2: 166})']))

In [7]:
# 0 (standard), 1(natural language) or 2 (semi standard)
remove_subclasses = [0] 
dataset.delete_subclass_annotations(subclasses=remove_subclasses)
dataset



Dataset(552 documents and 974 entities (['e_2: Counter({1: 808, 2: 166})']))

In [8]:
labeler = BIOLabeler() 
labeler.label(dataset)
dataset

Dataset(552 documents and 974 entities (['e_2: Counter({1: 808, 2: 166})']))

In [None]:
# '''
# Wrote while testing out regex block of hybrid pipeline
# not required for creating the NER data. 
# NOTE: DO NOT run the cell block above which removes annotations if they aren't in NL form.
# '''

# final_binary = []
# for i, part in enumerate(dataset.parts()):
#     for tokenized_sent, raw_sent in zip(part.sentences, part.sentences_):
#         mark_postitive = 0
#         # sanity check, checking if first character same, very dumb way 
#         assert str(tokenized_sent[0])[0] == raw_sent[0], f'{str(tokenized_sent[0])[0], raw_sent[0]}'
#         for token in tokenized_sent:
#             if mark_postitive:
#                 break
#             for ann in part.annotations:
#                 start = ann.offset
#                 end = ann.offset + len(ann.text)
#                 if start <= token.start < end:
#                     mark_postitive = 1
#                     break
#         final_binary.append([raw_sent, mark_postitive])

# print('Sentences count : ', len(final_binary))

# data = pd.DataFrame(final_binary[:], columns=["Sentence", "Contains mutation or not"])
# data.to_csv(r"data\nala\binary_nala_NOT_NER.csv", index=False, encoding='utf-8')


In [9]:
dataset.prune_sentences(0.1) # tried with 0.8, no change in NER acc 

In [10]:
final = []
for doc_id, doc in dataset.documents.items(): 
    for part_id, part in doc.parts.items():

        for sentence in part.sentences:
            for token in sentence:
                final.append([token.word, token.original_labels[0].value.split('-')[0]])
            final.append([])
len(final)

44062

In [11]:
chunk_size = 10000
total = len(final)
devel_thres = math.ceil(total*0.8)
 
for i in range(math.ceil(len(final)/chunk_size)):
    temp = final[i*chunk_size:(i+1)*chunk_size]
    print(len(temp))

    if i > math.ceil(len(final)//10000)*0.8:
        file = open("data/nala/devel.txt", "a", encoding="utf-8")

    file = open("data/nala/train_dev.txt", "a", encoding="utf-8")

    for index in range(len(temp)):
        if i*chunk_size + index > devel_thres and ok_to_switch:
            file.close()
            file = open("data/nala/devel.txt", "a", encoding="utf-8")
        if temp[index]:
            file.write(str(temp[index][0]) + " " + str(temp[index][1]) + "\n")
            ok_to_switch = False
        else:
            file.write("\n")
            ok_to_switch = True
    file.close()

10000
10000
10000
10000
4062


# Convert BIO2 text files to JSON 

In [12]:
import json 

In [13]:
data = []
with open('data/nala/devel.txt', 'r', encoding="utf-8") as f_in:
    for line in f_in:
        line = line.split()   
        data.append(line) 

token = []
label = []
for row in data:
    if row:
        token.append(row[0])
        label.append(row[1])
    else:
        assert len(token) == len(label)
        # for l in label:
        #     if l not in ['B', 'O', 'I']:
        #         print('Error')
        #         break
        dictionary = { 
            "tokens" : token, 
            "tags" : label, 
        }  
        with open("data/nala/devel.json", "a", encoding="utf-8") as outfile: 
            json.dump(dictionary, outfile)
            outfile.write('\n')
        token = []
        label = []

data = []
with open('data/nala/train_dev.txt', 'r', encoding="utf-8") as f_in:
    for line in f_in:
        line = line.split()   
        data.append(line) 

token = []
label = []
for row in data:
    if row:
        token.append(row[0])
        label.append(row[1])
    else:
        assert len(token) == len(label)
        # for l in label:
        #     if l not in ['B', 'O', 'I']:
        #         print('Error')
        #         break
        dictionary = { 
            "tokens" : token, 
            "tags" : label, 
        }  
        with open("data/nala/train_dev.json", "a", encoding="utf-8") as outfile: 
            json.dump(dictionary, outfile)
            outfile.write('\n')
        token = []
        label = []