In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd "/content/drive/My Drive/Colab Notebooks/github_repo/w266_final/"

/content/drive/.shortcut-targets-by-id/1kzskPlOj9PLeYfK_UrNXwaloxbg5EzRW/github_repo/w266_final


# I2b2 dataset
Contents of the 2018 Task2 challenge:  
This dataset was created to identify Adverse Drug Events and Medication Extraction in EHRs. This challenge focused on three tasks:
- Identifying concepts: drug names, dosages, durations, etc.  
- Identifying relations: relation of drugs with ADE's and other entities given gold standard entities (generated by human annotators). 
- Running an end-to-end model that identifies relation of drugs with ADE's and other entittes on system predicted entitites.  

See documentation for more details.  

The training data is composed of individual notes (.txt extension) and corresponding individual annotation files (.ann extension).   
Annotation files contain tags (labeled with a leading 'T') and relations (labeled with a leading 'R'):
- For tags, the structure is: Tag_id, Tag_entity, Start_character_loc, End_character_loc  
- For relations, the structure is: Relation_id, Relation_entity, Arg1:Tag_id, Arg2:Tag_id  

In [3]:
import pandas as pd
import i2b2_evaluate as i2b2e
import glob, os
import sys, io
import re
import matplotlib.pyplot as plt
import numpy as np
import nltk
import statistics as stats
import itertools
from nltk.tokenize import sent_tokenize 
from pathlib import Path, PureWindowsPath, PurePosixPath

%matplotlib inline

In [4]:
#test_path = '/Users/valeriemeausoone/Documents/W266/github_repo/w266_final/data/i2b2/2018/training_20180910/training_20180910/100035.ann'
# file_path = '/Users/valeriemeausoone/Documents/W266/github_repo/w266_final/data/i2b2/2018/training_20180910/training_20180910/'
file_path ='data/i2b2/2018/training_20180910/training_20180910/'
# os.chdir(file_path)
data_folder = Path(file_path)
# list(data_folder.glob('**/*.txt'))

    

# Processing training data for BERT

## Sentence tokenizing

In [5]:
list_text_files = list(data_folder.glob('**/*.txt'))
list_ann_files = list(data_folder.glob('**/*.ann'))

text_directory = sorted(list_text_files)
ann_directory = sorted(list_ann_files)
                                        
list_files=[]
for file in text_directory:
    with open(file, 'rb') as f:
        text=f.read().decode("utf-8")
        list_files.append(text)

In [6]:
print(text_directory[0:10])
print(ann_directory[0:10])
print(len(list_files))

[PosixPath('data/i2b2/2018/training_20180910/training_20180910/100035.txt'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100039.txt'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100187.txt'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100229.txt'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100564.txt'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100579.txt'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100590.txt'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100677.txt'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100847.txt'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100883.txt')]
[PosixPath('data/i2b2/2018/training_20180910/training_20180910/100035.ann'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100039.ann'), PosixPath('data/i2b2/2018/training_20180910/training_20180910/100187.ann'), PosixPath(

In [7]:
def sentence_tokenization(text):
    '''Splitting discharge summaries into sentences. Because discharge summaries are not consistently organized,
    extra processing is done to clean-up sentences and phrases. Chunks of texts are kept together to avoid splitting
    phrases too granularly'''
    #Using NLTK's sent_tokenize 
    sentence_tokens = sent_tokenize(text) 
    
    #Splititng paragraphs
    sentence_tokens2 = [paragraph for sentence in sentence_tokens for paragraph in sentence.split("\n\n\n")]
        
    #Removing sentences that are too short: only one dot (.) or a numerical bullet point (1., 2., 3.., ...10., etc.)    
    sentence_tokens3 = [sentence.strip() for sentence in sentence_tokens2 if (sentence != ".") or (re.match(r'\d*\.', sentence) is None)]

    #Cleaning up line breaks and replacing them with empty spaces
    sentence_tokens_clean = [sentence.replace('\n', ' ') for sentence in sentence_tokens3]
    
    #Saving results as dataframe 
    #sentences = pd.DataFrame(sentence_tokens_clean)
    #sentences = sentences.rename(columns={0:"sentences"})
    
    return sentence_tokens_clean

In [8]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [9]:
#100035.txt and 100039.txt
for file in list_files[0:2]:
    print(sentence_tokenization(file)[0])
    sentences = sentence_tokenization(file)

Admission Date:  [**2115-2-22**]              Discharge Date:   [**2115-3-19**]  Date of Birth:  [**2078-8-9**]             Sex:   M  Service: MEDICINE  Allergies: Vicodin  Attending:[**First Name3 (LF) 4891**] Chief Complaint: Post-cardiac arrest, asthma exacerbation  Major Surgical or Invasive Procedure: Intubation Removal of chest tubes placed at an outside hospital R CVL placement
Admission Date:  [**2174-4-18**]              Discharge Date:   [**2174-5-17**]  Date of Birth:  [**2135-11-15**]             Sex:   F  Service: MEDICINE  Allergies: Prochlorperazine / Heparin Agents  Attending:[**First Name3 (LF) 3918**] Chief Complaint: Abdominal Pain  Major Surgical or Invasive Procedure: Upper GI series with small bowel follow through Right heart catheterization IR guided paracentesis


## Linking with relations

In [10]:
def annotations_processing(file):
    '''This function processes the annotation files into dataframes (relation and concept). 
    It then combines these dataframes to create an enhanced relations dictionary'''
    
    #Reading the annotation file into a combined dataframe. 
    ann_df = pd.read_csv(file, sep="\t", header=None)
    ann_df = ann_df.rename(columns={0:"tag", 1:"description", 2:"text"})

    #Splitting concept entities and relations
    #Relations dataframe
    null_entries = pd.isnull(ann_df["text"])
    rf_df = ann_df[null_entries]
    rf_df = rf_df.rename(columns={'tag':"relation_id", 'description':"relation_description", 'text': 'relation_text'})
    
    #Cleaning up

    rf_df[['relation','arg1', 'arg2']] = rf_df['relation_description'].str.split(' ',expand=True)
    rf_df[['arg1_delete','arg1_keep']] = rf_df['arg1'].str.split(':',expand=True)
    rf_df[['arg2_delete','arg2_keep']] = rf_df['arg2'].str.split(':',expand=True)
    rf_df = rf_df.drop(columns=['relation_text', 'arg1', 'arg2', 'arg1_delete', 'arg2_delete'])
    rf_df = rf_df.rename(columns={'arg1_keep':"arg1", 'arg2_keep':"arg2"})
    
    
    #Concepts dataframe
    entries = pd.notnull(ann_df["text"])
    tag_df = ann_df[entries]
    tag_df = tag_df.rename(columns={'tag':"concept_id", 'description':"concept_description", 'text': 'concept_text'})

    #Combining relations and tags dataframes to create an enhanced relations dataframe
    rf_df = pd.merge(rf_df, tag_df, left_on = 'arg1', right_on='concept_id')
    rf_df = rf_df.rename(columns={'concept_id': 'arg1_id', 'concept_description':"arg1_description", 'concept_text':"arg1_text"})
    rf_df = pd.merge(rf_df, tag_df, left_on = 'arg2', right_on='concept_id')
    rf_df = rf_df.rename(columns={'concept_id': 'arg2_id', 'concept_description':"arg2_description", 'concept_text':"arg2_text"})
    rf_df = rf_df.drop(columns=['arg1_id', 'arg2_id'])

    #Creating a relations dictionary
    #Note that there could be "duplicate" relations that we will have to re-identify later. 
    dict_relation = {}
    for sentence in sentences: 
        for i in range(len(rf_df)):
            arg1 = rf_df['arg1_text'][i]
            arg2 = rf_df['arg2_text'][i]
            relation = rf_df['relation'][i]
            dict_relation[(arg1, arg2)] = relation
    return dict_relation

In [11]:
list_relations = [annotations_processing(file) for file in ann_directory]
len(list_relations)

303

## Compiling the dataframe

In [12]:
file_num=0
relation_sentences=[]
errors= 0
for file in list_files:
    
    #implementing sentence tokenization
    sentences = np.array(sentence_tokenization(file))

    #listing entities that make up relations
    list_entities = list(list_relations[file_num].keys())
    
    #looking for relation tags in sentences and pulling out sentences. 
    for e in list_entities:        
        new_e = e
        arg1_indices = np.where(np.char.find(sentences, new_e[0])>=0)[0]
        if arg1_indices.size==0: 
            new_e = list(new_e)
            new_e[0] = new_e[0].replace(' ', '')
            new_e = tuple(new_e)
            arg1_indices = np.where(np.char.find(sentences, new_e[0])>=0)[0]

        arg2_indices = np.where(np.char.find(sentences, new_e[1])>=0)[0]
        if arg2_indices.size==0: 
            new_e = list(new_e)
            new_e[1] = new_e[1].replace(' ', '')
            new_e = tuple(new_e)
            arg2_indices = np.where(np.char.find(sentences, new_e[1])>=0)[0]

        #extract where minimum. 
        combinations = [(i,j,abs(i-j)) for i,j in list(itertools.product(arg1_indices, arg2_indices))]
        try:
            min_distance = min(combinations, key = lambda t: t[2])[2]
        except ValueError:
            min_distance = "none"
        if min_distance != "none":
            min_combinations = [(t[0], t[1]) for t in combinations if t[2] == min_distance]
            for c in min_combinations:
                if c[0]==c[1]:
                    include_sentence = sentences[c[0]]
                    include_sentence = include_sentence.replace(new_e[0], ("SUB_B " + new_e[0] + " SUB_E"))
                    include_sentence = include_sentence.replace(new_e[1], ("OBJ_B " + new_e[1] + " OBJ_E"))
                    relation_sentences.append((new_e, list_relations[file_num][e], include_sentence))
                    sentences.tolist().pop(c[0])
                elif c[0]!=c[1]:
                    include_sentence = sentences[c[0]] + " " + sentences[c[1]]
                    include_sentence = include_sentence.replace(new_e[0], ("SUB_B " + new_e[0] + " SUB_E"))
                    include_sentence = include_sentence.replace(new_e[1], ("OBJ_B " + new_e[1] + " OBJ_E"))
                    relation_sentences.append((new_e, list_relations[file_num][e], include_sentence))
                    sentences.tolist().pop(c[0])
                    sentences.tolist().pop(c[1])
                    
    for s in range(len(sentences)):
        relation_sentences.append(("none", "no relation", sentences[s]))   
    #print("output length", len(relation_sentences))
    
    file_num+=1

In [14]:
print("Number of sentences", len(relation_sentences))

Number of sentences 76318


In [15]:
#Generating a dataframe
train_df = pd.DataFrame(relation_sentences)
train_df = train_df.rename(columns={0:"args", 1:"relation", 2: "sentence"})
train_df.head()

Unnamed: 0,args,relation,sentence
0,"(recurrent seizures, ativan)",Reason-Drug,He also may have SUB_B recurrent seizures SUB_...
1,"(IM, ativan)",Route-Drug,He also may have recurrent seizures which shou...
2,"(IV, ativan)",Route-Drug,He also may have recurrent seizures which shou...
3,"(25mg, Topiramate)",Strength-Drug,-patient will be on OBJ_B Topiramate OBJ_E SUB...
4,"(PO, Topiramate)",Route-Drug,-patient will be on OBJ_B Topiramate OBJ_E 25m...


In [16]:
train_df['relation'].value_counts()

no relation       40955
Strength-Drug      6781
Frequency-Drug     6484
Route-Drug         5925
Reason-Drug        5263
Form-Drug          4643
Dosage-Drug        4455
ADE-Drug           1167
Duration-Drug       645
Name: relation, dtype: int64

In [17]:
train_df['relation'].unique()

array(['Reason-Drug', 'Route-Drug', 'Strength-Drug', 'Frequency-Drug',
       'Duration-Drug', 'Form-Drug', 'Dosage-Drug', 'ADE-Drug',
       'no relation'], dtype=object)

In [18]:
print("Percentage without a relation:", 40955*100/len(train_df))

Percentage without a relation: 53.66361801934013


In [19]:
label_list = train_df['relation'].unique()

label_to_ids_map =  {label: i for i, label in enumerate(label_list)}

print(label_to_ids_map)

{'Reason-Drug': 0, 'Route-Drug': 1, 'Strength-Drug': 2, 'Frequency-Drug': 3, 'Duration-Drug': 4, 'Form-Drug': 5, 'Dosage-Drug': 6, 'ADE-Drug': 7, 'no relation': 8}


In [20]:
def to_label_id(series):
    return label_to_ids_map[series]

train_df_bert = train_df.copy()
train_df_bert = pd.DataFrame({
    'id':range(len(train_df)),
    'label': train_df['relation'].apply(to_label_id),
    'alpha':['a']*train_df.shape[0],
    'text': train_df['sentence']
})

In [21]:
train_df_bert.head()

Unnamed: 0,id,label,alpha,text
0,0,0,a,He also may have SUB_B recurrent seizures SUB_...
1,1,1,a,He also may have recurrent seizures which shou...
2,2,1,a,He also may have recurrent seizures which shou...
3,3,2,a,-patient will be on OBJ_B Topiramate OBJ_E SUB...
4,4,1,a,-patient will be on OBJ_B Topiramate OBJ_E 25m...


In [22]:
train_df_bert.to_csv("i2b2_train_bert.csv",  sep='\t', index=False, header=False)

In [23]:
train_df_bert.to_csv("project_re/data/train.tsv", sep = '\t', index=False, header=False)

# Processting test data for BERT

In [24]:
#test_path = '/Users/valeriemeausoone/Documents/W266/github_repo/w266_final/data/i2b2/2018/training_20180910/training_20180910/100035.ann'
# file_path = '/Users/valeriemeausoone/Documents/W266/github_repo/w266_final/data/i2b2/2018/gold_standard_test/'
# os.chdir(file_path)
file_path ='data/i2b2/2018/gold_standard_test/'
# os.chdir(file_path)
data_folder = Path(file_path)

## Sentence Tokenizing

In [25]:
# text_directory = sorted(glob.glob("*.txt"))
# ann_directory = sorted(glob.glob("*.ann"))
list_text_files = list(data_folder.glob('**/*.txt'))
list_ann_files = list(data_folder.glob('**/*.ann'))

text_directory = sorted(list_text_files)
ann_directory = sorted(list_ann_files)

list_files=[]
for file in text_directory:
    with open(file, 'rb') as f:
        text=f.read().decode("utf-8")
        list_files.append(text)

In [26]:
list_relations = [annotations_processing(file) for file in ann_directory]
len(list_relations)

201

In [27]:
file_num=0
relation_sentences=[]
errors= 0
for file in list_files:
    
    #implementing sentence tokenization
    sentences = np.array(sentence_tokenization(file))

    #listing entities that make up relations
    list_entities = list(list_relations[file_num].keys())
    
    #looking for relation tags in sentences and pulling out sentences. 
    for e in list_entities:        
        new_e = e
        arg1_indices = np.where(np.char.find(sentences, new_e[0])>=0)[0]
        if arg1_indices.size==0: 
            new_e = list(new_e)
            new_e[0] = new_e[0].replace(' ', '')
            new_e = tuple(new_e)
            arg1_indices = np.where(np.char.find(sentences, new_e[0])>=0)[0]

        arg2_indices = np.where(np.char.find(sentences, new_e[1])>=0)[0]
        if arg2_indices.size==0: 
            new_e = list(new_e)
            new_e[1] = new_e[1].replace(' ', '')
            new_e = tuple(new_e)
            arg2_indices = np.where(np.char.find(sentences, new_e[1])>=0)[0]

        #extract where minimum. 
        combinations = [(i,j,abs(i-j)) for i,j in list(itertools.product(arg1_indices, arg2_indices))]
        try:
            min_distance = min(combinations, key = lambda t: t[2])[2]
        except ValueError:
            min_distance = "none"
        if min_distance != "none":
            min_combinations = [(t[0], t[1]) for t in combinations if t[2] == min_distance]
            for c in min_combinations:
                if c[0]==c[1]:
                    include_sentence = sentences[c[0]]
                    include_sentence = include_sentence.replace(new_e[0], ("SUB_B " + new_e[0] + " SUB_E"))
                    include_sentence = include_sentence.replace(new_e[1], ("OBJ_B " + new_e[1] + " OBJ_E"))
                    relation_sentences.append((new_e, list_relations[file_num][e], include_sentence))
                    sentences.tolist().pop(c[0])
                elif c[0]!=c[1]:
                    include_sentence = sentences[c[0]] + " " + sentences[c[1]]
                    include_sentence = include_sentence.replace(new_e[0], ("SUB_B " + new_e[0] + " SUB_E"))
                    include_sentence = include_sentence.replace(new_e[1], ("OBJ_B " + new_e[1] + " OBJ_E"))
                    relation_sentences.append((new_e, list_relations[file_num][e], include_sentence))
                    sentences.tolist().pop(c[0])
                    sentences.tolist().pop(c[1])
                    
    for s in range(len(sentences)):
        relation_sentences.append(("none", "no relation", sentences[s]))   
    #print("output length", len(relation_sentences))
    
    file_num+=1
    
print("Number of sentences", len(relation_sentences))

Number of sentences 50010


In [28]:
#Generating a dataframe
test_df = pd.DataFrame(relation_sentences)
test_df = test_df.rename(columns={0:"args", 1:"relation", 2: "sentence"})

test_df['relation'].value_counts()

no relation       27438
Strength-Drug      4274
Frequency-Drug     4090
Route-Drug         3723
Reason-Drug        3464
Form-Drug          3024
Dosage-Drug        2802
ADE-Drug            775
Duration-Drug       420
Name: relation, dtype: int64

In [29]:
test_df.head()

Unnamed: 0,args,relation,sentence
0,"(q.i.d, Decadron)",Frequency-Drug,"MEDICATIONS: Lipitor, Tylenol with Codeine, D..."
1,"(one week, Decadron)",Duration-Drug,tapered over SUB_B one week SUB_E and disconti...
2,"(ophthalmic involvement, Oxacillin)",Reason-Drug,She was started on prophylactic OBJ_B Oxacilli...
3,"(q.1 hour, Pred Forte)",Frequency-Drug,The patient's course in the Intensive Care Uni...
4,"(eye, Pred Forte)",Route-Drug,The patient's course in the Intensive Care Uni...


In [30]:
print("Percentage without a relation:", 27438*100/len(test_df))

Percentage without a relation: 54.86502699460108


In [31]:
label_list = test_df['relation'].unique()

### DO NOT RERUN THIS _ IT IS WRONG. Use from the training data instead!!!!
##label_to_ids_map =  {label: i for i, label in enumerate(label_list)}
print(label_to_ids_map)

{'Reason-Drug': 0, 'Route-Drug': 1, 'Strength-Drug': 2, 'Frequency-Drug': 3, 'Duration-Drug': 4, 'Form-Drug': 5, 'Dosage-Drug': 6, 'ADE-Drug': 7, 'no relation': 8}


In [32]:


def to_label_id(series):
    return label_to_ids_map[series]

test_df_bert = test_df.copy()
test_df_bert = pd.DataFrame({
    'id':range(len(test_df)),
    'label': test_df['relation'].apply(to_label_id),
    'alpha':['a']*test_df.shape[0],
    'text': test_df['sentence']
})

test_df_bert.head()

Unnamed: 0,id,label,alpha,text
0,0,3,a,"MEDICATIONS: Lipitor, Tylenol with Codeine, D..."
1,1,4,a,tapered over SUB_B one week SUB_E and disconti...
2,2,0,a,She was started on prophylactic OBJ_B Oxacilli...
3,3,3,a,The patient's course in the Intensive Care Uni...
4,4,1,a,The patient's course in the Intensive Care Uni...


In [33]:
test_df_bert.to_csv("i2b2_test_bert.csv",  index=False, header=False)

In [None]:
!pwd

/content/drive/.shortcut-targets-by-id/1kzskPlOj9PLeYfK_UrNXwaloxbg5EzRW/github_repo/w266_final


In [34]:
test_df_bert.to_csv("project_re/data/dev.tsv", sep = '\t', index=False, header=False)

# Exploring the prepared data

In [None]:
import pandas as pd

In [None]:
!head -50 'data/dev.tsv'

0	0	a	"MEDICATIONS:  Lipitor, Tylenol with Codeine, Dilantin, previously on OBJ_B Decadron OBJ_E SUB_B q.i.d SUB_E."
1	1	a	"tapered over SUB_B one week SUB_E and discontinued a week ago. MEDICATIONS:  Lipitor, Tylenol with Codeine, Dilantin, previously on OBJ_B Decadron OBJ_E q.i.d."
2	2	a	"She was started on prophylactic OBJ_B Oxacillin OBJ_E to cover skin flora, and Dermatology was consulted along with Neurology and Ophthalmology for the SUB_B ophthalmic involvement SUB_E."
3	0	a	"The patient's course in the Intensive Care Unit was uneventful, and she was discharged to the floor with very close monitoring which included SUB_B q.1 hour SUB_E OBJ_B Pred Forte OBJ_E application to the eye and close consultation with Ophthalmology."
4	3	a	"The patient's course in the Intensive Care Unit was uneventful, and she was discharged to the floor with very close monitoring which included q.1 hour OBJ_B Pred Forte OBJ_E application to the SUB_B eye SUB_E and close consultation with Ophthalmology."