In [None]:
# Mount Google Drive at /content/drive. Later cells read data through the
# relative prefix 'drive/MyDrive/...' (presumably with /content as the working
# directory, the Colab default — confirm if run elsewhere).
from google.colab import drive
drive.mount('/content/drive')

# **Setup** 

In [None]:
import pandas as pd

import pickle
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from os import path
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [None]:
# tf.test.gpu_device_name() returns an empty string when no GPU is visible,
# so report that case explicitly instead of printing "Found GPU at: " with nothing after it.
device_name = tf.test.gpu_device_name()
if device_name:
  print('Found GPU at: {}'.format(device_name))
else:
  print('No GPU found; TensorFlow will run on CPU.')

In [None]:
%ls gdrive/MyDrive/CodiEsp/

# **D-subtask** *English*: **Data Loader**

X_train, X_val, X_test: list of *input text data*

Y_train, Y_val, Y_test: lists of multi-hot encoded *labels* (one 0/1 entry per ICD10 code; a document can have several codes set)

In [None]:
# Load the D-subtask annotation tables. The TSVs have no header row; columns
# 0 and 1 are renamed to "Id" and "ICD10".
df_train = pd.read_csv(
    'drive/MyDrive/CodiEsp/train/trainD.tsv', sep='\t', header=None
).rename(columns={0:"Id", 1:"ICD10"})
print("Training Data:")
display(df_train.head())

print("\n\nValidation Data:")
df_val = pd.read_csv(
    'drive/MyDrive/CodiEsp/dev/devD.tsv', sep='\t', header=None
).rename(columns={0:"Id", 1:"ICD10"})
display(df_val.head())

print("\n\nTest Data:")
df_test = pd.read_csv(
    'drive/MyDrive/CodiEsp/test/testD.tsv', sep='\t', header=None
).rename(columns={0:"Id", 1:"ICD10"})
display(df_test.head())

# All three splits stacked, used to enumerate every document id and code.
df = pd.concat([df_train, df_val, df_test])

In [None]:
# Distinct document ids and ICD10 codes, in order of first appearance in `df`.
ids = df['Id'].unique()
codes = df['ICD10'].unique()

# `df` is the train + dev + test concatenation, so the document count covers
# all three splits (the previous message wrongly said "training data").
print("Number of documents (train + dev + test):", len(ids), "\nNumber of ICD10 codes:", len(codes))

In [None]:
# Map each ICD10 code to its column index in the label vector.
code2idx = {code: idx for idx, code in enumerate(codes)}

# One all-zero label vector per document id; one slot per code.
id2label = {doc_id: [0] * len(codes) for doc_id in ids}

# Set the slot for every (document, code) annotation row. Columns are read by
# name: the previous `row[0]` / `row[1]` relied on positional access to a
# label-indexed Series, which pandas has deprecated. Iterating the two columns
# directly also avoids the per-row Series construction done by iterrows().
for doc_id, code in zip(df['Id'], df['ICD10']):
  id2label[doc_id][code2idx[code]] = 1

# Parallel tuples of document ids and their label vectors.
_id2label = list(id2label.items())
ID, Y = zip(*_id2label)

In [None]:
# English stop-word list from the NLTK 'stopwords' corpus (downloaded in the setup cell).
stop_words = stopwords.words('english')

In [None]:
def remstopwords(text, stopwords):
    """Clean a document and drop stop words.

    Steps, in order:
      1. remove ``[** ... **]`` bracketed spans,
      2. remove ``<...>`` tags,
      3. lowercase and collapse every run of non-word characters to one space,
      4. remove digit runs that follow a space (a number at the very start of
         the text has no preceding space and is therefore kept),
      5. drop every whitespace-separated token found in ``stopwords``.

    Args:
        text: Raw document string.
        stopwords: Iterable of stop-word strings (compared after lowercasing
            of ``text``; the stop words themselves are not lowercased).

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Raw strings: the patterns contain regex escapes (\[, \*, \W, \d) that are
    # invalid Python string escapes and trigger warnings in non-raw literals.
    text = re.sub(r'\[\*\*[^\]]*\*\*\]', '', text)
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'[\W]+', ' ', text.lower())
    text = re.sub(r" \d+", " ", text)
    # Set membership is O(1) per token instead of scanning the stop-word list.
    stop_set = set(stopwords)
    return " ".join(tok for tok in text.split() if tok not in stop_set)

In [None]:
# Label vectors, one per training document, in order of first appearance of each id.
Y_train = [id2label[doc_id] for doc_id in df_train['Id'].unique()]

# The cleaned training texts come from a pickle cache rather than being rebuilt.
# The (disabled) builder code read CodiEsp/train/text_files_en/<id>.txt for each
# id, replaced newlines with spaces, applied remstopwords(text.lower(), stop_words)
# and pickled the resulting list to X_train.txt.
# NOTE(review): assumes the cached list is in the same order as Y_train — confirm.
X_train = []
with open("drive/MyDrive/X_train.txt", "rb") as fp:
  X_train = pickle.load(fp)

In [None]:
# Label vectors, one per validation document, in order of first appearance of each id.
Y_val = [id2label[doc_id] for doc_id in df_val['Id'].unique()]

# The cleaned validation texts come from a pickle cache rather than being rebuilt.
# The (disabled) builder code read CodiEsp/dev/text_files_en/<id>.txt for each
# id, replaced newlines with spaces, applied remstopwords(text.lower(), stop_words)
# and pickled the resulting list to X_val.txt.
# NOTE(review): assumes the cached list is in the same order as Y_val — confirm.
X_val = []
with open("drive/MyDrive/X_val.txt", "rb") as fp:
  X_val = pickle.load(fp)

In [None]:
# Label vectors, one per test document, in order of first appearance of each id.
Y_test = [id2label[doc_id] for doc_id in df_test['Id'].unique()]

# The cleaned test texts come from a pickle cache rather than being rebuilt.
# The (disabled) builder code read CodiEsp/test/text_files_en/<id>.txt for each
# id, replaced newlines with spaces and applied remstopwords(text.lower(), stop_words).
# NOTE(review): that disabled code pickled `X_val` (not `X_test`) into X_test.txt,
# which looks like a copy-paste slip — verify the cached file really holds the test texts.
# NOTE(review): assumes the cached list is in the same order as Y_test — confirm.
X_test = []
with open("drive/MyDrive/X_test.txt", "rb") as fp:
  X_test = pickle.load(fp)

In [None]:
# Show the first cached training text as a sanity check.
X_train[0]

In [None]:
# Column indices of codes that are set in at least one training label vector.
seen_in_train = {
  col for label in Y_train for col, flag in enumerate(label) if flag == 1
}

# p_code[i] is 1 when code i occurs in the training labels, else 0.
p_code = [1 if col in seen_in_train else 0 for col in range(len(codes))]

# Codes that only occur in the dev/test annotations.
not_present = p_code.count(0)

print("Number of classes NOT PRESENT in training dataset:", not_present)

In [None]:
def hamming_score(y_true, y_pred, normalize = True, sample_weight = None):
    ''' Compute the Hamming score for the multi-label case: the mean over
        samples of |true ∩ pred| / |true ∪ pred| (per-sample Jaccard index).
        http://stackoverflow.com/q/32239577/395857

        Args:
            y_true: 2-D array-like of 0/1 (or truthy/falsy) ground-truth labels,
                one row per sample.
            y_pred: 2-D array-like of predicted labels, same layout as y_true.
            normalize: Accepted for signature compatibility; currently ignored.
            sample_weight: Accepted for signature compatibility; currently ignored.

        Returns:
            The mean per-sample score as a NumPy float. A sample whose true and
            predicted label sets are both empty scores 1.
    '''
    # Accept plain nested lists as well as arrays (`.shape` is used below).
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        union = set_true | set_pred
        if not union:
            # Nothing expected and nothing predicted: perfect agreement (avoids 0/0).
            acc_list.append(1.0)
        else:
            acc_list.append(len(set_true & set_pred) / float(len(union)))
    return np.mean(acc_list)

def print_score(y_pred, y_t, clf):
    """Print the classifier's class name with its Hamming loss and Hamming score.

    Args:
        y_pred: Predicted multi-label indicator matrix.
        y_t: Ground-truth multi-label indicator matrix.
        clf: The classifier object; only its class name is printed.
    """
    # Imported locally: no import cell in the notebook brings hamming_loss into
    # scope, so the previous version raised NameError when called.
    from sklearn.metrics import hamming_loss

    print("Clf: ", clf.__class__.__name__)
    # Both helpers take (y_true, y_pred); pass the arguments in that order.
    print("Hamming loss: {}".format(hamming_loss(y_t, y_pred)))
    print("Hamming score: {}".format(hamming_score(y_t, y_pred)))
    print("---")

In [None]:
# Reserved ids for the special tokens.
SOS = 0
EOS = 1
PAD = 2
UNK = 3

# token -> (occurrence count, integer id); the special tokens start with count 1.
dictionary = {'START' : (1, SOS), 'END' : (1, EOS), 'PAD' : (1, PAD), 'UNK': (1, UNK)}

max_length = -1
index = 4  # next free id after the four special tokens

# The vocabulary is built from the training texts, then the validation texts.
for corpus in (X_train, X_val):
  for i, doc in enumerate(corpus):
    if i%50 == 0:
      print(f'[INFO] At {i}th line, making the dictionary...')
    tokenized = nltk.word_tokenize(doc)
    max_length = max(max_length, len(tokenized))
    for t in tokenized:
      if t in dictionary:
        count, token_id = dictionary[t]
        dictionary[t] = (count + 1, token_id)
      else:
        dictionary[t] = (1, index)
        index += 1

# One extra slot, presumably for the END token appended to each sequence when indexing.
max_length = max_length + 1
print('-------------------------------------------------------')

print(f'[INFO] Maximum length of the documents is : {max_length}')
print(f'[INFO] Number of words in the dictionary : {len(dictionary)}')

In [None]:
# Convert each document to a list of token ids followed by the END id.
# Train/validation tokens are looked up directly (a token missing from
# `dictionary` raises KeyError).
X_train_indexed = [
  [dictionary[tok][1] for tok in nltk.word_tokenize(doc)] + [dictionary['END'][1]]
  for doc in X_train
]

X_val_indexed = [
  [dictionary[tok][1] for tok in nltk.word_tokenize(doc)] + [dictionary['END'][1]]
  for doc in X_val
]

# Test tokens that are not in `dictionary` fall back to the UNK id.
X_test_indexed = [
  [dictionary.get(tok, dictionary['UNK'])[1] for tok in nltk.word_tokenize(doc)] + [dictionary['END'][1]]
  for doc in X_test
]

In [None]:
# Shared padding settings: right-pad with the PAD id up to max_length, stored as float32.
_pad_kwargs = dict(maxlen=max_length, padding='post', value=dictionary['PAD'][1], dtype='float32')

X_train_indexed = tf.keras.preprocessing.sequence.pad_sequences(X_train_indexed, **_pad_kwargs)
X_val_indexed = tf.keras.preprocessing.sequence.pad_sequences(X_val_indexed, **_pad_kwargs)
X_test_indexed = tf.keras.preprocessing.sequence.pad_sequences(X_test_indexed, **_pad_kwargs)

# Label lists as 2-D arrays (documents x codes).
Y_train_indexed = np.asarray(Y_train)
Y_val_indexed = np.asarray(Y_val)
Y_test_indexed = np.asarray(Y_test)

In [None]:
from sklearn.metrics import f1_score

# **P-subtask** *English*: **Data Loader**

X_train, X_val, X_test: list of *input text data*

Y_train, Y_val, Y_test: lists of multi-hot encoded *labels* (one 0/1 entry per ICD10 code; a document can have several codes set)

In [None]:
# Load the P-subtask annotation tables. The TSVs have no header row; columns
# 0 and 1 are renamed to "Id" and "ICD10". This rebinds the df_* names used by
# the D-subtask cells.
df_train = pd.read_csv(
    'drive/MyDrive/CodiEsp/train/trainP.tsv', sep='\t', header=None
).rename(columns={0:"Id", 1:"ICD10"})
print("Training Data:")
display(df_train.head())

print("\n\nValidation Data:")
df_val = pd.read_csv(
    'drive/MyDrive/CodiEsp/dev/devP.tsv', sep='\t', header=None
).rename(columns={0:"Id", 1:"ICD10"})
display(df_val.head())

print("\n\nTest Data:")
df_test = pd.read_csv(
    'drive/MyDrive/CodiEsp/test/testP.tsv', sep='\t', header=None
).rename(columns={0:"Id", 1:"ICD10"})
display(df_test.head())

# All three splits stacked, used to enumerate every document id and code.
df = pd.concat([df_train, df_val, df_test])

In [None]:
# Distinct document ids and ICD10 codes, in order of first appearance in `df`.
ids = df['Id'].unique()
codes = df['ICD10'].unique()

# `df` is the train + dev + test concatenation, so the document count covers
# all three splits (the previous message wrongly said "training data").
print("Number of documents (train + dev + test):", len(ids), "\nNumber of ICD10 codes:", len(codes))

In [None]:
# Map each ICD10 code to its column index in the label vector.
code2idx = {code: idx for idx, code in enumerate(codes)}

# One all-zero label vector per document id; one slot per code.
id2label = {doc_id: [0] * len(codes) for doc_id in ids}

# Set the slot for every (document, code) annotation row. Columns are read by
# name: the previous `row[0]` / `row[1]` relied on positional access to a
# label-indexed Series, which pandas has deprecated. Iterating the two columns
# directly also avoids the per-row Series construction done by iterrows().
for doc_id, code in zip(df['Id'], df['ICD10']):
  id2label[doc_id][code2idx[code]] = 1

# Parallel tuples of document ids and their label vectors.
_id2label = list(id2label.items())
ID, Y = zip(*_id2label)

In [None]:
# Label vectors, one per training document, in order of first appearance of each id.
Y_train = [id2label[doc_id] for doc_id in df_train['Id'].unique()]

# The cleaned training texts come from a pickle cache rather than being rebuilt.
# The (disabled) builder code read CodiEsp/train/text_files_en/<id>.txt for each
# id, replaced newlines with spaces and applied remstopwords(text.lower(), stop_words).
# NOTE(review): that disabled code wrote X_train.txt, while this cell reads
# X_train_P.txt — presumably the P cache was produced separately; confirm.
# NOTE(review): assumes the cached list is in the same order as Y_train — confirm.
X_train = []
with open("drive/MyDrive/X_train_P.txt", "rb") as fp:
  X_train = pickle.load(fp)

In [None]:
# Label vectors, one per validation document, in order of first appearance of each id.
Y_val = [id2label[doc_id] for doc_id in df_val['Id'].unique()]

# The cleaned validation texts come from a pickle cache rather than being rebuilt.
# The (disabled) builder code read CodiEsp/dev/text_files_en/<id>.txt for each
# id, replaced newlines with spaces and applied remstopwords(text.lower(), stop_words).
# NOTE(review): that disabled code wrote X_val.txt, while this cell reads
# X_val_P.txt — presumably the P cache was produced separately; confirm.
# NOTE(review): assumes the cached list is in the same order as Y_val — confirm.
X_val = []
with open("drive/MyDrive/X_val_P.txt", "rb") as fp:
  X_val = pickle.load(fp)

In [None]:
# Label vectors, one per test document, in order of first appearance of each id.
Y_test = [id2label[doc_id] for doc_id in df_test['Id'].unique()]

# The cleaned test texts come from a pickle cache rather than being rebuilt.
# The (disabled) builder code read CodiEsp/test/text_files_en/<id>.txt for each
# id, replaced newlines with spaces and applied remstopwords(text.lower(), stop_words).
# NOTE(review): that disabled code pickled `X_val` (not `X_test`) and wrote
# X_test.txt, while this cell reads X_test_P.txt — verify the cached file
# really holds the P-subtask test texts.
# NOTE(review): assumes the cached list is in the same order as Y_test — confirm.
X_test = []
with open("drive/MyDrive/X_test_P.txt", "rb") as fp:
  X_test = pickle.load(fp)

In [None]:
# Column indices of codes that are set in at least one training label vector.
seen_in_train = {
  col for label in Y_train for col, flag in enumerate(label) if flag == 1
}

# p_code[i] is 1 when code i occurs in the training labels, else 0.
p_code = [1 if col in seen_in_train else 0 for col in range(len(codes))]

# Codes that only occur in the dev/test annotations.
not_present = p_code.count(0)

print("Number of classes NOT PRESENT in training dataset:", not_present)

In [None]:
# Reserved ids for the special tokens.
SOS = 0
EOS = 1
PAD = 2
UNK = 3

# token -> (occurrence count, integer id); the special tokens start with count 1.
dictionary = {'START' : (1, SOS), 'END' : (1, EOS), 'PAD' : (1, PAD), 'UNK': (1, UNK)}

max_length = -1
index = 4  # next free id after the four special tokens

# The vocabulary is built from the training texts, then the validation texts.
for corpus in (X_train, X_val):
  for i, doc in enumerate(corpus):
    if i%50 == 0:
      print(f'[INFO] At {i}th line, making the dictionary...')
    tokenized = nltk.word_tokenize(doc)
    max_length = max(max_length, len(tokenized))
    for t in tokenized:
      if t in dictionary:
        count, token_id = dictionary[t]
        dictionary[t] = (count + 1, token_id)
      else:
        dictionary[t] = (1, index)
        index += 1

# One extra slot, presumably for the END token appended to each sequence when indexing.
max_length = max_length + 1
print('-------------------------------------------------------')

print(f'[INFO] Maximum length of the documents is : {max_length}')
print(f'[INFO] Number of words in the dictionary : {len(dictionary)}')

In [None]:
# Convert each document to a list of token ids followed by the END id.
# Train/validation tokens are looked up directly (a token missing from
# `dictionary` raises KeyError).
X_train_indexed = [
  [dictionary[tok][1] for tok in nltk.word_tokenize(doc)] + [dictionary['END'][1]]
  for doc in X_train
]

X_val_indexed = [
  [dictionary[tok][1] for tok in nltk.word_tokenize(doc)] + [dictionary['END'][1]]
  for doc in X_val
]

# Test tokens that are not in `dictionary` fall back to the UNK id.
X_test_indexed = [
  [dictionary.get(tok, dictionary['UNK'])[1] for tok in nltk.word_tokenize(doc)] + [dictionary['END'][1]]
  for doc in X_test
]

In [None]:
# Shared padding settings: right-pad with the PAD id up to max_length, stored as float32.
_pad_kwargs = dict(maxlen=max_length, padding='post', value=dictionary['PAD'][1], dtype='float32')

X_train_indexed = tf.keras.preprocessing.sequence.pad_sequences(X_train_indexed, **_pad_kwargs)
X_val_indexed = tf.keras.preprocessing.sequence.pad_sequences(X_val_indexed, **_pad_kwargs)
X_test_indexed = tf.keras.preprocessing.sequence.pad_sequences(X_test_indexed, **_pad_kwargs)

# Label lists as 2-D arrays (documents x codes).
Y_train_indexed = np.asarray(Y_train)
Y_val_indexed = np.asarray(Y_val)
Y_test_indexed = np.asarray(Y_test)

# Model 1 Task D

In [None]:
# NOTE(review): when the notebook is run top to bottom, the P-subtask loader
# cells above have already rebound X_*_indexed, Y_*_indexed and max_length, so
# this "Task D" model is sized and trained on P-subtask data unless the
# D-subtask cells are re-run immediately before this section.
#
# Fully connected multi-label classifier over the padded token-id vectors,
# with one sigmoid output per ICD10 code.
model_1 = tf.keras.models.Sequential([
    # Declare the input width explicitly as the first entry. Previously
    # `input_shape` was passed to the second layer, where Sequential does not
    # use it, so the model was only built on the first call to fit().
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(Y_train_indexed.shape[1], activation='sigmoid')
])

In [None]:
# Binary cross-entropy on the sigmoid outputs scores each code as an independent
# yes/no decision; RMSprop is used with its default settings.
model_1.compile(loss = 'binary_crossentropy' , optimizer = 'rmsprop' , metrics = ['accuracy'])

In [None]:
# Train for 100 epochs; no validation data is passed, so only training metrics are logged.
model_1.fit(X_train_indexed, Y_train_indexed, epochs = 100, batch_size = 64)

In [None]:
# Predict on the training set and binarise: probabilities below 0.2 become 0,
# everything else becomes 1.
predictions_train_1 = model_1.predict(X_train_indexed)
predictions_rounded_train_1 = np.where(predictions_train_1 < 0.2, 0.0, 1.0)

hamming_score(Y_train_indexed, predictions_rounded_train_1)

In [None]:
# Predict on the validation set and binarise: probabilities below 0.2 become 0,
# everything else becomes 1.
predictions_val_1 = model_1.predict(X_val_indexed)
predictions_rounded_val_1 = np.where(predictions_val_1 < 0.2, 0.0, 1.0)

hamming_score(Y_val_indexed, predictions_rounded_val_1)

In [None]:
# Macro-averaged F1 on the validation set: every code column counts equally.
f1_score(Y_val_indexed, predictions_rounded_val_1, average='macro')

In [None]:
# Predict on the test set and binarise: probabilities below 0.2 become 0,
# everything else becomes 1.
predictions_test_1 = model_1.predict(X_test_indexed)
predictions_rounded_test_1 = np.where(predictions_test_1 < 0.2, 0.0, 1.0)

hamming_score(Y_test_indexed, predictions_rounded_test_1)

In [None]:
# Drive is mounted at /content/drive, so the save target is drive/MyDrive/...
# (the other model.save calls use this prefix); 'gdrive/...' does not exist.
model_1.save('drive/MyDrive/model_1.hdf5')

# Model 1 Task P

In [None]:
# Fully connected multi-label classifier over the padded token-id vectors,
# with one sigmoid output per ICD10 code. Rebinds `model_1`, replacing the
# model defined in the "Model 1 Task D" section.
model_1 = tf.keras.models.Sequential([
    # Declare the input width explicitly as the first entry. Previously
    # `input_shape` was passed to the second layer, where Sequential does not
    # use it, so the model was only built on the first call to fit().
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(Y_train_indexed.shape[1], activation='sigmoid')
])

In [None]:
# Binary cross-entropy on the sigmoid outputs scores each code as an independent
# yes/no decision; RMSprop is used with its default settings.
model_1.compile(loss = 'binary_crossentropy' , optimizer = 'rmsprop' , metrics = ['accuracy'])

In [None]:
# Train for 100 epochs; no validation data is passed, so only training metrics are logged.
model_1.fit(X_train_indexed, Y_train_indexed, epochs = 100, batch_size = 64)

In [None]:
# Predict on the training set and binarise: probabilities below 0.2 become 0,
# everything else becomes 1.
predictions_train_1 = model_1.predict(X_train_indexed)
predictions_rounded_train_1 = np.where(predictions_train_1 < 0.2, 0.0, 1.0)

hamming_score(Y_train_indexed, predictions_rounded_train_1)

In [None]:
# Predict on the validation set and binarise: probabilities below 0.2 become 0,
# everything else becomes 1.
predictions_val_1 = model_1.predict(X_val_indexed)
predictions_rounded_val_1 = np.where(predictions_val_1 < 0.2, 0.0, 1.0)

hamming_score(Y_val_indexed, predictions_rounded_val_1)

In [None]:
# Predict on the test set and binarise: probabilities below 0.2 become 0,
# everything else becomes 1.
predictions_test_1 = model_1.predict(X_test_indexed)
predictions_rounded_test_1 = np.where(predictions_test_1 < 0.2, 0.0, 1.0)

hamming_score(Y_test_indexed, predictions_rounded_test_1)

# Model 2

In [None]:
# Deeper, narrower fully connected classifier with batch normalisation before
# most Dense layers and one sigmoid output per ICD10 code.
model_2 = tf.keras.models.Sequential([
    # Declare the input width explicitly as the first entry. Previously
    # `input_shape` was passed to the second layer, where Sequential does not
    # use it, so the model was only built on the first call to fit().
    tf.keras.Input(shape=(max_length,)),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dense(Y_train_indexed.shape[1], activation='sigmoid')
])

In [None]:
# Binary cross-entropy on the sigmoid outputs scores each code as an independent
# yes/no decision; RMSprop is used with its default settings.
model_2.compile(loss='binary_crossentropy' , optimizer='rmsprop' , metrics=['accuracy'])

In [None]:
# Train for 30 epochs; no validation data is passed, so only training metrics are logged.
model_2.fit(X_train_indexed, Y_train_indexed, epochs = 30, batch_size = 64)

In [None]:
# Predict on the training set and binarise at 0.5: probabilities below 0.5
# become 0, everything else becomes 1.
predictions_train_2 = model_2.predict(X_train_indexed)
predictions_rounded_train_2 = np.where(predictions_train_2 < 0.5, 0.0, 1.0)

hamming_score(Y_train_indexed, predictions_rounded_train_2)

In [None]:
# Predict on the validation set and binarise at 0.5: probabilities below 0.5
# become 0, everything else becomes 1.
predictions_val_2 = model_2.predict(X_val_indexed)
predictions_rounded_val_2 = np.where(predictions_val_2 < 0.5, 0.0, 1.0)

hamming_score(Y_val_indexed, predictions_rounded_val_2)

In [None]:
# Macro-averaged F1 on the validation set: every code column counts equally.
f1_score(Y_val_indexed, predictions_rounded_val_2, average='macro')

In [None]:
# Persist the trained model to Drive (HDF5 file).
model_2.save('drive/MyDrive/model_2.hdf5')

# Model 3

In [None]:
# Recurrent classifier: token ids -> 1024-d embeddings -> two stacked
# bidirectional LSTMs -> dense head with one sigmoid output per ICD10 code.
model_3 = tf.keras.models.Sequential()
model_3.add(tf.keras.layers.Embedding(len(dictionary), 1024, input_length = max_length))
# First BiLSTM keeps the full sequence so the second one can consume it.
model_3.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)))
model_3.add(tf.keras.layers.Dropout(0.5))
# Second BiLSTM returns only its final state.
model_3.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)))
model_3.add(tf.keras.layers.Dropout(0.5))
model_3.add(tf.keras.layers.Flatten())
model_3.add(tf.keras.layers.BatchNormalization())
model_3.add(tf.keras.layers.Dense(256, activation = 'relu'))
model_3.add(tf.keras.layers.Dense(Y_train_indexed.shape[1], activation='sigmoid'))

In [None]:
# Print the layer-by-layer architecture and parameter counts.
model_3.summary()

In [None]:
# early = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=10, verbose=0, restore_best_weights=True)
# Adam with gradient-norm clipping at 1. `learning_rate` is the supported
# keyword; the old `lr` alias is deprecated and rejected by newer Keras releases.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.002, clipnorm=1)

In [None]:
# Binary cross-entropy on the sigmoid outputs, optimised with the Adam instance defined above.
model_3.compile(loss='binary_crossentropy' , optimizer=optimizer , metrics=['accuracy'])

In [None]:
# Train for 40 epochs, reporting loss/accuracy on the validation split after each epoch.
model_3.fit(X_train_indexed, Y_train_indexed, epochs=40, batch_size=64, validation_data=(X_val_indexed, Y_val_indexed,))

In [None]:
# Predict on the validation set and binarise: probabilities below 0.2 become 0,
# everything else becomes 1.
predictions_val_3 = model_3.predict(X_val_indexed)
predictions_rounded_val_3 = np.where(predictions_val_3 < 0.2, 0.0, 1.0)

hamming_score(Y_val_indexed, predictions_rounded_val_3)

In [None]:
# Macro-averaged F1 on the validation set: every code column counts equally.
f1_score(Y_val_indexed, predictions_rounded_val_3, average='macro')

In [None]:
# Persist the trained model to Drive (HDF5 file); the "Final Model" section reloads this file.
model_3.save('drive/MyDrive/model_3.hdf5')

## Final Model

In [None]:
# Reload the saved model from Drive, replacing the in-memory `model_3`.
model_3 = tf.keras.models.load_model('drive/MyDrive/model_3.hdf5')

In [None]:
# Print the architecture of the reloaded model.
model_3.summary()

In [None]:
# Predict on the validation set with the reloaded model and binarise:
# probabilities below 0.2 become 0, everything else becomes 1.
predictions_val_3 = model_3.predict(X_val_indexed)
predictions_rounded_val_3 = np.where(predictions_val_3 < 0.2, 0.0, 1.0)

hamming_score(Y_val_indexed, predictions_rounded_val_3)

In [None]:
# Macro-averaged F1 of the reloaded model on the validation set: every code column counts equally.
f1_score(Y_val_indexed, predictions_rounded_val_3, average='macro')