In [None]:
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


# **Setup**

---



In [1]:
import numpy as np
import pandas as pd

import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

import re
from os import path
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

from sklearn.metrics import average_precision_score, f1_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# **D-subtask** *English*: **Data Loader**

X_train, X_val, X_test: list of *input text data*

Y_train, Y_val, Y_test: list of one-hot encoded *labels*

---




In [3]:
df_train = pd.read_csv('drive/MyDrive/CodiEsp/train/trainD.tsv', sep = '\t', header = None)
df_train.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
print("Training Data:")
display(df_train.head())

print("\n\nValidation Data:")
df_val = pd.read_csv('drive/MyDrive/CodiEsp/dev/devD.tsv', sep = '\t', header = None)
df_val.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
display(df_val.head())

print("\n\nTest Data:")
df_test = pd.read_csv('drive/MyDrive/CodiEsp/test/testD.tsv', sep = '\t', header = None)
df_test.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
display(df_test.head())

df = pd.concat([df_train, df_val, df_test])

Training Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000700014-1,n44.8
1,S0004-06142005000700014-1,z20.818
2,S0004-06142005000700014-1,r60.9
3,S0004-06142005000700014-1,r52
4,S0004-06142005000700014-1,a23.9




Validation Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000900016-1,q62.11
1,S0004-06142005000900016-1,n28.89
2,S0004-06142005000900016-1,n39.0
3,S0004-06142005000900016-1,r31.9
4,S0004-06142005000900016-1,n23




Test Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000500011-1,s22.49xa
1,S0004-06142005000500011-1,n28.1
2,S0004-06142005000500011-1,r69
3,S0004-06142005000500011-1,f17.210
4,S0004-06142005000500011-1,r31.9


In [4]:
ids = df['Id'].unique()
codes = df['ICD10'].unique()  

print("Number of documents in training data:", len(ids), "\nNumber of ICD10 codes:", len(codes))

Number of documents in training data: 1000 
Number of ICD10 codes: 2557


In [5]:
code2idx = {}
for i in range(len(codes)):
  code2idx[codes[i]] = i

id2label = {}
for i in range(len(ids)):
  id2label[ids[i]] = [0]*len(codes)

for i, data in df.iterrows():
  _id = data[0]
  _code = data[1]
  id2label[_id][code2idx[_code]] = 1

_id2label = [(id, y) for id, y in id2label.items()]
ID, Y = zip(*_id2label)

In [6]:
def remstopwords(text, stopwords):
    text = re.sub('\[\*\*[^\]]*\*\*\]', '', text)
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', ' ', text.lower()) 
    text = re.sub(" \d+", " ", text)
    return " ".join([i for i in text.split() if i not in stopwords])

stop_words = stopwords.words('english')

In [13]:
X_train = []
Y_train = []

count = 0 
for id in (df_train['Id'].unique()):
  if count == 4:
    break
  count += 1
  Y_train.append(id2label[id])

  with open('drive/MyDrive/CodiEsp/train/text_files_en/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  # X_train.append(remstopwords(text.lower(), stop_words))
  X_train.append(text.lower())

In [None]:
X_val = []
Y_val = []

for id in (df_val['Id'].unique()):
  Y_val.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/dev/text_files_en/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_val.append(remstopwords(text.lower(), stop_words))

In [None]:
X_test = []
Y_test = []

for id in (df_test['Id'].unique()):
  Y_test.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/test/text_files_en/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_test.append(remstopwords(text.lower(), stop_words))

In [None]:
p_code = [0]*len(codes)
for label in Y_train:
  for i, code in enumerate(label):
    if (code == 1):
      p_code[i] = 1

not_present = 0
for i, present in enumerate(p_code):
  if (present == 0):
    not_present += 1

print("Number of classes NOT PRESENT in training dataset:", not_present)

Number of classes NOT PRESENT in training dataset: 790


# **D-subtask** *English*: **Training**
 

*   Feature Extraction:
  * Bag Of Words
  * Term Frequency Inverse Document Frequency
*   Models:
  * Multinomial Naive Bayesian
  * SGDClassifier
  * Logistic Regression
  * Support Vector Classifier
  * Random Forest Classifier
  * AdaBoost Classifier
* Hyper-parameter tuning

---

In [None]:
nb_clf = MultinomialNB()
sgd = SGDClassifier()
lr = LogisticRegression()
rf = RandomForestClassifier()
ada_clf = AdaBoostClassifier()
scv = SVC()

In [None]:
def hamming_score(y_true, y_pred):
    ''' Compute the Hamming score (a.k.a. label-based accuracy) for the multi-label case
        http://stackoverflow.com/q/32239577/395857 '''
        
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set(np.where(y_true[i])[0])
        set_pred = set(np.where(y_pred[i])[0])
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/float(len(set_true.union(set_pred)))
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [None]:
def print_score(y_true, y_pred, clf):
    print("\t\tF1 score: {}".format(f1_score(y_true, y_pred, average = 'macro')))
    print("\t\tmAP score: {}".format(average_precision_score(y_true, y_pred)))
    print("\t\tHamming score: {}".format(hamming_score(y_true, y_pred)))   

In [None]:
def trainModels(x_t, y_t, x_d, y_d, x_T, y_T):
  for classifier in [nb_clf, sgd, rf, scv, ada_clf, lr]:
    clf = OneVsRestClassifier(classifier)
    clf.fit(x_t, y_t)

    print('-'*50)
    print("Clf:", classifier.__class__.__name__)

    print('\tValidation:')
    y_pred = clf.predict(x_d)
    print_score(np.array(y_d), np.array(y_pred), classifier)

    print('\tTest:')
    y_pred = clf.predict(x_T)
    print_score(np.array(y_T), np.array(y_pred), classifier)

In [None]:
count_vect = CountVectorizer()
count_vect = count_vect.fit(X_train)

tfidf_transformer = TfidfVectorizer()
tfidf_transformer = tfidf_transformer.fit(X_train)

In [None]:
trainModels(count_vect.transform(X_train), Y_train, count_vect.transform(X_val), Y_val, count_vect.transform(X_test), Y_test)

--------------------------------------------------
Clf: MultinomialNB
	Validation:
		F1 score: 0.0008433855249689311
		mAP score: nan
		Hamming score: 0.019213579256427554
	Test:
		F1 score: 0.0015046113025406278
		mAP score: nan
		Hamming score: 0.022712685748367906
--------------------------------------------------
Clf: SGDClassifier
	Validation:
		F1 score: 0.012920174137261639
		mAP score: nan
		Hamming score: 0.07136105523526684
	Test:
		F1 score: 0.013530463159860943
		mAP score: nan
		Hamming score: 0.08852474614197252
--------------------------------------------------
Clf: RandomForestClassifier
	Validation:
		F1 score: 0.0003579916368340303
		mAP score: nan
		Hamming score: 0.004673021620517118
	Test:
		F1 score: 0.0005215278042918857
		mAP score: nan
		Hamming score: 0.008062783191344607
--------------------------------------------------
Clf: SVC
	Validation:
		F1 score: 0.0001751279615964794
		mAP score: nan
		Hamming score: 0.004985191760423958
	Test:
		F1 score: 0.00020585

In [None]:
trainModels(tfidf_transformer.transform(X_train), Y_train, tfidf_transformer.transform(X_val), Y_val, tfidf_transformer.transform(X_test), Y_test)

--------------------------------------------------
Clf: MultinomialNB
	Validation:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
	Test:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
--------------------------------------------------
Clf: SGDClassifier
	Validation:
		F1 score: 0.007555683835451947
		mAP score: nan
		Hamming score: 0.0479571718489056
	Test:
		F1 score: 0.011319166338259766
		mAP score: nan
		Hamming score: 0.07289803104176616
--------------------------------------------------
Clf: RandomForestClassifier
	Validation:
		F1 score: 0.0003860416130236399
		mAP score: nan
		Hamming score: 0.006775848881081079
	Test:
		F1 score: 0.0003931924952345164
		mAP score: nan
		Hamming score: 0.009089694005810948
--------------------------------------------------
Clf: SVC
	Validation:
		F1 score: 1.504166541319455e-05
		mAP score: nan
		Hamming score: 0.00023529411764705883
	Test:
		F1 score: 0.00027636553252509454
		mAP score: nan
		Hamming score: 0.001
----------------

# **P-subtask** *English*: **Data Loader**
---

In [None]:
df_train = pd.read_csv('gdrive/MyDrive/CodiEsp/train/trainP.tsv', sep = '\t', header = None)
df_train.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
print("Training Data:")
display(df_train.head())

print("\n\nValidation Data:")
df_val = pd.read_csv('gdrive/MyDrive/CodiEsp/dev/devP.tsv', sep = '\t', header = None)
df_val.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
display(df_val.head())

print("\n\nTest Data:")
df_test = pd.read_csv('gdrive/MyDrive/CodiEsp/test/testP.tsv', sep = '\t', header = None)
df_test.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
display(df_test.head())

df = pd.concat([df_train, df_val, df_test])

Training Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000700014-1,bw03zzz
1,S0004-06142005000700014-1,3e02329
2,S0004-06142005000700014-1,bw40zzz
3,S0004-06142005000700014-1,bv44zzz
4,S0004-06142005000700014-1,bn20




Validation Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000900016-1,bt41zzz
1,S0004-06142005000900016-1,ct13
2,S0004-06142005001000011-1,3e1m39z
3,S0004-06142005001000011-1,0tcb
4,S0004-06142005001000011-1,bt02




Test Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000500011-1,0ttb
1,S0004-06142005000500011-1,bv49zzz
2,S0004-06142005000500011-1,0djdxzz
3,S0004-06142005000500011-1,bw00zzz
4,S0004-06142005000500011-1,bw20


In [None]:
ids = df['Id'].unique()
codes = df['ICD10'].unique()  

print("Number of documents in training data:", len(ids), "\nNumber of ICD10 codes:", len(codes))

Number of documents in training data: 881 
Number of ICD10 codes: 870


In [None]:
code2idx = {}
for i in range(len(codes)):
  code2idx[codes[i]] = i

id2label = {}
for i in range(len(ids)):
  id2label[ids[i]] = [0]*len(codes)

for i, data in df.iterrows():
  _id = data[0]
  _code = data[1]
  id2label[_id][code2idx[_code]] = 1

_id2label = [(id, y) for id, y in id2label.items()]
ID, Y = zip(*_id2label)

In [None]:
X_train = []
Y_train = []

for id in (df_train['Id'].unique()):
  Y_train.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/train/text_files_en/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_train.append(remstopwords(text.lower(), stop_words))

In [None]:
X_val = []
Y_val = []

for id in (df_val['Id'].unique()):
  Y_val.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/dev/text_files_en/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_val.append(remstopwords(text.lower(), stop_words))

In [None]:
X_test = []
Y_test = []

for id in (df_test['Id'].unique()):
  Y_test.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/test/text_files_en/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_test.append(remstopwords(text.lower(), stop_words))

In [None]:
p_code = [0]*len(codes)
for label in Y_train:
  for i, code in enumerate(label):
    if (code == 1):
      p_code[i] = 1

not_present = 0
for i, present in enumerate(p_code):
  if (present == 0):
    not_present += 1

print("Number of classes NOT PRESENT in training dataset:", not_present)

Number of classes NOT PRESENT in training dataset: 307


# **P-subtask** *English*: **Training**

In [None]:
nb_clf = MultinomialNB()
sgd = SGDClassifier()
lr = LogisticRegression()
rf = RandomForestClassifier()
ada_clf = AdaBoostClassifier()
scv = SVC()

In [None]:
count_vect = CountVectorizer()
count_vect = count_vect.fit(X_train)

tfidf_transformer = TfidfVectorizer()
tfidf_transformer = tfidf_transformer.fit(X_train)

In [None]:
trainModels(count_vect.transform(X_train), Y_train, count_vect.transform(X_val), Y_val, count_vect.transform(X_test), Y_test)

--------------------------------------------------
Clf: MultinomialNB
	Validation:
		F1 score: 0.0015915119363395225
		mAP score: nan
		Hamming score: 0.023911411411411412
	Test:
		F1 score: 0.0015976906403403683
		mAP score: nan
		Hamming score: 0.031069302721088432
--------------------------------------------------
Clf: SGDClassifier
	Validation:
		F1 score: 0.014599759202892885
		mAP score: nan
		Hamming score: 0.08242885742885742
	Test:
		F1 score: 0.012471498088535204
		mAP score: nan
		Hamming score: 0.0886164430807288
--------------------------------------------------
Clf: RandomForestClassifier
	Validation:
		F1 score: 0.0003423820004891171
		mAP score: nan
		Hamming score: 0.0070195195195195195
	Test:
		F1 score: 0.00028149190710767064
		mAP score: nan
		Hamming score: 0.00606060606060606
--------------------------------------------------
Clf: SVC
	Validation:
		F1 score: 5.606952621250352e-05
		mAP score: nan
		Hamming score: 0.0009009009009009009
	Test:
		F1 score: 0.0003125

In [None]:
trainModels(tfidf_transformer.transform(X_train), Y_train, tfidf_transformer.transform(X_val), Y_val, tfidf_transformer.transform(X_test), Y_test)

--------------------------------------------------
Clf: MultinomialNB
	Validation:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
	Test:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
--------------------------------------------------
Clf: SGDClassifier
	Validation:
		F1 score: 0.006732858707165671
		mAP score: nan
		Hamming score: 0.06127913627913628
	Test:
		F1 score: 0.007730457189866584
		mAP score: nan
		Hamming score: 0.06059913548752834
--------------------------------------------------
Clf: RandomForestClassifier
	Validation:
		F1 score: 0.00016038492381716118
		mAP score: nan
		Hamming score: 0.0035285285285285286
	Test:
		F1 score: 0.00022316327216508703
		mAP score: nan
		Hamming score: 0.004464285714285714
--------------------------------------------------
Clf: SVC
	Validation:
		F1 score: 0.00016420361247947458
		mAP score: nan
		Hamming score: 0.0006435006435006435
	Test:
		F1 score: 0.00047466145469775235
		mAP score: nan
		Hamming score: 0.0038690476190476

# **D-subtask** *Spanish*: **Data Loader**

X_train, X_val, X_test: list of *input text data*

Y_train, Y_val, Y_test: list of one-hot encoded *labels*

---




In [None]:
df_train = pd.read_csv('gdrive/MyDrive/CodiEsp/train/trainD.tsv', sep = '\t', header = None)
df_train.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
print("Training Data:")
display(df_train.head())

print("\n\nValidation Data:")
df_val = pd.read_csv('gdrive/MyDrive/CodiEsp/dev/devD.tsv', sep = '\t', header = None)
df_val.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
display(df_val.head())

print("\n\nTest Data:")
df_test = pd.read_csv('gdrive/MyDrive/CodiEsp/test/testD.tsv', sep = '\t', header = None)
df_test.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
display(df_test.head())

df = pd.concat([df_train, df_val, df_test])

Training Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000700014-1,n44.8
1,S0004-06142005000700014-1,z20.818
2,S0004-06142005000700014-1,r60.9
3,S0004-06142005000700014-1,r52
4,S0004-06142005000700014-1,a23.9




Validation Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000900016-1,q62.11
1,S0004-06142005000900016-1,n28.89
2,S0004-06142005000900016-1,n39.0
3,S0004-06142005000900016-1,r31.9
4,S0004-06142005000900016-1,n23




Test Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000500011-1,s22.49xa
1,S0004-06142005000500011-1,n28.1
2,S0004-06142005000500011-1,r69
3,S0004-06142005000500011-1,f17.210
4,S0004-06142005000500011-1,r31.9


In [None]:
ids = df['Id'].unique()
codes = df['ICD10'].unique()  

print("Number of documents in training data:", len(ids), "\nNumber of ICD10 codes:", len(codes))

Number of documents in training data: 1000 
Number of ICD10 codes: 2557


In [None]:
code2idx = {}
for i in range(len(codes)):
  code2idx[codes[i]] = i

id2label = {}
for i in range(len(ids)):
  id2label[ids[i]] = [0]*len(codes)

for i, data in df.iterrows():
  _id = data[0]
  _code = data[1]
  id2label[_id][code2idx[_code]] = 1

_id2label = [(id, y) for id, y in id2label.items()]
ID, Y = zip(*_id2label)

In [None]:
def remstopwords(text, stopwords):
    text = re.sub('\[\*\*[^\]]*\*\*\]', '', text)
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', ' ', text.lower()) 
    text = re.sub(" \d+", " ", text)
    return " ".join([i for i in text.split() if i not in stopwords])

stop_words = stopwords.words('spanish')

In [None]:
X_train = []
Y_train = []

for id in (df_train['Id'].unique()):
  Y_train.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/train/text_files/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_train.append(remstopwords(text.lower(), stop_words))

In [None]:
X_val = []
Y_val = []

for id in (df_val['Id'].unique()):
  Y_val.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/dev/text_files/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_val.append(remstopwords(text.lower(), stop_words))

In [None]:
X_test = []
Y_test = []

for id in (df_test['Id'].unique()):
  Y_test.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/test/text_files/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_test.append(remstopwords(text.lower(), stop_words))

# **D-subtask** *Spanish*: **Training**

---

In [None]:
nb_clf = MultinomialNB()
sgd = SGDClassifier()
lr = LogisticRegression()
rf = RandomForestClassifier()
ada_clf = AdaBoostClassifier()
scv = SVC()

In [None]:
count_vect = CountVectorizer()
count_vect = count_vect.fit(X_train)

tfidf_transformer = TfidfVectorizer()
tfidf_transformer = tfidf_transformer.fit(X_train)

In [None]:
trainModels(count_vect.transform(X_train), Y_train, count_vect.transform(X_val), Y_val, count_vect.transform(X_test), Y_test)

--------------------------------------------------
Clf: MultinomialNB
	Validation:
		F1 score: 0.0008018276237230943
		mAP score: nan
		Hamming score: 0.01696272475030989
	Test:
		F1 score: 0.0012516830098704686
		mAP score: nan
		Hamming score: 0.020289490230435546
--------------------------------------------------
Clf: SGDClassifier
	Validation:
		F1 score: 0.017305163830058472
		mAP score: nan
		Hamming score: 0.08384784563567149
	Test:
		F1 score: 0.016058424328073993
		mAP score: nan
		Hamming score: 0.09401704346590617
--------------------------------------------------
Clf: RandomForestClassifier
	Validation:
		F1 score: 0.000375274113399689
		mAP score: nan
		Hamming score: 0.005827263353880751
	Test:
		F1 score: 0.0003444650596191675
		mAP score: nan
		Hamming score: 0.007013784535164633
--------------------------------------------------
Clf: SVC
	Validation:
		F1 score: 0.0001351015038930565
		mAP score: nan
		Hamming score: 0.004011029080998121
	Test:
		F1 score: 9.8802624748

In [None]:
trainModels(tfidf_transformer.transform(X_train), Y_train, tfidf_transformer.transform(X_val), Y_val, tfidf_transformer.transform(X_test), Y_test)

--------------------------------------------------
Clf: MultinomialNB
	Validation:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
	Test:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
--------------------------------------------------
Clf: SGDClassifier
	Validation:
		F1 score: 0.007970558310803804
		mAP score: nan
		Hamming score: 0.04720164647472697
	Test:
		F1 score: 0.010758851763046114
		mAP score: nan
		Hamming score: 0.06103736163529555
--------------------------------------------------
Clf: RandomForestClassifier
	Validation:
		F1 score: 0.00031819098281428616
		mAP score: nan
		Hamming score: 0.004955639422256819
	Test:
		F1 score: 0.00033698448018935364
		mAP score: nan
		Hamming score: 0.007583349937082978
--------------------------------------------------
Clf: SVC
	Validation:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
	Test:
		F1 score: 0.0002900877536037983
		mAP score: nan
		Hamming score: 0.0018
--------------------------------------------------

# **P-subtask** *Spanish*: **Data Loader**
---

In [None]:
df_train = pd.read_csv('gdrive/MyDrive/CodiEsp/train/trainP.tsv', sep = '\t', header = None)
df_train.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
print("Training Data:")
display(df_train.head())

print("\n\nValidation Data:")
df_val = pd.read_csv('gdrive/MyDrive/CodiEsp/dev/devP.tsv', sep = '\t', header = None)
df_val.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
display(df_val.head())

print("\n\nTest Data:")
df_test = pd.read_csv('gdrive/MyDrive/CodiEsp/test/testP.tsv', sep = '\t', header = None)
df_test.rename(columns = {0:"Id", 1:"ICD10"}, inplace = True)
display(df_test.head())

df = pd.concat([df_train, df_val, df_test])

Training Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000700014-1,bw03zzz
1,S0004-06142005000700014-1,3e02329
2,S0004-06142005000700014-1,bw40zzz
3,S0004-06142005000700014-1,bv44zzz
4,S0004-06142005000700014-1,bn20




Validation Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000900016-1,bt41zzz
1,S0004-06142005000900016-1,ct13
2,S0004-06142005001000011-1,3e1m39z
3,S0004-06142005001000011-1,0tcb
4,S0004-06142005001000011-1,bt02




Test Data:


Unnamed: 0,Id,ICD10
0,S0004-06142005000500011-1,0ttb
1,S0004-06142005000500011-1,bv49zzz
2,S0004-06142005000500011-1,0djdxzz
3,S0004-06142005000500011-1,bw00zzz
4,S0004-06142005000500011-1,bw20


In [None]:
ids = df['Id'].unique()
codes = df['ICD10'].unique()  

print("Number of documents in training data:", len(ids), "\nNumber of ICD10 codes:", len(codes))

Number of documents in training data: 881 
Number of ICD10 codes: 870


In [None]:
code2idx = {}
for i in range(len(codes)):
  code2idx[codes[i]] = i

id2label = {}
for i in range(len(ids)):
  id2label[ids[i]] = [0]*len(codes)

for i, data in df.iterrows():
  _id = data[0]
  _code = data[1]
  id2label[_id][code2idx[_code]] = 1

_id2label = [(id, y) for id, y in id2label.items()]
ID, Y = zip(*_id2label)

In [None]:
X_train = []
Y_train = []

for id in (df_train['Id'].unique()):
  Y_train.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/train/text_files/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_train.append(remstopwords(text.lower(), stop_words))

In [None]:
X_val = []
Y_val = []

for id in (df_val['Id'].unique()):
  Y_val.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/dev/text_files/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_val.append(remstopwords(text.lower(), stop_words))

In [None]:
X_test = []
Y_test = []

for id in (df_test['Id'].unique()):
  Y_test.append(id2label[id])

  with open('gdrive/MyDrive/CodiEsp/test/text_files/' + id + '.txt', 'r') as f:
    text = f.read().replace('\n', ' ')
  X_test.append(remstopwords(text.lower(), stop_words))

# **P-subtask** *English*: **Training**

In [None]:
nb_clf = MultinomialNB()
sgd = SGDClassifier()
lr = LogisticRegression()
rf = RandomForestClassifier()
ada_clf = AdaBoostClassifier()
scv = SVC()

In [None]:
count_vect = CountVectorizer()
count_vect = count_vect.fit(X_train)

tfidf_transformer = TfidfVectorizer()
tfidf_transformer = tfidf_transformer.fit(X_train)

In [None]:
trainModels(count_vect.transform(X_train), Y_train, count_vect.transform(X_val), Y_val, count_vect.transform(X_test), Y_test)

--------------------------------------------------
Clf: MultinomialNB
	Validation:
		F1 score: 0.0019212895886323877
		mAP score: nan
		Hamming score: 0.027327327327327327
	Test:
		F1 score: 0.0018639282610745157
		mAP score: nan
		Hamming score: 0.033939200680272105
--------------------------------------------------
Clf: SGDClassifier
	Validation:
		F1 score: 0.020408535534708217
		mAP score: nan
		Hamming score: 0.1132241007241007
	Test:
		F1 score: 0.01567188734149103
		mAP score: nan
		Hamming score: 0.12152460634603492
--------------------------------------------------
Clf: RandomForestClassifier
	Validation:
		F1 score: 0.00037319002836244214
		mAP score: nan
		Hamming score: 0.0059309309309309305
	Test:
		F1 score: 0.0006248426174509645
		mAP score: nan
		Hamming score: 0.006804653679653679
--------------------------------------------------
Clf: SVC
	Validation:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
	Test:
		F1 score: 0.0003125630167372454
		mAP score: nan
		Hamm

In [None]:
count_vect = CountVectorizer()
count_vect = count_vect.fit(X_train)

tfidf_transformer = TfidfVectorizer()
tfidf_transformer = tfidf_transformer.fit(X_train)

In [None]:
trainModels(tfidf_transformer.transform(X_train), Y_train, tfidf_transformer.transform(X_val), Y_val, tfidf_transformer.transform(X_test), Y_test)

--------------------------------------------------
Clf: MultinomialNB
	Validation:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
	Test:
		F1 score: 0.0
		mAP score: nan
		Hamming score: 0.0
--------------------------------------------------
Clf: SGDClassifier
	Validation:
		F1 score: 0.007459781767708449
		mAP score: nan
		Hamming score: 0.05617939367939367
	Test:
		F1 score: 0.0062766071144235
		mAP score: nan
		Hamming score: 0.06363910147392289
--------------------------------------------------
Clf: RandomForestClassifier
	Validation:
		F1 score: 0.00016038492381716118
		mAP score: nan
		Hamming score: 0.003153153153153153
	Test:
		F1 score: 0.00036045573704375885
		mAP score: nan
		Hamming score: 0.005582289734075448
--------------------------------------------------
Clf: SVC
	Validation:
		F1 score: 0.00016420361247947458
		mAP score: nan
		Hamming score: 0.0006435006435006435
	Test:
		F1 score: 0.0003125630167372454
		mAP score: nan
		Hamming score: 0.002976190476190476
-

# **XGBoost**

---



In [None]:
count_vect = CountVectorizer()
count_vect = count_vect.fit(X_train)

In [None]:
from xgboost import XGBClassifier

In [None]:
count_vect = CountVectorizer()
count_vect = count_vect.fit(X_train)

In [None]:
classifier = XGBClassifier(subsample = 0.9)

In [None]:
clf = OneVsRestClassifier(classifier)
clf.fit(count_vect.transform(X_train), Y_train)

print('\tValidation:')
y_pred = clf.predict(count_vect.transform(X_val))
print_score(np.array(Y_val), np.array(y_pred), classifier)

print('\tTest:')
y_pred = clf.predict(count_vect.transform(X_test))
print_score(np.array(Y_test), np.array(y_pred), classifier)

	Validation:
		F1 score: 0.019632752284986538
		mAP score: nan
		Hamming score: 0.20494752994752993
	Test:
		F1 score: 0.02055017298040678
		mAP score: nan
		Hamming score: 0.19356640383426096
