In [9]:
import pandas as pd
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import numpy as np
import matplotlib.pyplot as plt

from scipy import sparse
from scipy.sparse.linalg import svds

from sklearn.metrics.pairwise import cosine_similarity

from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize


from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

import nltk
nltk.download('stopwords')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\upadh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
dfbert = pd.read_csv('../data/df_bert.csv').dropna().reset_index()

In [5]:
dfbert.columns

Index(['index', 'subject_id', 'hadm_id', 'discharge_instruction', 'icd_E11',
       'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17',
       'icd_Y92', 'icd_Z85'],
      dtype='object')

In [11]:
def Accuracy(y_true, y_pred):

    temp = 0

    for i in range(y_true.shape[0]):
        temp += sum(np.logical_and(y_true[i], y_pred[i])) / sum(np.logical_or(y_true[i], y_pred[i]))
    return temp / y_true.shape[0]

In [12]:
def Hamming_Loss(y_true, y_pred):
    temp=0
    for i in range(y_true.shape[0]):
        temp += np.size(y_true[i] == y_pred[i]) - np.count_nonzero(y_true[i] == y_pred[i])
    return temp/(y_true.shape[0] * y_true.shape[1])

In [6]:
categories = ['icd_E11', 'icd_E78','icd_E87', 'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92','icd_Z85']



train, test = train_test_split(dfbert, random_state=42, test_size=0.33, shuffle=True)
X_train = train.discharge_instruction
X_test = test.discharge_instruction
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(175298,)
(86341,)


**Tfidf MultinomialNB**

In [10]:
# traing classifier

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

var_pre = {}


for category in categories:
    print('... Processing {}'.format(category))
    
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    var_pre[category] = prediction
    
    
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.7000845484763901
... Processing icd_E78
Test accuracy is 0.6099303922817665
... Processing icd_E87
Test accuracy is 0.7887330468722855
... Processing icd_F32
Test accuracy is 0.7971878945112982
... Processing icd_I16
Test accuracy is 0.6976754959984248
... Processing icd_I50
Test accuracy is 0.8367056207363824
... Processing icd_N17
Test accuracy is 0.8132173590762211
... Processing icd_Y92
Test accuracy is 0.817491110827996
... Processing icd_Z85
Test accuracy is 0.8066967026094208
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        0        0        0        0        0        0        0   
1            0        1        0        0        1        0        0        0   
2            0        1        0        0        0        0        0        0   
3            0        1        0        0        0        0        0        0   
4            0        0        0        0        0        0      

In [14]:
y_true = y_test.to_numpy()
y_pred1 = nb_pred.to_numpy()

In [15]:
print(y_test.columns, ((abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0))).sum(axis=1)/len(y_test))*100)
print('Accuracy Score with the discharge_instruction as variable is for multiNB',  Accuracy(y_true, y_pred1))
print('Hamming_loss with the discharge_instruction as variable is for multiNB', Hamming_Loss(y_true, y_pred1))

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') [[29.99154515 39.00696077 21.12669531 20.28121055 30.2324504  16.32943793
  18.67826409 18.25088892 19.33032974]]
Accuracy Score with the discharge_instruction as variable is for multiNB 0.15303936054217776
Hamming_loss with the discharge_instruction as variable is for multiNB 0.23691975873442384


**Tfidf LinearSVC**

In [16]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
var_pre = {}

for category in categories:
    print('... Processing {}'.format(category))
   
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    var_pre[category] = prediction
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
svc_pred = pd.DataFrame(var_pre)

print(svc_pred)

... Processing icd_E11
Test accuracy is 0.7025978388019597
... Processing icd_E78
Test accuracy is 0.5999698868440254
... Processing icd_E87
Test accuracy is 0.8002455380410234
... Processing icd_F32
Test accuracy is 0.7948367519486687
... Processing icd_I16
Test accuracy is 0.7097555043374527
... Processing icd_I50
Test accuracy is 0.8547503503549878
... Processing icd_N17
Test accuracy is 0.839218911061952
... Processing icd_Y92
Test accuracy is 0.825482679144323
... Processing icd_Z85
Test accuracy is 0.8028862301803315
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        0        0        0        1        0        0        0   
1            0        1        0        0        1        0        0        0   
2            0        1        0        0        0        0        0        0   
3            0        0        0        0        0        1        0        0   
4            0        0        0        1        1        0       

In [17]:
y_true = y_test.to_numpy()
y_pred2 = svc_pred.to_numpy()

In [18]:
print(y_test.columns, ((abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0))).sum(axis=1)/len(y_test))*100)
print('Accuracy Score with the discharge_instruction as variable is for lineraSVC',  Accuracy(y_true, y_pred2))
print('Hamming_loss with the discharge_instruction as variable is for linearSVC', Hamming_Loss(y_true, y_pred2))

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') [[29.99154515 39.00696077 21.12669531 20.28121055 30.2324504  16.32943793
  18.67826409 18.25088892 19.33032974]]
Accuracy Score with the discharge_instruction as variable is for lineraSVC 0.28010256673290657
Hamming_loss with the discharge_instruction as variable is for linearSVC 0.23002847880947508


**Tfidf LogReg**

In [19]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])

var_pre = {}

for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    var_pre[category] = prediction
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

logreg_pred = pd.DataFrame(var_pre)

print(logreg_pred)

... Processing icd_E11
Test accuracy is 0.7141798218691005
... Processing icd_E78
Test accuracy is 0.6127795601162831
... Processing icd_E87
Test accuracy is 0.8047625114372082
... Processing icd_F32
Test accuracy is 0.7990410118020408
... Processing icd_I16
Test accuracy is 0.7132764271898634
... Processing icd_I50
Test accuracy is 0.8577500839693772
... Processing icd_N17
Test accuracy is 0.8436432285935997
... Processing icd_Y92
Test accuracy is 0.8281002073174969
... Processing icd_Z85
Test accuracy is 0.8067082845924879
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        0        0        0        0        0        0        0   
1            0        1        0        0        1        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        1        0        0        0        1        0        0   
4            0        0        0        0        1        0     

In [20]:
y_true = y_test.to_numpy()
y_pred3 = logreg_pred.to_numpy()

In [21]:
print(y_test.columns, ((abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0))).sum(axis=1)/len(y_test))*100)
print('Accuracy Score with the discharge_instruction as variable is for logReg',  Accuracy(y_true, y_pred3))
print('Hamming_loss with the discharge_instruction as variable is for logReg', Hamming_Loss(y_true, y_pred3))

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') [[29.99154515 39.00696077 21.12669531 20.28121055 30.2324504  16.32943793
  18.67826409 18.25088892 19.33032974]]
Accuracy Score with the discharge_instruction as variable is for logReg 0.2660085986113052
Hamming_loss with the discharge_instruction as variable is for logReg 0.2244176514569491


**Countvectorizer multiNB**

In [24]:
# traing classifier

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('vect', CountVectorizer(stop_words=stopwords.words('english'))),
#                 ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

var_pre = {}


for category in categories:
    print('... Processing {}'.format(category))
    
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    var_pre[category] = prediction
    
    
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.6273844407639476
... Processing icd_E78
Test accuracy is 0.6095134408913494
... Processing icd_E87
Test accuracy is 0.6069769865996456
... Processing icd_F32
Test accuracy is 0.682630499994209
... Processing icd_I16
Test accuracy is 0.6931237766530385
... Processing icd_I50
Test accuracy is 0.7073580338425545
... Processing icd_N17
Test accuracy is 0.6114592140466291
... Processing icd_Y92
Test accuracy is 0.7820734066086795
... Processing icd_Z85
Test accuracy is 0.7524814398721349
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            1        0        1        0        0        0        0        0   
1            0        1        0        0        1        0        0        0   
2            0        0        1        1        0        1        1        1   
3            1        1        1        0        0        1        1        0   
4            0        0        1        1        0        0      

In [25]:
y_true = y_test.to_numpy()
y_pred4 = nb_pred.to_numpy()

In [27]:
print(y_test.columns, ((abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0))).sum(axis=1)/len(y_test))*100)
print('Accuracy Score with the discharge_instruction as variable is for countvec multiNB',  Accuracy(y_true, y_pred4))
print('Hamming_loss with the discharge_instruction as variable is for countvec multiNB', Hamming_Loss(y_true, y_pred4))

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') [[37.26155592 39.04865591 39.30230134 31.73695    30.68762233 29.26419662
  38.8540786  21.79265934 24.75185601]]
Accuracy Score with the discharge_instruction as variable is for multiNB 0.30559664861534935
Hamming_loss with the discharge_instruction as variable is for multiNB 0.3252220845253124


**Countvectorizer linerSVC**

In [28]:
SVC_pipeline = Pipeline([
                ('vect', CountVectorizer(stop_words=stopwords.words('english'))),
#                 ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
var_pre = {}

for category in categories:
    print('... Processing {}'.format(category))
   
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    var_pre[category] = prediction
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
svc_pred = pd.DataFrame(var_pre)

print(svc_pred)

... Processing icd_E11




Test accuracy is 0.6748705713392247
... Processing icd_E78




Test accuracy is 0.583060191566
... Processing icd_E87




Test accuracy is 0.7753906023789393
... Processing icd_F32




Test accuracy is 0.7658470483316153
... Processing icd_I16




Test accuracy is 0.6921856360246001
... Processing icd_I50




Test accuracy is 0.833045714087166
... Processing icd_N17




Test accuracy is 0.8162055107075433
... Processing icd_Y92




Test accuracy is 0.8009404570250518
... Processing icd_Z85




Test accuracy is 0.7694722090316304
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        0        1        1        1        0        0        0   
1            0        1        0        0        1        0        0        0   
2            0        1        0        0        0        0        0        0   
3            0        0        0        0        1        1        1        0   
4            0        0        0        1        0        0        0        0   
...        ...      ...      ...      ...      ...      ...      ...      ...   
86336        0        0        0        1        0        0        0        0   
86337        0        0        0        0        0        0        0        0   
86338        0        0        0        0        0        1        0        0   
86339        0        0        0        0        0        0        0        0   
86340        0        0        0        0        0        0        1     

In [30]:
y_true = y_test.to_numpy()
y_pred5 = svc_pred.to_numpy()

In [34]:
print(y_test.columns, ((abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0))).sum(axis=1)/len(y_test))*100)
print('Accuracy Score with the discharge_instruction as variable is for countvect linearSVC',  Accuracy(y_true, y_pred5))
print('Hamming_loss with the discharge_instruction as variable is for countvect linearSVC', Hamming_Loss(y_true, y_pred5))

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') [[37.26155592 39.04865591 39.30230134 31.73695    30.68762233 29.26419662
  38.8540786  21.79265934 24.75185601]]
Accuracy Score with the discharge_instruction as variable is for countvect linearSVC 0.26893652852670996
Hamming_loss with the discharge_instruction as variable is for countvect linearSVC 0.2543313399453588


**Countvectorizer logReg**

In [33]:
LogReg_pipeline = Pipeline([
                ('vect', CountVectorizer(stop_words=stopwords.words('english'))),
#                 ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])

var_pre = {}

for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    var_pre[category] = prediction
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

logreg_pred = pd.DataFrame(var_pre)

print(logreg_pred)

... Processing icd_E11




Test accuracy is 0.7079487149789787
... Processing icd_E78




Test accuracy is 0.6053091810379774
... Processing icd_E87




Test accuracy is 0.8014269003138718
... Processing icd_F32




Test accuracy is 0.7950220636777429
... Processing icd_I16




Test accuracy is 0.710195619694004
... Processing icd_I50




Test accuracy is 0.855757982881829
... Processing icd_N17




Test accuracy is 0.840539257131606
... Processing icd_Y92




Test accuracy is 0.8234442501245063
... Processing icd_Z85




Test accuracy is 0.8034074194183528
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        0        0        0        1        0        0        0   
1            0        1        0        0        1        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        1        0        0        0        1        1        0   
4            0        0        0        0        1        0        0        0   
...        ...      ...      ...      ...      ...      ...      ...      ...   
86336        0        0        0        1        0        0        0        0   
86337        0        0        0        0        0        0        0        0   
86338        0        0        0        0        0        1        0        0   
86339        0        0        0        0        0        0        0        0   
86340        0        0        0        0        0        0        1     

In [35]:
y_true = y_test.to_numpy()
y_pred6 = logreg_pred.to_numpy()

In [36]:
print(y_test.columns, ((abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0))).sum(axis=1)/len(y_test))*100)
print('Accuracy Score with the discharge_instruction as variable is for countvect logReg',  Accuracy(y_true, y_pred6))
print('Hamming_loss with the discharge_instruction as variable is for countvect logReg', Hamming_Loss(y_true, y_pred6))

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') [[37.26155592 39.04865591 39.30230134 31.73695    30.68762233 29.26419662
  38.8540786  21.79265934 24.75185601]]
Accuracy Score with the discharge_instruction as variable is for countvect logReg 0.25862572325348404
Hamming_loss with the discharge_instruction as variable is for countvect logReg 0.22854984563790345
