In [1]:
import pandas as pd
from collections import Counter
import re
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


import glob
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse
from scipy.sparse.linalg import svds


from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

import nltk
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\upadh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\upadh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df4model = pd.read_csv('../data/df4model.csv')

In [3]:
categories = ['icd_E11', 'icd_E78','icd_E87', 'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92','icd_Z85']

variables = [ 'history_present_illness', 'past_medcal_history', 'social_history', 'family_history',
       'physical_exam', 'pertinent_results', 'hospital_course',
       'medication_on_admission', 'medication_on_discharge',
       'discharge_disposition', 'discharge_diagnosis', 'discharge_condition',
       'discharge_instruction'] 

df4model['dis_mer'] = (df4model['discharge_disposition']+ df4model['discharge_diagnosis']
                        + df4model['discharge_condition']+ df4model['discharge_instruction']) #these are object do we need to use astype(str)

train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.discharge_instruction.fillna(' ')
X_test = test.discharge_instruction.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [4]:
# traing classifier

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

var_pre = {}


for category in categories:
    print('... Processing {}'.format(category))
    
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    var_pre[category] = prediction
    
    
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.7027359042950139
... Processing icd_E78
Test accuracy is 0.609970034787201
... Processing icd_E87
Test accuracy is 0.7906568236874433
... Processing icd_F32
Test accuracy is 0.7970861413761036
... Processing icd_I16
Test accuracy is 0.6978794732551865
... Processing icd_I50
Test accuracy is 0.8417584183878486
... Processing icd_N17
Test accuracy is 0.8223786179263154
... Processing icd_Y92
Test accuracy is 0.8178780955442532
... Processing icd_Z85
Test accuracy is 0.8020459007359273
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        1        0        0        0        0        0        0   
1            0        0        0        0        1        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        0        0        0        0        0        1        0   
4            0        0        0        0        0        0      

In [5]:
y_test.head(5)

Unnamed: 0,icd_E11,icd_E78,icd_E87,icd_F32,icd_I16,icd_I50,icd_N17,icd_Y92,icd_Z85
4039,0,1,0,0,1,1,0,0,0
196957,0,0,0,1,1,0,0,1,0
68719,1,1,0,0,1,0,0,0,0
18633,0,1,0,0,0,1,1,0,0
92713,1,0,0,0,0,0,0,0,0


In [6]:
nb_pred.head(5)

Unnamed: 0,icd_E11,icd_E78,icd_E87,icd_F32,icd_I16,icd_I50,icd_N17,icd_Y92,icd_Z85
0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0


In [7]:
y_true = y_test.to_numpy()
y_pred = nb_pred.to_numpy()

In [8]:
def Accuracy(y_true, y_pred):

    temp = 0

    for i in range(y_true.shape[0]):
        temp += sum(np.logical_and(y_true[i], y_pred[i])) / sum(np.logical_or(y_true[i], y_pred[i]))
    return temp / y_true.shape[0]
    
print('Accuracy Score with the discharge_instruction as variable is',  Accuracy(y_true, y_pred))

Accuracy Score with the discharge_instruction as variable is 0.17546106390265603


In [9]:
def Hamming_Loss(y_true, y_pred):
    temp=0
    for i in range(y_true.shape[0]):
        temp += np.size(y_true[i] == y_pred[i]) - np.count_nonzero(y_true[i] == y_pred[i])
    return temp/(y_true.shape[0] * y_true.shape[1])
    
print('Hamming_loss with the discharge_instruction as variable is', Hamming_Loss(y_true, y_pred))

Hamming_loss with the discharge_instruction as variable is 0.23529006555607856


**The scores changed upon cleaning the strings for the discharge_instruction 
from a accuracy score of 0.1901903837963846 to Accuracy Score with the discharge_instruction as variable is 0.17546106390265603

Hamming_loss with the discharge_instruction as variable is 0.23433714882722356 to Hamming_loss with the discharge_instruction as variable is 0.23529006555607856**

In [10]:
(abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0)).sum(axis=1)/len(y_test))*100

array([[29.72640957, 39.00299652, 20.93431763, 20.29138586, 30.21205267,
        15.82415816, 17.76213821, 18.21219045, 19.79540993]])

In [11]:
print(y_test.columns, ((abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0))).sum(axis=1)/len(y_test))*100)

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') [[29.72640957 39.00299652 20.93431763 20.29138586 30.21205267 15.82415816
  17.76213821 18.21219045 19.79540993]]


In [23]:
np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0).shape


(1, 87101, 9)

In [37]:
((abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0)).sum(axis=2))/9).mean()

((abs(np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0)).sum(axis=2))/9)

#Overall 23.5% times the ICD is wrongly predicted getting from the mean

array([[22.22222222, 22.22222222, 33.33333333, ..., 11.11111111,
        55.55555556, 11.11111111]])

In [38]:
# going to use the dis_mer as the variable now
train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.dis_mer.fillna(' ')
X_test = test.dis_mer.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [39]:
# traing classifier

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

var_pre = {}


for category in categories:
    print('... Processing {}'.format(category))
    
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    var_pre[category] = prediction
    
    
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.7010137656284084
... Processing icd_E78
Test accuracy is 0.6127254566537698
... Processing icd_E87
Test accuracy is 0.7843537961676674
... Processing icd_F32
Test accuracy is 0.7977520349938577
... Processing icd_I16
Test accuracy is 0.6913009035487537
... Processing icd_I50
Test accuracy is 0.8334117863170343
... Processing icd_N17
Test accuracy is 0.8112421212156002
... Processing icd_Y92
Test accuracy is 0.8167300030998496
... Processing icd_Z85
Test accuracy is 0.8031480694825547
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        1        0        0        0        0        0        0   
1            0        0        0        0        1        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        0        0        0        0        0        0        0   
4            0        0        0        0        0        0     

In [41]:
y_pred1 = nb_pred.to_numpy()

In [47]:
    
print('Accuracy Score with the discharge_merged as variable is',  Accuracy(y_true, y_pred1))
print('Hamming_loss with the discharge_merged as variable is', Hamming_Loss(y_true, y_pred1))

Accuracy Score with the discharge_merged as variable is 0.12950424001474284
Hamming_loss with the discharge_merged as variable is 0.23870245143250046


In [44]:
# going to use the family_history as the variable now
train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.family_history.fillna(' ')
X_test = test.family_history.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [45]:
# traing classifier

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

var_pre = {}


for category in categories:
    print('... Processing {}'.format(category))
    
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    var_pre[category] = prediction
    
    
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.6971791368641003
... Processing icd_E78
Test accuracy is 0.6054121077829187
... Processing icd_E87
Test accuracy is 0.7741587352613633
... Processing icd_F32
Test accuracy is 0.7991986314738063
... Processing icd_I16
Test accuracy is 0.6873744273888933
... Processing icd_I50
Test accuracy is 0.7932515126117955
... Processing icd_N17
Test accuracy is 0.8030906648603345
... Processing icd_Y92
Test accuracy is 0.8025625423359088
... Processing icd_Z85
Test accuracy is 0.8044454139447308
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        1        0        0        0        0        0        0   
1            0        0        0        0        0        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        1        0        0        0        0        0        0   
4            0        0        0        0        0        0     

In [46]:
y_pred2 = nb_pred.to_numpy()

In [48]:
print('Accuracy Score with the family_history as variable is',  Accuracy(y_true, y_pred2))
print('Hamming_loss with the family_history as variable is', Hamming_Loss(y_true, y_pred2))

Accuracy Score with the family_history as variable is 0.10321772042417741
Hamming_loss with the family_history as variable is 0.2481474252751276


In [49]:
df4model.columns

Index(['subject_id', 'hadm_id', 'icd', 'long_title', 'chief_complaint',
       'surgical_invasive_procedure', 'history_present_illness',
       'past_medcal_history', 'social_history', 'family_history',
       'physical_exam', 'pertinent_results', 'hospital_course',
       'medication_on_admission', 'medication_on_discharge',
       'discharge_disposition', 'discharge_diagnosis', 'discharge_condition',
       'discharge_instruction', 'followup_instruction', 'icd_E11', 'icd_E78',
       'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92',
       'icd_Z85', 'dis_mer'],
      dtype='object')

In [50]:
# going to use the hopital_course as the variable now
train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.hospital_course.fillna(' ')
X_test = test.hospital_course.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [51]:
# traing classifier

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

var_pre = {}


for category in categories:
    print('... Processing {}'.format(category))
    
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    var_pre[category] = prediction
    
    
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.7113810404013731
... Processing icd_E78
Test accuracy is 0.626112214555516
... Processing icd_E87
Test accuracy is 0.7851344990298619
... Processing icd_F32
Test accuracy is 0.7987853181938209
... Processing icd_I16
Test accuracy is 0.686008197380053
... Processing icd_I50
Test accuracy is 0.8270743160239262
... Processing icd_N17
Test accuracy is 0.8089803791001251
... Processing icd_Y92
Test accuracy is 0.8141008714021652
... Processing icd_Z85
Test accuracy is 0.8033317642736593
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        1        0        0        1        1        0        0   
1            0        0        0        0        0        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        0        0        0        0        0        0        0   
4            0        0        0        0        0        0       

In [52]:
y_pred3 = nb_pred.to_numpy()

In [54]:
print('Accuracy Score with the hospital_course as variable is',  Accuracy(y_true, y_pred3))
print('Hamming_loss with the hospital_course as variable is', Hamming_Loss(y_true, y_pred3))

Accuracy Score with the hospital_course as variable is 0.11948653205556205
Hamming_loss with the hospital_course as variable is 0.23767682218216654


In [55]:
# going to use the pertinent_results as the variable now
train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.pertinent_results.fillna(' ')
X_test = test.pertinent_results.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [56]:
# traing classifier

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

var_pre = {}


for category in categories:
    print('... Processing {}'.format(category))
    
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    var_pre[category] = prediction
    
    
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.6863526251133741
... Processing icd_E78
Test accuracy is 0.5777545607972354
... Processing icd_E87
Test accuracy is 0.7778326310834548
... Processing icd_F32
Test accuracy is 0.7975913020516412
... Processing icd_I16
Test accuracy is 0.6873629464644493
... Processing icd_I50
Test accuracy is 0.7906912664607755
... Processing icd_N17
Test accuracy is 0.804192833606962
... Processing icd_Y92
Test accuracy is 0.8039287723447492
... Processing icd_Z85
Test accuracy is 0.8031365885581107
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        1        0        0        0        0        0        0   
1            0        0        0        0        0        0        0        0   
2            0        1        0        0        0        0        0        0   
3            0        1        0        0        0        0        0        0   
4            0        0        0        0        0        0      

In [58]:
y_pred4 = nb_pred.to_numpy()

print('Accuracy Score with the pertinent_results as variable is',  Accuracy(y_true, y_pred4))
print('Hamming_loss with the pertinent_results as variable is', Hamming_Loss(y_true, y_pred4))


Accuracy Score with the pertinent_results as variable is 0.04783167707330557
Hamming_loss with the pertinent_results as variable is 0.2523507192799164


In [59]:
# going to use the history_present_illness as the variable now
train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.history_present_illness.fillna(' ')
X_test = test.history_present_illness.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [60]:
# traing classifier

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

var_pre = {}


for category in categories:
    print('... Processing {}'.format(category))
    
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    var_pre[category] = prediction
    
    
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.6927819428020344
... Processing icd_E78
Test accuracy is 0.6250559695066646
... Processing icd_E87
Test accuracy is 0.7756856982124201
... Processing icd_F32
Test accuracy is 0.7980620199538467
... Processing icd_I16
Test accuracy is 0.6856293268733998
... Processing icd_I50
Test accuracy is 0.8001630291271054
... Processing icd_N17
Test accuracy is 0.8039861769669694
... Processing icd_Y92
Test accuracy is 0.8075797063179527
... Processing icd_Z85
Test accuracy is 0.803882848646973
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        1        0        0        0        0        0        0   
1            0        0        0        0        0        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        0        0        0        0        0        0        0   
4            0        0        0        0        0        0      

In [61]:
y_pred5 = nb_pred.to_numpy()

print('Accuracy Score with the history_present_illness as variable is',  Accuracy(y_true, y_pred5))
print('Hamming_loss with the history_present_illness as variable is', Hamming_Loss(y_true, y_pred5))


Accuracy Score with the history_present_illness as variable is 0.09051839654128252
Hamming_loss with the history_present_illness as variable is 0.24524147573251487


In [None]:
**For the NB none of the columns are performing 

In [68]:
print(y_test.columns, 'discharge_instruction', ((abs(np.diff([y_true, y_pred], axis=0))).sum(axis=1)/len(y_test))*100)

print ( 'percent of wrongly predicted for discharge_instruction',((abs(np.diff([y_true, y_pred], axis=0)).sum(axis=2))/9).mean())

((abs(np.diff([y_true, y_pred], axis=0)).sum(axis=2))/9)
print('-------------------------------------------------------------------------------------------------------------------')

print(y_test.columns, 'dis_mer', ((abs(np.diff([y_true, y_pred1], axis=0))).sum(axis=1)/len(y_test))*100)

print ( 'percent of wrongly predicted for dis_mer',((abs(np.diff([y_true, y_pred1], axis=0)).sum(axis=2))/9).mean())

((abs(np.diff([y_true, y_pred2], axis=0)).sum(axis=2))/9)
print('-------------------------------------------------------------------------------------------------------------------')

print(y_test.columns, 'family_history', ((abs(np.diff([y_true, y_pred2], axis=0))).sum(axis=1)/len(y_test))*100)

print ( 'percent of wrongly predicted for family_history',((abs(np.diff([y_true, y_pred2], axis=0)).sum(axis=2))/9).mean())

((abs(np.diff([y_true, y_pred2], axis=0)).sum(axis=2))/9)
print('-------------------------------------------------------------------------------------------------------------------')

print(y_test.columns, 'hospital_course', ((abs(np.diff([y_true, y_pred3], axis=0))).sum(axis=1)/len(y_test))*100)

print ( 'percent of wrongly predicted for hospital_course',((abs(np.diff([y_true, y_pred3], axis=0)).sum(axis=2))/9).mean())

((abs(np.diff([y_true, y_pred2], axis=0)).sum(axis=2))/9)
print('-------------------------------------------------------------------------------------------------------------------')

print(y_test.columns, 'pertinent_results', ((abs(np.diff([y_true, y_pred4], axis=0))).sum(axis=1)/len(y_test))*100)

print ( 'percent of wrongly predicted for pertinent_results',((abs(np.diff([y_true, y_pred4], axis=0)).sum(axis=2))/9).mean())

((abs(np.diff([y_true, y_pred2], axis=0)).sum(axis=2))/9)
print('-------------------------------------------------------------------------------------------------------------------')
print(y_test.columns, 'history_present_illness', ((abs(np.diff([y_true, y_pred5], axis=0))).sum(axis=1)/len(y_test))*100)

print ( 'percent of wrongly predicted for history_present_illness',((abs(np.diff([y_true, y_pred5], axis=0)).sum(axis=2))/9).mean())

((abs(np.diff([y_true, y_pred2], axis=0)).sum(axis=2))/9)
print('-------------------------------------------------------------------------------------------------------------------')

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') discharge_instruction [[29.72640957 39.00299652 20.93431763 20.29138586 30.21205267 15.82415816
  17.76213821 18.21219045 19.79540993]]
percent of wrongly predicted for discharge_instruction 0.23529006555607862
-------------------------------------------------------------------------------------------------------------------
Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') dis_mer [[29.89862344 38.72745433 21.56462038 20.2247965  30.86990965 16.65882137
  18.87578788 18.32699969 19.68519305]]
percent of wrongly predicted for dis_mer 0.23870245143250046
-------------------------------------------------------------------------------------------------------------------
Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 

In [103]:
dis_inst= (((abs(np.diff([y_true, y_pred], axis=0))).sum(axis=1)/len(y_test))*100).flatten()
dis_merged= (((abs(np.diff([y_true, y_pred1], axis=0))).sum(axis=1)/len(y_test))*100).flatten()
fam_his= (((abs(np.diff([y_true, y_pred2], axis=0))).sum(axis=1)/len(y_test))*100).flatten()
hos_cour= (((abs(np.diff([y_true, y_pred3], axis=0))).sum(axis=1)/len(y_test))*100).flatten()
per_res= (((abs(np.diff([y_true, y_pred4], axis=0))).sum(axis=1)/len(y_test))*100).flatten()
his_ill= (((abs(np.diff([y_true, y_pred5], axis=0))).sum(axis=1)/len(y_test))*100).flatten()
dis_inst.shape

(9,)

In [76]:
keys = y_test.columns.to_list()

In [105]:
dic_x= {'icd':keys, 
        'discharge_instruction(%wrong_pred)':dis_inst,
       'discharged_merged(%wrong_pred)': dis_merged,
       'family_history(%wrong_pred)':fam_his,
       'hospital_course(%wrong_pred)':hos_cour,
       'pertinent_result(%wrong_pred)':per_res,
       'history_of_present_illness(%wrong_pres)':his_ill}
        
dic_x
pd.DataFrame.from_dict(dic_x)

Unnamed: 0,icd,discharge_instruction(%wrong_pred),discharged_merged(%wrong_pred),family_history(%wrong_pred),hospital_course(%wrong_pred),pertinent_result(%wrong_pred),history_of_present_illness(%wrong_pres)
0,icd_E11,29.72641,29.898623,30.282086,28.861896,31.364737,30.721806
1,icd_E78,39.002997,38.727454,39.458789,37.388779,42.224544,37.494403
2,icd_E87,20.934318,21.56462,22.584126,21.48655,22.216737,22.43143
3,icd_F32,20.291386,20.224797,20.080137,20.121468,20.24087,20.193798
4,icd_I16,30.212053,30.86991,31.262557,31.39918,31.263705,31.437067
5,icd_I50,15.824158,16.658821,20.674849,17.292568,20.930873,19.983697
6,icd_N17,17.762138,18.875788,19.690934,19.101962,19.580717,19.601382
7,icd_Y92,18.21219,18.327,19.743746,18.589913,19.607123,19.242029
8,icd_Z85,19.79541,19.685193,19.555459,19.666824,19.686341,19.611715


In [109]:
categories = ['icd_E11', 'icd_E78','icd_E87', 'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92','icd_Z85']

# variables = [ 'history_present_illness', 'past_medcal_history', 'social_history', 'family_history',
#        'physical_exam', 'pertinent_results', 'hospital_course',
#        'medication_on_admission', 'medication_on_discharge',
#        'discharge_disposition', 'discharge_diagnosis', 'discharge_condition',
#        'discharge_instruction'] 

df4model['combined'] = (df4model['discharge_disposition']+ df4model['discharge_diagnosis']
                        + df4model['discharge_condition']+ df4model['discharge_instruction']
                     +df4model['history_present_illness'] +df4model['past_medcal_history']
                        +df4model['family_history']+df4model['physical_exam']
                   + df4model['pertinent_results']+  df4model['hospital_course']
                   +df4model['medication_on_admission']+df4model['medication_on_discharge']) 

train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.combined.fillna(' ')
X_test = test.combined.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [110]:
# traing classifier

# Define a pipeline combining a text feature extractor with multi lable classifier
NB_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(MultinomialNB(
                    fit_prior=True, class_prior=None))),
            ])

var_pre = {}


for category in categories:
    print('... Processing {}'.format(category))
    
    NB_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    var_pre[category] = prediction
    
    
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.6844812344289962
... Processing icd_E78
Test accuracy is 0.6022433726363646
... Processing icd_E87
Test accuracy is 0.7749049953502256
... Processing icd_F32
Test accuracy is 0.7975568592783091
... Processing icd_I16
Test accuracy is 0.6853078609889668
... Processing icd_I50
Test accuracy is 0.7865581336609224
... Processing icd_N17
Test accuracy is 0.8039746960425254
... Processing icd_Y92
Test accuracy is 0.8037795203269767
... Processing icd_Z85
Test accuracy is 0.803216955029219
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        1        0        0        0        0        0        0   
1            0        0        0        0        0        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        0        0        0        0        0        0        0   
4            0        0        0        0        0        0      

In [111]:
y_pred6 = nb_pred.to_numpy()

print('Accuracy Score with the all variable combined is',  Accuracy(y_true, y_pred6))
print('Hamming_loss with the all variable combined is', Hamming_Loss(y_true, y_pred6))


Accuracy Score with the all variable combined is 0.04634471967682954
Hamming_loss with the all variable combined is 0.250886263584166


#as all the variables are low perfringfor the Hamming_loss the best is the discharge instruction we will use that for other models

In [112]:
categories = ['icd_E11', 'icd_E78','icd_E87', 'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92','icd_Z85']

# variables = [ 'history_present_illness', 'past_medcal_history', 'social_history', 'family_history',
#        'physical_exam', 'pertinent_results', 'hospital_course',
#        'medication_on_admission', 'medication_on_discharge',
#        'discharge_disposition', 'discharge_diagnosis', 'discharge_condition',
#        'discharge_instruction'] 

df4model['combined'] = (df4model['discharge_disposition']+ df4model['discharge_diagnosis']
                        + df4model['discharge_condition']+ df4model['discharge_instruction']
                     +df4model['history_present_illness'] +df4model['past_medcal_history']
                        +df4model['family_history']+df4model['physical_exam']
                   + df4model['pertinent_results']+  df4model['hospital_course']
                   +df4model['medication_on_admission']+df4model['medication_on_discharge']) 

train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.discharge_instruction.fillna(' ')
X_test = test.discharge_instruction.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [119]:
SVC_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(LinearSVC(), n_jobs=1)),
            ])
var_pre = {}

for category in categories:
    print('... Processing {}'.format(category))
   
    SVC_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = SVC_pipeline.predict(X_test)
    var_pre[category] = prediction
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
svc_pred = pd.DataFrame(var_pre)

print(svc_pred)

... Processing icd_E11
Test accuracy is 0.706283509948221
... Processing icd_E78
Test accuracy is 0.6038966257563059
... Processing icd_E87
Test accuracy is 0.7989804939093695
... Processing icd_F32
Test accuracy is 0.7964546905316816
... Processing icd_I16
Test accuracy is 0.7096818635836557
... Processing icd_I50
Test accuracy is 0.8539626410718592
... Processing icd_N17
Test accuracy is 0.8379237896235405
... Processing icd_Y92
Test accuracy is 0.825409581979541
... Processing icd_Z85
Test accuracy is 0.8006107851804227
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        0        0        0        0        0        0        0   
1            0        0        0        0        1        0        0        0   
2            0        0        0        0        1        0        0        0   
3            0        1        1        0        0        0        1        0   
4            0        0        0        0        0        0       

In [121]:
y_pred7 = svc_pred.to_numpy()

print('Accuracy Score with the discharge_instruction and liner-SVC is',  Accuracy(y_true, y_pred7))
print('Hamming_loss with the discharge_instruction and liner-SVC is', Hamming_Loss(y_true,y_pred7))


Accuracy Score with the all variable combined is 0.2713272522703546
Hamming_loss with the all variable combined is 0.22964400204615587


In [124]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])

var_pre = {}

for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    var_pre[category] = prediction
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

logreg_pred = pd.DataFrame(var_pre)

print(logreg_pred)

... Processing icd_E11
Test accuracy is 0.7147105084901436
... Processing icd_E78
Test accuracy is 0.6142524196048266
... Processing icd_E87
Test accuracy is 0.8028151226736777
... Processing icd_F32
Test accuracy is 0.8001515482026613
... Processing icd_I16
Test accuracy is 0.7136313015924042
... Processing icd_I50
Test accuracy is 0.8563621542806626
... Processing icd_N17
Test accuracy is 0.8410466010723183
... Processing icd_Y92
Test accuracy is 0.8276024385483519
... Processing icd_Z85
Test accuracy is 0.8033432451981034
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        0        0        0        0        0        0        0   
1            0        0        0        0        1        0        0        0   
2            0        1        0        0        1        0        0        0   
3            0        0        1        0        0        0        1        0   
4            0        0        0        0        0        0     

In [164]:
y_pred8 = logreg_pred.to_numpy()

print('Accuracy Score with the discharge_instruction and log_reg is',  Accuracy(y_true, y_pred8))
print('Hamming_loss with the discharge_instruction and log_reg is', Hamming_Loss(y_true,y_pred8))

Accuracy Score with the discharge_instruction and log_reg is 0.2608418879888222
Hamming_loss with the discharge_instruction and log_reg is 0.22507586977570102


In [126]:
print(y_test.columns, 'lorreg model discharge_instruction', ((abs(np.diff([y_true, y_pred8], axis=0))).sum(axis=1)/len(y_test))*100)

print ( 'percent of wrongly predicted by logreg model for discharge_instruction',((abs(np.diff([y_true, y_pred8], axis=0)).sum(axis=2))/9).mean())

((abs(np.diff([y_true, y_pred], axis=0)).sum(axis=2))/9)

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') lorreg model discharge_instruction [[28.52894915 38.57475804 19.71848773 19.98484518 28.63686984 14.36378457
  15.89533989 17.23975615 19.66567548]]
percent of wrongly predicted by logreg model for discharge_instruction 0.2251205178152056


array([[0.22222222, 0.22222222, 0.33333333, ..., 0.11111111, 0.55555556,
        0.11111111]])

In [None]:
vector = CountVectorizer(stop_words=stopwords.words('english')) #doing countvectorizer after removing stop words

vector.fit(X_train)
X_train_vector = vector.transform(X_train)

X_test_vector = vector.transform(X_test)

NB = MultinomialNB().fit(X_train_vector, y_train)

y_pred = NB.predict(X_test_vector)

print(accuracy_score(y_test, y_pred))
confusion_matrix(y_test, y_pred)

In [127]:
categories = ['icd_E11', 'icd_E78','icd_E87', 'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92','icd_Z85']

# variables = [ 'history_present_illness', 'past_medcal_history', 'social_history', 'family_history',
#        'physical_exam', 'pertinent_results', 'hospital_course',
#        'medication_on_admission', 'medication_on_discharge',
#        'discharge_disposition', 'discharge_diagnosis', 'discharge_condition',
#        'discharge_instruction'] 

df4model['combined'] = (df4model['discharge_disposition']+ df4model['discharge_diagnosis']
                        + df4model['discharge_condition']+ df4model['discharge_instruction']
                     +df4model['history_present_illness'] +df4model['past_medcal_history']
                        +df4model['family_history']+df4model['physical_exam']
                   + df4model['pertinent_results']+  df4model['hospital_course']
                   +df4model['medication_on_admission']+df4model['medication_on_discharge']) 

train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.discharge_instruction.fillna(' ')
X_test = test.discharge_instruction.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [130]:
vector = CountVectorizer(stop_words=stopwords.words('english'))
vector

In [131]:
X_train_vector = vector.fit_transform(X_train)

In [132]:
#we want to take a look at all of the tokens that the CountVectorizer has seen, we can look at its vocabulary. Check the vocabulary_ attribute.

vocb_dic = vector.vocabulary_
vocb_dic

{'admitted': 3553,
 'abdominal': 2572,
 'pain': 38193,
 'due': 18521,
 'hepatitis': 25181,
 'seen': 46946,
 'found': 22850,
 'blood': 8543,
 'explains': 20945,
 'acute': 3244,
 'discussed': 17197,
 'important': 26982,
 'call': 9647,
 'follow': 22521,
 'liver': 31056,
 'also': 4275,
 'please': 40149,
 'schedule': 46648,
 'appointment': 5741,
 'hospital': 25685,
 'rehab': 44452,
 'facility': 21218,
 'weakness': 57069,
 'concerning': 12756,
 'stroke': 50096,
 'got': 23985,
 'neurology': 35256,
 'specialists': 48817,
 'felt': 21589,
 'symptoms': 51411,
 'find': 21895,
 'small': 48350,
 'clot': 11913,
 'lungs': 31521,
 'started': 49341,
 'thinner': 52774,
 'increased': 27365,
 'medications': 32439,
 'hope': 25605,
 'able': 2637,
 'continue': 13340,
 'working': 57848,
 'physical': 39754,
 'therapy': 52609,
 'see': 46926,
 'management': 31864,
 'doctors': 17709,
 'better': 8015,
 'control': 13455,
 'symptomsit': 51430,
 'pleasure': 40179,
 'taking': 51766,
 'care': 10035,
 'team': 51991,
 'ho

In [133]:
#How many total tokens are there? #Total count of token(words) in the vocabulary: 72222

key_sum = len(vocb_dic.keys())
print("Total count of token(words) in the vocabulary:", key_sum)

Total count of token(words) in the vocabulary: 58552


In [135]:
# finding the frequecy of the token

# Fill this in to build a DataFrame of words and their counts
word_counts = pd.DataFrame({
    'words': vector.get_feature_names_out(), #sorted(vocb_dic.keys()) can also give it in the same order
    #becoz the vocb_dic is not in the same order for count(frequency) so we can not use the dic.keys for word to match the grefuency
    'frequency': np.array(X_train_vector.sum(axis = 0)).flatten()
})

word_counts

Unnamed: 0,words,frequency
0,00,241
1,000,140
2,0000,6
3,0003,1
4,00085,1
...,...,...
58547,ójala,1
58548,úlcera,4
58549,úlceras,1
58550,último,2


In [142]:
word_counts.sort_values('frequency', ascending=False).head()

Unnamed: 0,words,frequency
40149,please,223890
38193,pain,164182
32439,medications,136740
51725,take,135471
10035,care,130108


In [153]:
#numeric tokens
(word_counts[word_counts['words'].str.isnumeric()].sort_values('frequency',ascending=False).head())
#  .nunique()) #there are 796 numeric tokens.

Unnamed: 0,words,frequency
97,10,23631
147,101,13125
1040,24,4172
865,20,3584
336,12,3487


In [161]:
#Non numeric toekns in the discharge_instructions
(word_counts[~word_counts['words'].str.isnumeric()].
 sort_values('frequency',ascending=False).head(10)
)

#there are 57756 non-numeric tokens

Unnamed: 0,words,frequency
40149,please,223890
38193,pain,164182
32439,medications,136740
51725,take,135471
10035,care,130108
8543,blood,117235
25685,hospital,117127
3553,admitted,109765
51766,taking,97835
32364,medication,97105


In [155]:
df4model['dis_instr_nonumber'] = (df4model['discharge_instruction'].str.replace('X', '')
 .str.replace('//', '')
 .str.replace('.00', '')
 .str.replace('xxxx', '')
 .str.replace('xx', '')
 .str.replace('\d+', ''))

  df4model['dis_instr_nonumber'] = (df4model['discharge_instruction'].str.replace('X', '')


In [157]:
df4model['combined_nonumber']= (df4model['combined'].str.replace('X', '')
 .str.replace('//', '')
 .str.replace('.00', '')
 .str.replace('xxxx', '')
 .str.replace('xx', '')
 .str.replace('\d+', ''))

  df4model['combined_nonumber']= (df4model['combined'].str.replace('X', '')


In [158]:
train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.dis_instr_nonumber.fillna(' ')
X_test = test.dis_instr_nonumber.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [159]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])

var_pre = {}

for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    var_pre[category] = prediction
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

logreg_pred = pd.DataFrame(var_pre)

print(logreg_pred)

... Processing icd_E11
Test accuracy is 0.7144234853790428
... Processing icd_E78
Test accuracy is 0.6144246334714871
... Processing icd_E87
Test accuracy is 0.8027806799003456
... Processing icd_F32
Test accuracy is 0.8003352429937659
... Processing icd_I16
Test accuracy is 0.7136542634412923
... Processing icd_I50
Test accuracy is 0.8564195589028829
... Processing icd_N17
Test accuracy is 0.8412991814100872
... Processing icd_Y92
Test accuracy is 0.8274187437572473
... Processing icd_Z85
Test accuracy is 0.8035728636869841
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        0        0        0        0        0        0        0   
1            0        0        0        0        1        0        0        0   
2            0        1        0        0        1        0        0        0   
3            0        0        1        0        0        0        1        0   
4            0        0        0        0        0        0     

In [162]:
train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.dis_instr_nonumber.fillna(' ').str.lower()
X_test = test.dis_instr_nonumber.fillna(' ').str.lower()
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [165]:
LogReg_pipeline = Pipeline([
                ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
                ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=1)),
            ])

var_pre = {}

for category in categories:
    print('... Processing {}'.format(category))
    # train the model using X_dtm & y
    LogReg_pipeline.fit(X_train, train[category])
    # compute the testing accuracy
    prediction = LogReg_pipeline.predict(X_test)
    var_pre[category] = prediction
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

logreg_pred = pd.DataFrame(var_pre)

print(logreg_pred)

... Processing icd_E11
Test accuracy is 0.7144234853790428
... Processing icd_E78
Test accuracy is 0.6144131525470431
... Processing icd_E87
Test accuracy is 0.8027806799003456
... Processing icd_F32
Test accuracy is 0.8003352429937659
... Processing icd_I16
Test accuracy is 0.7136657443657364
... Processing icd_I50
Test accuracy is 0.8564195589028829
... Processing icd_N17
Test accuracy is 0.8412991814100872
... Processing icd_Y92
Test accuracy is 0.8274187437572473
... Processing icd_Z85
Test accuracy is 0.8035728636869841
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        0        0        0        0        0        0        0   
1            0        0        0        0        1        0        0        0   
2            0        1        0        0        1        0        0        0   
3            0        0        1        0        0        0        1        0   
4            0        0        0        0        0        0     

In [166]:
y_pred9 = logreg_pred.to_numpy()

print('Accuracy Score with the discharge_instruction and liner-SVC is',  Accuracy(y_true, y_pred9))
print('Hamming_loss with the discharge_instruction and liner-SVC is', Hamming_Loss(y_true,y_pred9))

Accuracy Score with the discharge_instruction and liner-SVC is 0.2608438014762295
Hamming_loss with the discharge_instruction and liner-SVC is 0.22507459411742944


In [None]:
#we are going to use the 