In [1]:
import pandas as pd
from collections import Counter
import re
import numpy as np

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt


import glob
from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

from scipy import sparse
from scipy.sparse.linalg import svds


from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize
from nltk.corpus import stopwords

import nltk
nltk.download('stopwords')

import nltk
nltk.download('punkt')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import seaborn as sns


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\upadh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\upadh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Load the modelling dataset from a CSV located relative to the notebook's working directory.
df4model = pd.read_csv('../data/df4model.csv')

In [17]:
# Label columns to predict (one 0/1 indicator column per ICD code group).
categories = ['icd_E11', 'icd_E78','icd_E87', 'icd_F32', 'icd_I16', 'icd_I50', 'icd_N17', 'icd_Y92','icd_Z85']

# Free-text note sections available as candidate model inputs.
# NOTE(review): 'past_medcal_history' is kept exactly as written -- confirm it
# matches the column name in the CSV before "fixing" the spelling.
variables = [ 'history_present_illness', 'past_medcal_history', 'social_history', 'family_history',
       'physical_exam', 'pertinent_results', 'hospital_course',
       'medication_on_admission', 'medication_on_discharge',
       'discharge_disposition', 'discharge_diagnosis', 'discharge_condition',
       'discharge_instruction']

# Merge the four discharge sections into one text field.
# Plain `+` on object columns propagates NaN: a single missing section would
# turn the whole merged text into NaN (and later into a blank document).
# Fill each section first, cast to str, and join with a space so the last word
# of one section does not fuse with the first word of the next.
discharge_columns = ['discharge_disposition', 'discharge_diagnosis',
                     'discharge_condition', 'discharge_instruction']
discharge_text = df4model[discharge_columns].fillna('').astype(str)
df4model['dis_mer'] = discharge_text[discharge_columns[0]].str.cat(
    [discharge_text[col] for col in discharge_columns[1:]], sep=' ')

# Hold out a third of the rows; the fixed seed keeps the split reproducible.
train, test = train_test_split(df4model, random_state=42, test_size=0.33, shuffle=True)
X_train = train.discharge_instruction.fillna(' ')
X_test = test.discharge_instruction.fillna(' ')
y_train = train[categories]
y_test = test[categories]

print(X_train.shape)
print(X_test.shape)

(176840,)
(87101,)


In [18]:
# Train one Naive Bayes classifier per label.

# Pipeline: TF-IDF text features (English stop words removed) feeding a
# one-vs-rest multinomial Naive Bayes classifier.
NB_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(stop_words=list(stop_words))),
    ('clf', OneVsRestClassifier(MultinomialNB(fit_prior=True, class_prior=None))),
])

# Maps each label column name to its array of test-set predictions.
var_pre = {}

for category in categories:
    print('... Processing {}'.format(category))

    # The same pipeline object is refit from scratch on this label's target,
    # then used to predict the held-out rows.
    prediction = NB_pipeline.fit(X_train, train[category]).predict(X_test)
    var_pre[category] = prediction

    # Per-label test accuracy.
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))

# One column per label. The frame gets a fresh 0..n-1 index, so its rows line
# up with `test` by position only, not by index label.
nb_pred = pd.DataFrame(var_pre)

print(nb_pred)

... Processing icd_E11
Test accuracy is 0.7027359042950139
... Processing icd_E78
Test accuracy is 0.609970034787201
... Processing icd_E87
Test accuracy is 0.7906568236874433
... Processing icd_F32
Test accuracy is 0.7970861413761036
... Processing icd_I16
Test accuracy is 0.6978794732551865
... Processing icd_I50
Test accuracy is 0.8417584183878486
... Processing icd_N17
Test accuracy is 0.8223786179263154
... Processing icd_Y92
Test accuracy is 0.8178780955442532
... Processing icd_Z85
Test accuracy is 0.8020459007359273
       icd_E11  icd_E78  icd_E87  icd_F32  icd_I16  icd_I50  icd_N17  icd_Y92  \
0            0        1        0        0        0        0        0        0   
1            0        0        0        0        1        0        0        0   
2            0        0        0        0        0        0        0        0   
3            0        0        0        0        0        0        1        0   
4            0        0        0        0        0        0      

In [11]:
# Peek at the true labels for the first five held-out rows.
y_test.head(5)

Unnamed: 0,icd_E11,icd_E78,icd_E87,icd_F32,icd_I16,icd_I50,icd_N17,icd_Y92,icd_Z85
4039,0,1,0,0,1,1,0,0,0
196957,0,0,0,1,1,0,0,1,0
68719,1,1,0,0,1,0,0,0,0
18633,0,1,0,0,0,1,1,0,0
92713,1,0,0,0,0,0,0,0,0


In [12]:
# Peek at the first five predicted label rows. nb_pred carries a 0..n-1 index,
# so compare it with y_test by row position, not by index label.
nb_pred.head(5)

Unnamed: 0,icd_E11,icd_E78,icd_E87,icd_F32,icd_I16,icd_I50,icd_N17,icd_Y92,icd_Z85
0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0


In [13]:
# Convert both label frames to plain arrays; from here on rows are compared by position.
y_true = y_test.to_numpy()
y_pred = nb_pred.to_numpy()

In [16]:
def Accuracy(y_true, y_pred):
    """Example-based multi-label accuracy (mean per-row Jaccard index).

    For each sample the score is ``|true AND pred| / |true OR pred|``; the
    per-sample scores are then averaged over all rows.

    A row where both the true and the predicted label sets are empty has an
    empty union. It is scored as 1.0 (the prediction matches the truth exactly)
    instead of evaluating 0/0, which would yield NaN and turn the whole
    average into NaN.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Label indicator matrices. Any non-zero entry counts as "label set".

    Returns
    -------
    float
        Mean per-sample Jaccard score, between 0 and 1.

    Raises
    ------
    ValueError
        If the inputs are not 2-D or their shapes differ.
    ZeroDivisionError
        If the inputs contain zero rows.
    """
    true_mask = np.asarray(y_true).astype(bool)
    pred_mask = np.asarray(y_pred).astype(bool)
    if true_mask.ndim != 2 or true_mask.shape != pred_mask.shape:
        raise ValueError(
            'y_true and y_pred must be 2-D with identical shapes, got {} and {}'.format(
                true_mask.shape, pred_mask.shape))

    # Row-wise set sizes, computed for all samples at once.
    intersection = np.logical_and(true_mask, pred_mask).sum(axis=1)
    union = np.logical_or(true_mask, pred_mask).sum(axis=1)

    # Start every row at 1.0 (the empty-vs-empty case) and overwrite only the
    # rows that have a non-empty union, so no 0/0 is ever evaluated.
    scores = np.ones(union.shape[0], dtype=float)
    has_labels = union > 0
    scores[has_labels] = intersection[has_labels] / union[has_labels]

    return float(scores.sum()) / true_mask.shape[0]
    
# Report the Accuracy() score for the model trained on the discharge_instruction text.
print('Accuracy Score with the discharge_instruction as variable is',  Accuracy(y_true, y_pred))

Accuracy Score with the discharge_instruction as variable is 0.17546106390265603


In [15]:
def Hamming_Loss(y_true, y_pred):
    """Hamming loss: fraction of individual label slots predicted incorrectly.

    Counts every (sample, label) position where ``y_true`` and ``y_pred``
    differ and divides by the total number of positions.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        True and predicted label matrices.

    Returns
    -------
    float
        Share of mismatching entries, between 0 and 1.

    Raises
    ------
    ValueError
        If the inputs are not 2-D or their shapes differ (previously extra
        rows in ``y_pred`` were silently ignored).
    ZeroDivisionError
        If the inputs contain no entries.
    """
    true_labels = np.asarray(y_true)
    pred_labels = np.asarray(y_pred)
    if true_labels.ndim != 2 or true_labels.shape != pred_labels.shape:
        raise ValueError(
            'y_true and y_pred must be 2-D with identical shapes, got {} and {}'.format(
                true_labels.shape, pred_labels.shape))

    # One vectorised comparison replaces the per-row Python loop.
    mismatches = int(np.count_nonzero(true_labels != pred_labels))
    return mismatches / (true_labels.shape[0] * true_labels.shape[1])
    
# Report the Hamming_Loss() score for the model trained on the discharge_instruction text.
print('Hamming_loss with the discharge_instruction as variable is', Hamming_Loss(y_true, y_pred))

Hamming_loss with the discharge_instruction as variable is 0.23529006555607856


**After cleaning the `discharge_instruction` strings, both scores got slightly worse:
the accuracy score dropped from 0.1901903837963846 to 0.17546106390265603,
and the Hamming loss rose from 0.23433714882722356 to 0.23529006555607856.**

In [24]:
# Per-label error rate in percent: for 0/1 labels, |pred - true| is 1 exactly
# where they disagree, so summing over rows counts the wrong rows per label.
label_diff = np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0)  # (1, n_rows, n_labels)
wrong_per_label = abs(label_diff).sum(axis=1)
(wrong_per_label / len(y_test)) * 100

array([[29.72640957, 39.00299652, 20.93431763, 20.29138586, 30.21205267,
        15.82415816, 17.76213821, 18.21219045, 19.79540993]])

In [19]:
# Same per-label error percentages, printed next to the label names for reference.
label_diff = np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0)
error_pct = ((abs(label_diff)).sum(axis=1) / len(y_test)) * 100
print(y_test.columns, error_pct)

Index(['icd_E11', 'icd_E78', 'icd_E87', 'icd_F32', 'icd_I16', 'icd_I50',
       'icd_N17', 'icd_Y92', 'icd_Z85'],
      dtype='object') [[29.72640957 39.00299652 20.93431763 20.29138586 30.21205267 15.82415816
  17.76213821 18.21219045 19.79540993]]


In [32]:
# Shape check of the raw difference array. np.diff over the stacked pair
# [y_test, nb_pred] along axis 0 keeps a leading axis of length 1, i.e.
# (1, n_rows, n_labels); that is why other cells reduce over axis=1 / axis=2.
# NOTE(review): the saved output of this cell shows a 2-D shape, so it appears
# to predate the current expression -- re-run to confirm.
np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0).shape


(87101, 9)

In [44]:
# Average share of the 9 labels that is wrong per test row (this is the
# Hamming loss computed another way). On average, about two of the nine
# ICD labels are predicted incorrectly for a row.
label_diff = np.diff([y_test.to_numpy(), nb_pred.to_numpy()], axis=0)
wrong_per_row = abs(label_diff).sum(axis=2)
(wrong_per_row / 9).mean()

0.23529006555607862

In [None]:
# Switch the model input to the merged discharge text (`dis_mer`).
# Same seed and test size as before, so the row split is identical.
train, test = train_test_split(df4model, test_size=0.33, random_state=42, shuffle=True)

# Missing text becomes a single space.
X_train, X_test = train['dis_mer'].fillna(' '), test['dis_mer'].fillna(' ')
y_train, y_test = train[categories], test[categories]

print(X_train.shape)
print(X_test.shape)