In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
DIR_ = "D:/Drive/OneDrive/Documents/MIMIC"
DIR = "D:/Workspace/MIMIC_DATA/mimic-iii-clinical-database-1.4/"

In [3]:
adm_notes = pd.read_csv(DIR + "readmission.csv", low_memory=False)

# Natural Language

In [4]:
import string
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [5]:
def clean_text(texts):
    texts = texts.fillna(' ')
    texts = texts.str.replace('\n',' ')
    texts = texts.str.replace('\r',' ')

    table = str.maketrans('', '', string.punctuation + '0123456789')
    texts = [text.lower().translate(table) for text in texts]

    return texts

In [6]:
adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])

In [7]:
stop_words = stopwords.words('english')
stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']

In [8]:
porter = PorterStemmer()

In [9]:
def tokenize_stem(text):
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [porter.stem(word) for word in words]
    return words

# Model
## Words, Train and Test

In [10]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_validate
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV


Repartition data

In [11]:
df_train, df_test = train_test_split(adm_notes, test_size=0.2)

Subsample non-readmitted patients to match size of readmitted ones

In [12]:
rows_pos = df_train['READM_WITHIN_30'] == 1
df_train_pos = df_train.loc[rows_pos]
df_train_neg = df_train.loc[~rows_pos]

df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos))], axis = 0)
df_train = df_train.sample(n = len(df_train)).reset_index(drop = True)

In [13]:
rows_pos = df_test['READM_WITHIN_30'] == 1
df_test_pos = df_test.loc[rows_pos]
df_test_neg = df_test.loc[~rows_pos]

df_test = pd.concat([df_test_pos, df_test_neg.sample(n = len(df_test_pos))], axis = 0)
df_test = df_test.sample(n = len(df_test)).reset_index(drop = True)

In [14]:
print(df_train.shape)
print(df_test.shape)

(4736, 10)
(1190, 10)


Sparse Matrix with word count

In [15]:

# Assuming df_train and df_test are already defined and contain the required columns

# Vectorization
vect = CountVectorizer(lowercase=True, max_features=1000, tokenizer=tokenize_stem, token_pattern=None)

# Fit the vectorizer
vect.fit(df_train['TEXT'].values)

# Transform the text data
X_train_tf = vect.transform(df_train['TEXT'].values)
X_test_tf = vect.transform(df_test['TEXT'].values)


# Extract target variables
y_train = df_train['READM_WITHIN_30']
y_test = df_test['READM_WITHIN_30']

     

In [16]:
metrics = ['roc_auc', 'accuracy', 'precision', 'recall']

## Support Vector Machine

In [17]:
grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    #'C': np.logspace(-3, 3, 7),
    'gamma': np.logspace(-3, 3, 7)
}

model_svm = GridSearchCV(SVC(), param_grid=grid)
model_svm.fit(X_test_tf, y_test)

GridSearchCV(estimator=SVC(),
             param_grid={'gamma': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
                         'kernel': ['linear', 'poly', 'rbf']})

In [18]:
model_svm.best_params_

{'gamma': 0.001, 'kernel': 'rbf'}

In [19]:
model_svm = SVC(C=0.3, gamma=0.001, kernel='rbf')
scores_svm = cross_validate(model_svm, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True)

In [20]:
print('Set\t\t', 'Train\t\t', '\t Sd Train')
print('AUC\t\t', np.mean(scores_svm['train_roc_auc']), '\t', np.std(scores_svm['train_roc_auc']))
print('Accuracy\t', np.mean(scores_svm['train_accuracy']), '\t', np.std(scores_svm['train_accuracy']))
print('Precission\t', np.mean(scores_svm['train_precision']), '\t', np.std(scores_svm['train_precision']))
print('Recall\t\t', np.mean(scores_svm['train_recall']), '\t', np.std(scores_svm['train_recall']))

Set		 Train		 	 Sd Train
AUC		 0.8130068530783865 	 0.0028869431409570996
Accuracy	 0.6855996057057291 	 0.0021547316301761374
Precission	 0.662021267313204 	 0.0028531636733218264
Recall		 0.662021267313204 	 0.0028531636733218264


In [None]:
plt.plot(fpr, tpr, label='Train')
plt.plot(fprt, tprt, label='Test')
plt.title("ROC Curve")
plt.legend(loc="lower right")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")

plt.savefig('svm_train.png')  

In [21]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_svm['test_roc_auc']), '\t', np.std(scores_svm['test_roc_auc']))
print('Accuracy\t', np.mean(scores_svm['test_accuracy']), '\t', np.std(scores_svm['test_accuracy']))
print('Precission\t', np.mean(scores_svm['test_precision']), '\t', np.std(scores_svm['test_precision']))
print('Recall\t\t', np.mean(scores_svm['test_recall']), '\t', np.std(scores_svm['test_recall']))

Set		 Test		 	 Sd Test
AUC		 0.6458777265995475 	 0.014052480146093211
Accuracy	 0.6055730064739194 	 0.015416153576061173
Precission	 0.5894209949290333 	 0.013265640502348805
Recall		 0.6963684534482296 	 0.018063937139110645


## Multi-layer Perceptron

In [22]:

# Define the parameter grid
param_dist = {
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'solver': ['adam', 'sgd'],
    'learning_rate_init': [0.01, 0.001, 0.0001]
}

# Initialize the RandomizedSearchCV with the MLPClassifier
model_mlp = RandomizedSearchCV(MLPClassifier(hidden_layer_sizes=(10, 2), max_iter=1000), param_distributions=param_dist, cv=5, n_iter=10, n_jobs=-1, verbose=2)

# Fit the model to the training data
model_mlp.fit(X_train_tf, y_train)

# Access the best parameters found by the randomized search
best_params = model_mlp.best_params_
print("Best parameters found by randomized search:", best_params)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best parameters found by randomized search: {'solver': 'adam', 'learning_rate_init': 0.001, 'alpha': 0.001}


In [23]:
# Initialize the MLPClassifier with the best parameters and increased max_iter
model_mlp_best = MLPClassifier(hidden_layer_sizes=(10, 2), 
                               alpha=best_params['alpha'], 
                               learning_rate_init=best_params['learning_rate_init'], 
                               solver=best_params['solver'], 
                               max_iter=1000)

# Define the metrics for cross-validation
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']

# Perform cross-validation
scores_mlp = cross_validate(model_mlp_best, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True, n_jobs=-1, verbose=2)


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   12.4s remaining:   18.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.7s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   16.7s finished


In [24]:

# Print cross-validation results
print('Set\t\t', 'Train\t\t', 'Sd Train')
print('AUC\t\t', np.mean(scores_mlp['train_roc_auc']), '\t', np.std(scores_mlp['train_roc_auc']))
print('Accuracy\t', np.mean(scores_mlp['train_accuracy']), '\t', np.std(scores_mlp['train_accuracy']))
print('Precision\t', np.mean(scores_mlp['train_precision']), '\t', np.std(scores_mlp['train_precision']))
print('Recall\t\t', np.mean(scores_mlp['train_recall']), '\t', np.std(scores_mlp['train_recall']))
print('F1 Score\t', np.mean(scores_mlp['train_f1']), '\t', np.std(scores_mlp['train_f1']))

Set		 Train		 Sd Train
AUC		 0.9657754597110657 	 0.05173648077126075
Accuracy	 0.9391906154173297 	 0.048347474457968054
Precision	 0.9993660670007303 	 0.0005520549699767634
Recall		 0.8789064202188273 	 0.09649267520278766
F1 Score	 0.9323968125810411 	 0.056456520655302975


In [25]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_mlp['test_roc_auc']), '\t', np.std(scores_mlp['test_roc_auc']))
print('Accuracy\t', np.mean(scores_mlp['test_accuracy']), '\t', np.std(scores_mlp['test_accuracy']))
print('Precission\t', np.mean(scores_mlp['test_precision']), '\t', np.std(scores_mlp['test_precision']))
print('Recall\t\t', np.mean(scores_mlp['test_recall']), '\t', np.std(scores_mlp['test_recall']))
print('F1 Score\t', np.mean(scores_mlp['test_f1']), '\t', np.std(scores_mlp['test_f1']))

Set		 Test		 	 Sd Test
AUC		 0.619740610646585 	 0.011481758410409682
Accuracy	 0.5920557478869537 	 0.013836942434386686
Precission	 0.6006336194806657 	 0.0160493266706609
Recall		 0.5510807218490469 	 0.02680437929760938
F1 Score	 0.5743881472326804 	 0.016980076531496153
