In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

In [2]:
DIR_ = "D:/Drive/OneDrive/Documents/MIMIC"
DIR = "D:/Workspace/MIMIC_DATA/mimic-iii-clinical-database-1.4/"

In [3]:
adm_notes = pd.read_csv(DIR + "readmission.csv", low_memory=False)

# Natural Language

In [4]:
import string
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [5]:
def clean_text(texts):
    texts = texts.fillna(' ')
    texts = texts.str.replace('\n',' ')
    texts = texts.str.replace('\r',' ')

    table = str.maketrans('', '', string.punctuation + '0123456789')
    texts = [text.lower().translate(table) for text in texts]

    return texts

In [6]:
adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])

In [7]:
stop_words = stopwords.words('english')
stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']

In [8]:
porter = PorterStemmer()

In [9]:
def tokenize_stem(text):
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [porter.stem(word) for word in words]
    return words

# Model
## Words, Train and Test

In [10]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_validate
from xgboost import XGBClassifier

Repartition data

In [11]:
df_train, df_test = train_test_split(adm_notes, test_size=0.3)

Subsample non-readmitted patients to match size of readmitted ones

In [12]:
rows_pos = df_train['READM_WITHIN_30'] == 1
df_train_pos = df_train.loc[rows_pos]
df_train_neg = df_train.loc[~rows_pos]

df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos))], axis = 0)
df_train = df_train.sample(n = len(df_train)).reset_index(drop = True)

In [13]:
rows_pos = df_test['READM_WITHIN_30'] == 1
df_test_pos = df_test.loc[rows_pos]
df_test_neg = df_test.loc[~rows_pos]

df_test = pd.concat([df_test_pos, df_test_neg.sample(n = len(df_test_pos))], axis = 0)
df_test = df_test.sample(n = len(df_test)).reset_index(drop = True)

In [14]:
df_test, df_valid = train_test_split(df_test, test_size=0.5)

In [15]:
print(df_train.shape)
print(df_test.shape)
print(df_valid.shape)

(4150, 10)
(888, 10)
(888, 10)


Sparse Matrix with word count

In [16]:
# Tokenize and stem function
def tokenize_stem(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

# Assuming df_train and df_test are already defined and contain the required columns

# Vectorization
vect = CountVectorizer(lowercase=True, max_features=1000, tokenizer=tokenize_stem, token_pattern=None)

# Fit the vectorizer
vect.fit(df_train['TEXT'].values)

# Transform the text data
X_train_tf = vect.transform(df_train['TEXT'].values)
X_test_tf = vect.transform(df_test['TEXT'].values)
X_valid_tf = vect.transform(df_valid['TEXT'].values)

# Extract target variables
y_train = df_train['READM_WITHIN_30']
y_test = df_test['READM_WITHIN_30']
y_valid = df_valid['READM_WITHIN_30']
     


## Support Vector Machine

In [17]:
grid = {
    'kernel': ['linear', 'poly', 'rbf'],
    'C': [0.1, 1, 2]
}

model_svm = GridSearchCV(SVC(), param_grid=grid)
model_svm.fit(X_test_tf, y_test)

GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.1, 1, 2], 'kernel': ['linear', 'poly', 'rbf']})

In [18]:
model_svm.best_params_

{'C': 1, 'kernel': 'rbf'}

In [19]:
model_svm = SVC(kernel='rbf', C=1).fit(X_train_tf, y_train)

In [20]:
y_train_preds = model_svm.predict(X_train_tf)
y_valid_preds = model_svm.predict(X_valid_tf)

In [21]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [22]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.7146987951807229 0.6241274350649351
Accuracy	 0.7146987951807229 0.6238738738738738
Precission	 0.7507034327518289 0.6357142857142857
Recall		 0.6428915662650603 0.5959821428571429


## Multi-layer Perceptron

In [24]:
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

# Define the parameter grid
grid = {
    'alpha': [1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
    'solver': ['adam', 'sgd', 'lbfgs'],
    'learning_rate_init': [0.01, 0.001, 0.0001]
}

# Initialize the GridSearchCV with the MLPClassifier
model_mlp = GridSearchCV(MLPClassifier(hidden_layer_sizes=(10, 2), max_iter=6000), param_grid=grid, cv=5)

# Fit the model to the training data
model_mlp.fit(X_train_tf, y_train)

# Access the best parameters found by the grid search
best_params = model_mlp.best_params_
print(best_params)

{'alpha': 0.1, 'learning_rate_init': 0.001, 'solver': 'adam'}


In [25]:
model_mlp.best_params_

{'alpha': 0.1, 'learning_rate_init': 0.001, 'solver': 'adam'}

In [26]:
model_mlp = MLPClassifier(hidden_layer_sizes=(10,2), alpha=0.1, learning_rate_init=0.001, solver='lbfgs').fit(X_train_tf, y_train)

In [27]:
y_train_preds = model_mlp.predict(X_train_tf)
y_valid_preds = model_mlp.predict(X_valid_tf)

In [28]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [29]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.5202409638554217 0.5042410714285714
Accuracy	 0.5202409638554217 0.5
Precission	 0.9772727272727273 0.5769230769230769
Recall		 0.04144578313253012 0.033482142857142856


## Random Forest

In [30]:
grid = {
    'max_depth': [10, 20, 30, 40],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 5],
    'n_estimators': [100, 200, 300]
}
model_rf = GridSearchCV(RandomForestClassifier(bootstrap=False), param_grid=grid)
model_rf.fit(X_test_tf, y_test)

GridSearchCV(estimator=RandomForestClassifier(bootstrap=False),
             param_grid={'max_depth': [10, 20, 30, 40],
                         'max_features': ['auto', 'sqrt'],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 5],
                         'n_estimators': [100, 200, 300]})

In [31]:
model_rf.best_params_

{'max_depth': 20,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 100}

In [32]:
model_rf = RandomForestClassifier(
    bootstrap=False, max_depth=10, max_features='auto', min_samples_leaf=2, min_samples_split=2, n_estimators=100
).fit(X_train_tf, y_train)

In [33]:
y_train_preds = model_rf.predict(X_train_tf)
y_valid_preds = model_rf.predict(X_valid_tf)

In [34]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [35]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.9260240963855422 0.6179991883116882
Accuracy	 0.9260240963855422 0.6182432432432432
Precission	 0.9857142857142858 0.6162046908315565
Recall		 0.864578313253012 0.6450892857142857


## Naive Bayes

In [36]:
# Define the parameter grid
param_grid = {'alpha': [10, 1, 0.1, 0.01, 0.001]}

# Initialize the GridSearchCV with the MultinomialNB
grid_search_nb = GridSearchCV(MultinomialNB(), param_grid=param_grid, cv=3, n_jobs=-1)

# Fit the model to the training data
grid_search_nb.fit(X_train_tf, y_train)

# Get the best parameters
best_alpha = grid_search_nb.best_params_['alpha']
print(f"Best alpha: {best_alpha}")



GridSearchCV(estimator=MultinomialNB(),
             param_grid={'alpha': [10, 1, 0, 0.1, 0.01, 0.001]})

In [37]:
model_nb.best_params_

{'alpha': 0}

In [38]:
# model_nb = MultinomialNB(alpha=1)
model_nb = MultinomialNB(alpha=best_alpha).fit(X_train_tf, y_train)

In [39]:
y_train_preds = model_nb.predict(X_train_tf)
y_valid_preds = model_nb.predict(X_valid_tf)

In [40]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [41]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.6457831325301205 0.6150162337662337
Accuracy	 0.6457831325301204 0.6148648648648649
Precission	 0.6628971459343026 0.6232558139534884
Recall		 0.5932530120481928 0.5982142857142857


There is no improvement when using the whole Dataset with ComplementNB, same scores on test

## XGBoost

In [42]:
grid = {
    'min_child_weight': [1, 5],
    'gamma': [0.5, 1, 1.5],
    'max_depth': [3, 5],
    'eta': [0.1,0.01,0.001]
}

model_xgb = GridSearchCV(XGBClassifier(), param_grid=grid)
model_xgb.fit(X_test_tf, y_test)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     callbacks=None, colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     early_stopping_rounds=None,
                                     enable_categorical=False, eval_metric=None,
                                     feature_types=None, gamma=None,
                                     gpu_id=None, grow_policy=None,
                                     importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=None, max_b...one,
                                     max_cat_threshold=None,
                                     max_cat_to_onehot=None,
                                     max_delta_step=None, max_depth=None,
                                     max_leaves=None, min_ch

In [43]:
model_xgb.best_params_

{'eta': 0.01, 'gamma': 1, 'max_depth': 5, 'min_child_weight': 5}

In [44]:
model_xgb = XGBClassifier(
    min_child_weight=1, gamma=0.5, max_depth=3, eta=0.01
).fit(X_train_tf, y_train)

In [45]:
y_train_preds = model_xgb.predict(X_train_tf)
y_valid_preds = model_xgb.predict(X_valid_tf)

In [46]:
fpr, tpr, thresholds = metrics.roc_curve(y_train, y_train_preds)
fprt, tprt, thresholds = metrics.roc_curve(y_valid, y_valid_preds)

auc_train = metrics.auc(fpr, tpr)
auc_test = metrics.auc(fprt, tprt)

acc_train = metrics.accuracy_score(y_train, y_train_preds)
acc_test = metrics.accuracy_score(y_valid, y_valid_preds)

prec_train = metrics.precision_score(y_train, y_train_preds)
prec_test = metrics.precision_score(y_valid, y_valid_preds)

rec_train = metrics.recall_score(y_train, y_train_preds)
rec_test = metrics.recall_score(y_valid, y_valid_preds)

In [47]:
print('Set\t\t', 'Train\t\t', '   Test')
print('AUC\t\t', auc_train, auc_test)
print('Accuracy\t', acc_train, acc_test)
print('Precission\t', prec_train, prec_test)
print('Recall\t\t', rec_train, rec_test)

Set		 Train		    Test
AUC		 0.6761445783132531 0.6070616883116883
Accuracy	 0.676144578313253 0.6069819819819819
Precission	 0.6881111682964488 0.6132723112128147
Recall		 0.6443373493975904 0.5982142857142857
