In [1]:
import pandas as pd
import numpy as np
import re
import datetime
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
DIR_ = "D:/Drive/OneDrive/Documents/MIMIC"
DIR = "D:/Workspace/MIMIC_DATA/mimic-iii-clinical-database-1.4/"

In [3]:
adm_notes = pd.read_csv(DIR + "readmission.csv", low_memory=False)

# Natural Language

In [4]:
import string
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [5]:
def clean_text(texts):
    texts = texts.fillna(' ')
    texts = texts.str.replace('\n',' ')
    texts = texts.str.replace('\r',' ')

    table = str.maketrans('', '', string.punctuation + '0123456789')
    texts = [text.lower().translate(table) for text in texts]

    return texts

In [6]:
adm_notes['TEXT'] = clean_text(adm_notes['TEXT'])

In [7]:
stop_words = stopwords.words('english')
stop_words = stop_words + ['patient', 'date', 'admission', 'discharge', 'lastname', 'firstname', 'sex']

In [8]:
porter = PorterStemmer()

In [9]:
def tokenize_stem(text):
    words = word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    words = [porter.stem(word) for word in words]
    return words

# Model
## Words, Train and Test

In [10]:
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, cross_validate
from xgboost import XGBClassifier


Repartition data

In [11]:
df_train, df_test = train_test_split(adm_notes, test_size=0.2)

Subsample non-readmitted patients to match size of readmitted ones

In [12]:
rows_pos = df_train['READM_WITHIN_30'] == 1
df_train_pos = df_train.loc[rows_pos]
df_train_neg = df_train.loc[~rows_pos]

df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos))], axis = 0)
df_train = df_train.sample(n = len(df_train)).reset_index(drop = True)

In [13]:
rows_pos = df_test['READM_WITHIN_30'] == 1
df_test_pos = df_test.loc[rows_pos]
df_test_neg = df_test.loc[~rows_pos]

df_test = pd.concat([df_test_pos, df_test_neg.sample(n = len(df_test_pos))], axis = 0)
df_test = df_test.sample(n = len(df_test)).reset_index(drop = True)

In [14]:
print(df_train.shape)
print(df_test.shape)

(4784, 10)
(1142, 10)


Sparse Matrix with word count

In [15]:
# Tokenize and stem function
def tokenize_stem(text):
    stemmer = PorterStemmer()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

# Assuming df_train and df_test are already defined and contain the required columns

# Vectorization
vect = CountVectorizer(lowercase=True, max_features=1000, tokenizer=tokenize_stem, token_pattern=None)

# Fit the vectorizer
vect.fit(df_train['TEXT'].values)

# Transform the text data
X_train_tf = vect.transform(df_train['TEXT'].values)
X_test_tf = vect.transform(df_test['TEXT'].values)

# Extract target variables
y_train = df_train['READM_WITHIN_30']
y_test = df_test['READM_WITHIN_30']

In [16]:
metrics = ['roc_auc', 'accuracy', 'precision', 'recall']

## Random Forest

In [17]:


# Define the parameter grid
grid = {
    'max_depth': [2, 3, 4, 6],
    'min_samples_leaf': [1, 2, 5],
    'min_samples_split': [2, 5, 10],  # Corrected min_samples_split values
    'n_estimators': [100, 200, 300]
}

# Initialize the GridSearchCV with the RandomForestClassifier
model_rf = GridSearchCV(RandomForestClassifier(bootstrap=True), param_grid=grid, cv=5, n_jobs=-1)

# Fit the model to the training data
model_rf.fit(X_train_tf, y_train)

# Access the best parameters found by the grid search
best_params = model_rf.best_params_
print(best_params)


{'max_depth': 6, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


In [18]:
model_rf = RandomForestClassifier(
    bootstrap=True, max_depth=3, min_samples_leaf=1, min_samples_split=2, n_estimators=200
)
scores_rf = cross_validate(model_rf, X_train_tf, y_train, cv=5, scoring=metrics, return_train_score=True)

In [19]:
print('Set\t\t', 'Train\t\t', '\t Sd Train')
print('AUC\t\t', np.mean(scores_rf['train_roc_auc']), '\t', np.std(scores_rf['train_roc_auc']))
print('Accuracy\t', np.mean(scores_rf['train_accuracy']), '\t', np.std(scores_rf['train_accuracy']))
print('Precission\t', np.mean(scores_rf['train_precision']), '\t', np.std(scores_rf['train_precision']))
print('Recall\t\t', np.mean(scores_rf['train_precision']), '\t', np.std(scores_rf['train_precision']))

Set		 Train		 	 Sd Train
AUC		 0.6994328359468142 	 0.004046382916521413
Accuracy	 0.6433945657524944 	 0.002546900807092098
Precission	 0.654664853526655 	 0.004265564324130587
Recall		 0.654664853526655 	 0.004265564324130587


In [20]:
print('Set\t\t', 'Test\t\t', '\t Sd Test')
print('AUC\t\t', np.mean(scores_rf['test_roc_auc']), '\t', np.std(scores_rf['test_roc_auc']))
print('Accuracy\t', np.mean(scores_rf['test_accuracy']), '\t', np.std(scores_rf['test_accuracy']))
print('Precission\t', np.mean(scores_rf['test_precision']), '\t', np.std(scores_rf['test_precision']))
print('Recall\t\t', np.mean(scores_rf['test_recall']), '\t', np.std(scores_rf['test_recall']))

Set		 Test		 	 Sd Test
AUC		 0.6596509546686578 	 0.007848848705528533
Accuracy	 0.6208177577244091 	 0.010454471615483863
Precission	 0.6304081389081835 	 0.0138191193794036
Recall		 0.5852822739144486 	 0.011181659485025548
