In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix,f1_score,accuracy_score,matthews_corrcoef,roc_auc_score,roc_curve
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler,StandardScaler,RobustScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Reading the data
df = pd.read_csv('hospital_deaths_train.csv')
df.drop('recordid',axis=1,inplace=True)


In [None]:
X_train, X_test, y_train, y_test = train_test_split(mech_data, mech_clean , test_size=0.2,random_state=11)

In [3]:
mech_data = df[~df['MechVentLast8Hour'].isnull()]
data_copy = mech_data.copy()
mask = np.random.choice([True,False], size=mech_data.shape[0],p=[0.1,0.9])

mech_data.loc[mask,'MechVentLast8Hour'] = np.nan


In [7]:
mech_data.shape

(2060, 115)

In [4]:
nan_index = mech_data[mech_data['MechVentLast8Hour'].isnull()].index
nan_index

Int64Index([   3,    4,   26,   44,   67,   84,  107,  143,  167,  180,
            ...
            3065, 3069, 3120, 3145, 3146, 3161, 3176, 3188, 3226, 3240],
           dtype='int64', length=198)

In [5]:
values = data_copy.loc[nan_index, 'MechVentLast8Hour']
values

3       0.0
4       0.0
26      1.0
44      1.0
67      1.0
       ... 
3161    1.0
3176    0.0
3188    1.0
3226    1.0
3240    1.0
Name: MechVentLast8Hour, Length: 198, dtype: float64

In [8]:
impute = KNNImputer(n_neighbors=9)
x_impute = pd.DataFrame(impute.fit_transform(mech_data),columns=data_copy.columns,index=data_copy.index,copy=True)
x_impute

ValueError: Shape of passed values is (2060, 110), indices imply (2060, 115)

In [None]:
impute_mech = x_impute[108]
impute_mech

In [None]:
mech_data['MechVentLast8Hour'].isnull().sum()

In [None]:
X = df.drop('In-hospital_death',axis=1)
Y = df['In-hospital_death']

In [None]:
# Unique values of X
X.nunique().sum()

In [None]:
mech_clean = X['MechVentLast8Hour'].dropna()
mech_clean

In [None]:
X_train, X_test, y_train, y_test = train_test_split(mech_clean, Y , test_size=0.2,random_state=11)

Imputing the missing data

In [None]:
# Filling nans with KNN Imputer
impute = KNNImputer(n_neighbors=9)
x_impute = impute.fit_transform(X)

In [None]:
# the number of unique values in dataframe after imputing the missing data
pd.DataFrame(x_impute).nunique().sum() 
pd.DataFrame(x_impute).isnull().sum().sum()

Balancing the classes with SMOTE

In [None]:
sm = SMOTE(random_state=11,k_neighbors=5)
x_sm, y_sm = sm.fit_resample(x_impute, Y)

In [None]:
# the number of unique values in dataframe after balancing classes
pd.DataFrame(x_sm).nunique().sum()


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, Y , test_size=0.2,random_state=11)

In [None]:
X_test.isnull().sum().sum()

In [None]:
impute = KNNImputer(n_neighbors=9)
x_impute = impute.fit_transform(X_train)

In [None]:
sm = SMOTE(random_state=11,k_neighbors=5)
x_sm, y_sm = sm.fit_resample(x_impute, y_train)

In [None]:
X_test.fillna(X_test.mean(),inplace=True)

In [None]:
X_test.isnull().sum().sum()
X_train.isnull().sum().sum()

Random Forest

In [None]:
forest = RandomForestClassifier(n_estimators=140,random_state=11,oob_score=True,max_features='log2',min_samples_split=5,class_weight='balanced')
forest.fit(x_impute,y_train)
pred = forest.predict(X_test)


In [None]:
y_pred_proba = forest.predict_proba(X_test)[:, 1]  # Probabilities of positive class
print(np.unique(pred,return_counts=True))
# changinh the threshold to increase sensitivity
threshold = 0.25
pred = (y_pred_proba > threshold).astype(int)
print(np.unique(pred,return_counts=True))


In [None]:
forest.score(X_test,y_test),f1_score(y_test,pred),forest.oob_score_,accuracy_score(y_test,pred)

In [None]:
ax = plt.axes()
df_cm = (confusion_matrix(y_test, pred, normalize="true")*100).astype(int)

sns.heatmap(df_cm, annot=True, annot_kws={"size": 30}, fmt='d',cmap="Blues", ax = ax )
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# ROC curve for Random Forest
lr_probs = forest.predict_proba(X_test)
lr_probs = lr_probs[:, 1]
lr_auc = roc_auc_score(y_test, lr_probs)

lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

forest_accuracy = accuracy_score(y_test,pred)
# Calculate sensitivity 
forest_sensitivity = tp / (tp + fn)

# Calculate specificity
forest_specificity = tn / (tn + fp)

# Calculate AUC
forest_auc = roc_auc_score(y_test, pred)

# Calculate MCC
forest_mcc = matthews_corrcoef(y_test, pred)

# Print the results
print("Sensitivity:", forest_sensitivity)
print("Specificity:", forest_specificity)
print("AUC:", forest_auc)
print("MCC:", forest_mcc)

In [None]:
# Cross validation
params={'n_estimators': [140,150,200], 'max_features': ['log2','sqrt','None']}
choice = GridSearchCV(forest,params,cv=8).fit(X_train,y_train)
choice.best_params_

logistic regression

In [None]:
# Scaling the data
scaler = RobustScaler()
X_scaled = scaler.fit_transform(x_sm)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_sm , test_size=0.2, random_state=11)

In [None]:
classifier = LogisticRegression(C=0.5,penalty='l1', solver='saga')
classifier.fit(X_train, y_train)

In [None]:
X_train

In [None]:
pred = classifier.predict(X_test)

In [None]:
ax = plt.axes()
df_cm = (confusion_matrix(y_test, pred, normalize="true")*100).astype(int)

sns.heatmap(df_cm, annot=True, annot_kws={"size": 30}, fmt='d',cmap="Blues", ax = ax )
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
logistic_accuracy = accuracy_score(y_test,pred)
# Calculate sensitivity (recall or true positive rate)
logistic_sensitivity = tp / (tp + fn)

# Calculate specificity
logistic_specificity = tn / (tn + fp)

# Calculate AUC
logistic_auc = roc_auc_score(y_test, pred)

# Calculate MCC
logistic_mcc = matthews_corrcoef(y_test, pred)

# Print the results
# print("Sensitivity:", sensitivity)
# print("Specificity:", specificity)
# print("AUC:", auc)
# print("MCC:", mcc)

In [None]:
params = {'C': [0.1,0.5,1,2,5],'penalty': ['l1','l2','elasticnet'],'solver': ['saga','lgfgs']}
choice = GridSearchCV(classifier,params,cv=8).fit(X_train,y_train)
choice.best_params_

In [None]:
classifier.score(X_test,y_test),f1_score(y_test,pred),accuracy_score(y_test,pred)

SVM

In [None]:
# Scaling the data
scaler = RobustScaler()
X_scaled = scaler.fit_transform(x_sm)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_sm , test_size=0.2, random_state=11)

In [None]:
svm = SVC(kernel='rbf',C=6,gamma=0.02,probability=True).fit(X_train,y_train)
pred = svm.predict(X_test)

In [None]:
svm.score(X_test,y_test),f1_score(y_test,pred),accuracy_score(y_test,pred)

In [None]:
y_pred_proba = svm.predict_proba(X_test)[:, 1]  # Probabilities of positive class
print(np.unique(pred,return_counts=True))
# changinh the threshold to increase sensitivity
threshold = 0.5
pred = (y_pred_proba > threshold).astype(int)
print(np.unique(pred,return_counts=True))

In [None]:
ax = plt.axes()
df_cm = (confusion_matrix(y_test, pred, normalize="true")*100).astype(int)

sns.heatmap(df_cm, annot=True, annot_kws={"size": 30}, fmt='d',cmap="Blues", ax = ax )
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
# ROC curve for SVM
lr_probs = svm.predict_proba(X_test)
lr_probs = lr_probs[:, 1]
lr_auc = roc_auc_score(y_test, lr_probs)

lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_probs)

plt.plot(lr_fpr, lr_tpr, marker='.', label='Logistic')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

svm_accuracy = accuracy_score(y_test,pred)
# Calculate sensitivity (recall or true positive rate)
svm_sensitivity = tp / (tp + fn)

# Calculate specificity
svm_specificity = tn / (tn + fp)

# Calculate AUC
svm_auc = roc_auc_score(y_test, pred)

# Calculate MCC
svm_mcc = matthews_corrcoef(y_test, pred)

# Print the results
print("Sensitivity:", svm_sensitivity)
print("Specificity:", svm_specificity)
print("AUC:", svm_auc)
print("MCC:", svm_mcc)

In [None]:
params={'C': [6,7,9],'gamma': [0.02,0.05,0.09]}
choice = GridSearchCV(svm,params,cv=6).fit(X_train,y_train)
choice.best_params_

Naive Bias

In [None]:
X_train, X_test, y_train, y_test = train_test_split(x_sm, y_sm , test_size=0.2, random_state=11)

In [None]:
naive = GaussianNB(var_smoothing=0).fit(X_train,y_train)
X_train.shape,y_train.shape

In [None]:
pred = naive.predict(X_test)

In [None]:
ax = plt.axes()
df_cm = (confusion_matrix(y_test, pred, normalize="true")*100).astype(int)

sns.heatmap(df_cm, annot=True, annot_kws={"size": 30}, fmt='d',cmap="Blues", ax = ax )
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
naive_accuracy = accuracy_score(y_test,pred)
# Calculate sensitivity (recall or true positive rate)
naive_sensitivity = tp / (tp + fn)

# Calculate specificity
naive_specificity = tn / (tn + fp)

# Calculate AUC
naive_auc = roc_auc_score(y_test, pred)

# Calculate MCC
naive_mcc = matthews_corrcoef(y_test, pred)

# Print the results
# print("Sensitivity:", sensitivity)
# print("Specificity:", specificity)
# print("AUC:", auc)
# print("MCC:", mcc)

In [None]:
params = {
    'var_smoothing': [-1,0,1e-100,1e-10,1e-9] # Example hyperparameter values for var_smoothing
}



choice = GridSearchCV(naive,params,cv=5).fit(X_train,y_train)


# Print the best hyperparameter values and corresponding score
print("Best Hyperparameters: ", choice.best_params_)
print("Best Score: ", choice.best_score_)

LDA

In [None]:
lda = LinearDiscriminantAnalysis(solver='svd')

lda.fit(X_train, y_train)

pred = lda.predict(X_test)

In [None]:
ax = plt.axes()
df_cm = (confusion_matrix(y_test, pred, normalize="true")*100).astype(int)

sns.heatmap(df_cm, annot=True, annot_kws={"size": 30}, fmt='d',cmap="Blues", ax = ax )
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

# Calculate sensitivity (recall or true positive rate)
sensitivity = tp / (tp + fn)

# Calculate specificity
specificity = tn / (tn + fp)

# Calculate AUC
auc = roc_auc_score(y_test, pred)

# Calculate MCC
mcc = matthews_corrcoef(y_test, pred)

# Print the results
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)
print("AUC:", auc)
print("MCC:", mcc)

In [None]:
param_grid = {'solver': ['svd', 'lsqr', 'eigen'], 'n_components': [None, 1, 2, 3]}

grid_search = GridSearchCV(lda, param_grid, cv=5).fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Parameters: ", best_params)

QDA

In [None]:
qda = QuadraticDiscriminantAnalysis()

qda.fit(X_train, y_train)

pred = qda.predict(X_test)

In [None]:
ax = plt.axes()
df_cm = (confusion_matrix(y_test, pred, normalize="true")*100).astype(int)

sns.heatmap(df_cm, annot=True, annot_kws={"size": 30}, fmt='d',cmap="Blues", ax = ax )
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()
accuracy_score(y_test,pred)
# Calculate sensitivity (recall or true positive rate)
qda_qda_sensitivity = tp / (tp + fn)

# Calculate specificity
qda_specificity = tn / (tn + fp)

# Calculate AUC
qda_auc = roc_auc_score(y_test, pred)

# Calculate MCC
qda_mcc = matthews_corrcoef(y_test, pred)

# Print the results
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)
print("AUC:", auc)
print("MCC:", mcc)

In [None]:
param_grid = {'reg_param': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5]}
choice = GridSearchCV(qda,param_grid,cv=10).fit(X_train,y_train)
choice.best_params_

KNN

In [None]:
# Scaling the data
scaler = RobustScaler()
X_scaled = scaler.fit_transform(x_impute)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, Y , test_size=0.2, random_state=11)


In [None]:
knn = KNeighborsClassifier(n_neighbors=7).fit(X_train,y_train)

In [None]:
pred = knn.predict(X_test)
X_train.shape

In [None]:
ax = plt.axes()
df_cm = (confusion_matrix(y_test, pred, normalize="true")*100).astype(int)

sns.heatmap(df_cm, annot=True, annot_kws={"size": 30}, fmt='d',cmap="Blues", ax = ax )
ax.set_title('Confusion Matrix')
plt.show()

In [None]:
tn, fp, fn, tp = confusion_matrix(y_test, pred).ravel()

# Calculate sensitivity (recall or true positive rate)
sensitivity = tp / (tp + fn)

# Calculate specificity
specificity = tn / (tn + fp)

# Calculate AUC
auc = roc_auc_score(y_test, pred)

# Calculate MCC
mcc = matthews_corrcoef(y_test, pred)

# Print the results
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)
print("AUC:", auc)
print("MCC:", mcc)

In [None]:
params = {'n_neighbors': [1,2,3,4,5,6,7,8,9]}
choice = GridSearchCV(knn,params,cv=10).fit(X_train,y_train)
choice.best_params_


In [None]:
import matplotlib.pyplot as plt

# Define the classifiers and their corresponding metric values
classifiers = ['','Random Forest', 'SVM', 'Logistic Regression', 'Naive Bayes']
AUC = ['AUC', forest_auc, svm_auc, logistic_auc, naive_auc]
MCC = ['MCC', forest_mcc, svm_mcc, logistic_mcc, naive_mcc]
Sensitivity = ['Sensitivity', forest_sensitivity, svm_sensitivity, logistic_sensitivity, naive_sensitivity]
Specificity = ['Specificity', forest_specificity, svm_specificity, logistic_specificity, naive_specificity ]
Accuracy = ['Accuracy', forest_accuracy, svm_accuracy, logistic_accuracy, naive_accuracy]

# Create the table as a matplotlib figure
fig, ax = plt.subplots()

# Define the table data as a list of lists
table_data = [
    classifiers,
    AUC,
    MCC,
    Sensitivity,
    Specificity,
    Accuracy
]





# Transpose the table data to have classifiers as columns
table_data = list(map(list, zip(*table_data)))


df = pd.DataFrame(table_data)
df = df.rename(columns=df.iloc[0]).loc[1:]



# # Set the index column as the index of the dataframe
df.set_index('', inplace=True)

# Convert numeric columns to numeric data type
df.iloc[:, 1:] = df.iloc[:, 1:].apply(pd.to_numeric)

# Sort the dataframe by all numeric columns in descending order
df = df.sort_values(by=list(df.select_dtypes(exclude='object').columns), axis=0, ascending=False)


df=df.reset_index()






# Create the table as a matplotlib figure
# fig, ax = plt.subplots()

# Create the table
table = ax.table(cellText=df.values, colLabels=df.columns, loc='center')

# Set table properties
table.auto_set_font_size(False)
table.set_fontsize(40)
table.scale(10, 10)  # Adjust table size as desired

# Hide table axes
ax.axis('off')

# Save the table as an image
plt.savefig('table.png', bbox_inches='tight')


