In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [2]:
# Location of the cleaned TSR_1 table, relative to this notebook.
csv_path = os.path.join(
    "..", "..", "data", "LINKED_DATA", "TSR_EHR", "TSR_1_CLEANED.csv"
)
tsr_1 = pd.read_csv(filepath_or_buffer=csv_path)
# Show the first five rows as a quick sanity check of the load.
tsr_1.head(n=5)

Unnamed: 0,height_nm,weight_nm,edu_id,pro_id,opc_id,ih_fl,ivtpamg_nm,hospitalised_time,nivtpa_id,nivtpa1_fl,...,nihs_6br_out,nihs_7_out,nihs_8_out,nihs_9_out,nihs_10_out,nihs_11_out,total_out,SexName,Age,mrs_tx_1
0,150.0,49.0,2,1,3,0,0.0,16.0,1,0,...,0,0,0,0,0,0,8,0,66.0,4
1,153.0,62.0,3,1,3,0,0.0,8.0,0,999,...,0,1,1,0,1,0,4,0,67.0,1
2,152.0,62.0,3,1,2,0,0.0,4.0,0,999,...,0,0,1,0,0,0,1,0,69.0,1
3,148.0,56.0,2,1,2,0,0.0,5.0,0,999,...,0,0,1,0,0,0,2,0,71.0,0
4,152.0,56.0,4,1,2,0,0.0,3.0,1,0,...,0,0,0,0,0,0,0,0,71.0,0


In [3]:
# Feature matrix: every column except the target `mrs_tx_1`.
tsr_1_features = tsr_1.drop(columns=["mrs_tx_1"])
# NOTE(review): "Unnamed: 0" looks like the row index that was written out with the
# CSV (it counts 0, 1, 2, ... in the preview above). It is not a measured variable,
# so it is removed instead of being fed to the models as a feature.
# `errors="ignore"` keeps the cell working if the file is re-exported without it.
tsr_1_features = tsr_1_features.drop(columns=["Unnamed: 0"], errors="ignore")
# Encode the "N"/"Y" flags as 0/1 in a single pass, then convert everything to a
# float64 NumPy array for scikit-learn / XGBoost.
tsr_1_features = tsr_1_features.replace({"N": 0, "Y": 1})
tsr_1_input = tsr_1_features.to_numpy(dtype="float64")

# 6 classes

In [4]:
# Target vector: the `mrs_tx_1` column as a float64 NumPy array.
# `copy=True` keeps the array independent of the `tsr_1` frame, so later edits to
# the labels cannot leak back into the loaded data.
tsr_1_output = tsr_1["mrs_tx_1"].to_numpy(dtype="float64", copy=True)

## SVM

In [5]:
# Linear SVM (L2 penalty, squared hinge loss, one-vs-rest) wrapped in a
# probability calibrator so that predict_proba is available.
svc = CalibratedClassifierCV(
    LinearSVC(
        C=1,
        penalty="l2",
        loss="squared_hinge",
        dual=False,
        multi_class="ovr",
        random_state=19,
    )
)
# Accuracy on each of 10 cross-validation folds, followed by mean and std.
svc_scores = cross_val_score(
    estimator=svc, X=tsr_1_input, y=tsr_1_output, scoring="accuracy", cv=10
)
print(svc_scores)
print(np.mean(svc_scores), np.std(svc_scores))



[0.5437037  0.61185185 0.59703704 0.5837037  0.59111111 0.59851852
 0.64888889 0.61481481 0.64740741 0.50445104]
0.5941488075612705 0.04160480608794081


In [6]:
# Fit on the full data set and take class probabilities for those same rows
# (these are in-sample probabilities, not out-of-fold ones).
svc_predict = svc.fit(tsr_1_input, tsr_1_output).predict_proba(tsr_1_input)
print(svc_predict)

[[2.31323920e-03 5.64950227e-02 1.80969857e-02 1.44723624e-01
  6.87404006e-01 9.09671225e-02]
 [1.57908339e-01 4.70547833e-01 2.45126360e-01 5.29856972e-02
  6.78717876e-02 5.55998369e-03]
 [1.55776108e-01 5.38575822e-01 1.87319876e-01 4.72654617e-02
  6.53425242e-02 5.72020765e-03]
 ...
 [1.24693229e-04 2.06883688e-03 1.12793003e-02 1.73281790e-02
  3.28468421e-02 9.36352148e-01]
 [8.12034271e-05 3.26782484e-03 2.46202607e-03 1.10798464e-01
  2.87746559e-01 5.95643923e-01]
 [5.45186602e-04 1.99804931e-02 1.43399335e-02 1.03844678e-01
  3.55589796e-01 5.05699913e-01]]


In [7]:
# Class predictions produced by 10-fold cross-validation, tabulated against the
# true labels.
svc_pred = cross_val_predict(estimator=svc, X=tsr_1_input, y=tsr_1_output, cv=10)
confusion_matrix(y_true=tsr_1_output, y_pred=svc_pred)



array([[ 272,  220,   25,   28,   11,    2],
       [ 117,  881,  170,   53,   45,    3],
       [  18,  340,  349,  107,  113,    5],
       [   8,   62,  161,  229,  370,   25],
       [   1,   29,   44,  110,  983,  333],
       [   0,    8,   11,   16,  304, 1296]], dtype=int64)

## RF

In [8]:
# Random forest (15 bootstrapped gini trees, 80% of the features per split)
# wrapped in a probability calibrator.
rf = CalibratedClassifierCV(
    RandomForestClassifier(
        n_estimators=15,
        criterion="gini",
        max_features=0.8,
        bootstrap=True,
        random_state=19,
    )
)
# Accuracy on each of 10 cross-validation folds, followed by mean and std.
rf_scores = cross_val_score(
    estimator=rf, X=tsr_1_input, y=tsr_1_output, scoring="accuracy", cv=10
)
print(rf_scores)
print(np.mean(rf_scores), np.std(rf_scores))

[0.69481481 0.76592593 0.72296296 0.7037037  0.74074074 0.75555556
 0.80148148 0.79407407 0.78222222 0.64094955]
0.7402431036377622 0.047813339288996774


In [9]:
# Fit on the full data set and take class probabilities for those same rows
# (these are in-sample probabilities, not out-of-fold ones).
rf_predict = rf.fit(tsr_1_input, tsr_1_output).predict_proba(tsr_1_input)
print(rf_predict)

[[0.02161392 0.03984294 0.03374814 0.06501049 0.80230888 0.03747563]
 [0.03740463 0.82121215 0.04082079 0.03206861 0.04079358 0.02770024]
 [0.02284936 0.84885682 0.03493287 0.03083941 0.03628596 0.02623558]
 ...
 [0.0197908  0.03658606 0.03069316 0.02950689 0.03468654 0.84873654]
 [0.0197908  0.03658606 0.03069316 0.02950689 0.03468654 0.84873654]
 [0.02123273 0.03912224 0.03308034 0.03169159 0.58416506 0.29070806]]


In [10]:
# Class predictions produced by 10-fold cross-validation, tabulated against the
# true labels.
rf_pred = cross_val_predict(estimator=rf, X=tsr_1_input, y=tsr_1_output, cv=10)
confusion_matrix(y_true=tsr_1_output, y_pred=rf_pred)

array([[ 348,  149,   41,   15,    4,    1],
       [  79,  848,  271,   47,   17,    7],
       [   5,   92,  646,  155,   31,    3],
       [   0,   15,   63,  583,  183,   11],
       [   2,   13,   21,  104, 1204,  156],
       [   1,    5,   10,   17,  235, 1367]], dtype=int64)

## XGBoost

In [11]:
# Gradient-boosted trees (multi-class soft-probability objective) wrapped in a
# probability calibrator.
xgb = CalibratedClassifierCV(
    XGBClassifier(
        booster="gbtree",
        objective="multi:softprob",
        eval_metric="auc",
        use_label_encoder=False,
        random_state=19,
    )
)
# `scoring` is now spelled out so this cell matches the SVM and RF cells. The
# result is unchanged: without it cross_val_score falls back to the classifier's
# own `score`, which is accuracy.
xgb_scores = cross_val_score(xgb, tsr_1_input, tsr_1_output, cv=10, scoring="accuracy")
print(xgb_scores)
print(xgb_scores.mean(), xgb_scores.std())

[0.72296296 0.76888889 0.72740741 0.70074074 0.74222222 0.75259259
 0.78814815 0.78962963 0.78814815 0.62017804]
0.7400918782283769 0.04944189398432775


In [12]:
# Fit on the full data set and take class probabilities for those same rows
# (these are in-sample probabilities, not out-of-fold ones).
xgb_predict = xgb.fit(tsr_1_input, tsr_1_output).predict_proba(tsr_1_input)
print(xgb_predict)

[[0.0275015  0.05208126 0.03855417 0.0386787  0.80867637 0.034508  ]
 [0.0289722  0.81297651 0.03895788 0.03819483 0.04622477 0.03467381]
 [0.03112147 0.81009014 0.03908046 0.03833065 0.04658651 0.03479077]
 ...
 [0.02530468 0.04752478 0.03552796 0.03503736 0.04285983 0.81374539]
 [0.02530961 0.04753619 0.03553634 0.035038   0.04318393 0.81339592]
 [0.02789582 0.05191449 0.0391564  0.04082976 0.64052162 0.19968191]]


In [13]:
# Class predictions produced by 10-fold cross-validation, tabulated against the
# true labels.
xgb_pred = cross_val_predict(estimator=xgb, X=tsr_1_input, y=tsr_1_output, cv=10)
confusion_matrix(y_true=tsr_1_output, y_pred=xgb_pred)

array([[ 349,  141,   48,   15,    4,    1],
       [  76,  869,  252,   49,   18,    5],
       [   6,  123,  609,  152,   40,    2],
       [   0,   17,   66,  582,  179,   11],
       [   2,   11,   26,  107, 1195,  159],
       [   1,    5,   12,   14,  212, 1391]], dtype=int64)

# 2 classes

In [14]:
# Collapse the six labels into two: 0, 1, 2 -> 0 and 3, 4, 5 -> 1.
#
# The masks are computed from a fresh copy of the original `mrs_tx_1` column
# rather than from `tsr_1_output` itself. The previous in-place version was not
# idempotent: on a second run the 1s it had just written matched the first mask
# (`== 1`) and every label ended up as 0. Building from the source column also
# keeps the six-class labels available in `tsr_1_output_6class`.
tsr_1_output_6class = tsr_1["mrs_tx_1"].to_numpy(dtype="float64", copy=True)
tsr_1_output = tsr_1_output_6class.copy()
tsr_1_output[np.isin(tsr_1_output_6class, [0, 1, 2])] = 0
tsr_1_output[np.isin(tsr_1_output_6class, [3, 4, 5])] = 1

## SVM

In [15]:
# Same calibrated linear SVM configuration as before, now scored on the
# two-class target.
svc2 = CalibratedClassifierCV(
    LinearSVC(
        C=1,
        penalty="l2",
        loss="squared_hinge",
        dual=False,
        multi_class="ovr",
        random_state=19,
    )
)
# Accuracy on each of 10 cross-validation folds, followed by mean and std.
svc_scores2 = cross_val_score(
    estimator=svc2, X=tsr_1_input, y=tsr_1_output, scoring="accuracy", cv=10
)
print(svc_scores2)
print(np.mean(svc_scores2), np.std(svc_scores2))



[0.90222222 0.91851852 0.92740741 0.89777778 0.93481481 0.87111111
 0.91703704 0.91851852 0.95111111 0.81008902]
0.9048607539290032 0.037741057245826076


In [16]:
# Fit on the full data set and take class probabilities for those same rows
# (these are in-sample probabilities, not out-of-fold ones).
svc_predict2 = svc2.fit(tsr_1_input, tsr_1_output).predict_proba(tsr_1_input)
print(svc_predict2)

[[2.04827188e-02 9.79517281e-01]
 [9.51035202e-01 4.89647978e-02]
 [9.70868093e-01 2.91319065e-02]
 ...
 [9.14343149e-04 9.99085657e-01]
 [1.17378180e-04 9.99882622e-01]
 [1.74508862e-02 9.82549114e-01]]


In [17]:
# Class predictions produced by 10-fold cross-validation, tabulated against the
# true two-class labels.
svc_pred2 = cross_val_predict(estimator=svc2, X=tsr_1_input, y=tsr_1_output, cv=10)
confusion_matrix(y_true=tsr_1_output, y_pred=svc_pred2)



array([[2423,  336],
       [ 306, 3684]], dtype=int64)

## RF

In [18]:
# Same calibrated random forest configuration as before, now scored on the
# two-class target.
rf2 = CalibratedClassifierCV(
    RandomForestClassifier(
        n_estimators=15,
        criterion="gini",
        max_features=0.8,
        bootstrap=True,
        random_state=19,
    )
)
# Accuracy on each of 10 cross-validation folds, followed by mean and std.
rf_scores2 = cross_val_score(
    estimator=rf2, X=tsr_1_input, y=tsr_1_output, scoring="accuracy", cv=10
)
print(rf_scores2)
print(np.mean(rf_scores2), np.std(rf_scores2))

[0.91851852 0.94518519 0.95703704 0.92740741 0.95555556 0.92
 0.9437037  0.93481481 0.95555556 0.89020772]
0.9347985492911309 0.020175354104742166


In [19]:
# Fit on the full data set and take class probabilities for those same rows
# (these are in-sample probabilities, not out-of-fold ones).
rf_predict2 = rf2.fit(tsr_1_input, tsr_1_output).predict_proba(tsr_1_input)
print(rf_predict2)

[[0.02953139 0.97046861]
 [0.9314039  0.0685961 ]
 [0.9708441  0.0291559 ]
 ...
 [0.02702468 0.97297532]
 [0.02702468 0.97297532]
 [0.06925199 0.93074801]]


In [20]:
# Class predictions produced by 10-fold cross-validation, tabulated against the
# true two-class labels.
rf_pred2 = cross_val_predict(estimator=rf2, X=tsr_1_input, y=tsr_1_output, cv=10)
confusion_matrix(y_true=tsr_1_output, y_pred=rf_pred2)

array([[2461,  298],
       [ 142, 3848]], dtype=int64)

## XGBoost

In [21]:
# Gradient-boosted trees (binary logistic objective) wrapped in a probability
# calibrator, scored on the two-class target.
xgb2 = CalibratedClassifierCV(
    XGBClassifier(
        booster="gbtree",
        objective="binary:logistic",
        eval_metric="auc",
        use_label_encoder=False,
        random_state=19,
    )
)
# `scoring` is now spelled out so this cell matches the SVM and RF cells. The
# result is unchanged: without it cross_val_score falls back to the classifier's
# own `score`, which is accuracy.
xgb_scores2 = cross_val_score(xgb2, tsr_1_input, tsr_1_output, cv=10, scoring="accuracy")
print(xgb_scores2)
print(xgb_scores2.mean(), xgb_scores2.std())

[0.91703704 0.93777778 0.94666667 0.92296296 0.95111111 0.89925926
 0.9437037  0.93777778 0.95407407 0.87240356]
0.928277393120123 0.02462117885649458


In [22]:
# Fit on the full data set and take class probabilities for those same rows
# (these are in-sample probabilities, not out-of-fold ones).
xgb_predict2 = xgb2.fit(tsr_1_input, tsr_1_output).predict_proba(tsr_1_input)
print(xgb_predict2)

[[0.04257464 0.95742536]
 [0.95331117 0.04668883]
 [0.95336592 0.04663408]
 ...
 [0.04213203 0.95786797]
 [0.04213833 0.95786167]
 [0.042196   0.957804  ]]


In [23]:
# Class predictions produced by 10-fold cross-validation, tabulated against the
# true two-class labels.
xgb_pred2 = cross_val_predict(estimator=xgb2, X=tsr_1_input, y=tsr_1_output, cv=10)
confusion_matrix(y_true=tsr_1_output, y_pred=xgb_pred2)

array([[2474,  285],
       [ 199, 3791]], dtype=int64)

# Summary

## Mean & Std

In [24]:
def _cv_summary(scores):
    """Pack cross-validation scores into one summary vector.

    Parameters
    ----------
    scores : array-like of float
        Per-fold scores as returned by ``cross_val_score``.

    Returns
    -------
    numpy.ndarray
        ``[mean, std, fold_1, ..., fold_k]`` as float64. With 10 folds this is
        the same 12-element array the hand-indexed version produced, but the
        fold count is no longer hard-coded.
    """
    scores = np.asarray(scores, dtype="float64")
    return np.concatenate(([scores.mean(), scores.std()], scores))


# Six-class models.
svc_mean = _cv_summary(svc_scores)
rf_mean = _cv_summary(rf_scores)
xgb_mean = _cv_summary(xgb_scores)
# Two-class models.
svc_mean2 = _cv_summary(svc_scores2)
rf_mean2 = _cv_summary(rf_scores2)
xgb_mean2 = _cv_summary(xgb_scores2)

In [25]:
# One column per model, one row per statistic (mean, std, then the ten fold scores).
tsr_1_mean = pd.DataFrame(
    {
        "svc": svc_mean,
        "rf": rf_mean,
        "xgb": xgb_mean,
        "svc2": svc_mean2,
        "rf2": rf_mean2,
        "xgb2": xgb_mean2,
    },
    index=[
        "Mean", "Std",
        "mean_1", "mean_2", "mean_3", "mean_4", "mean_5",
        "mean_6", "mean_7", "mean_8", "mean_9", "mean_10",
    ],
)

In [26]:
# Write the score summary next to the notebook, keeping the row labels as the
# first CSV column.
csv_save = os.path.join(".", "tsr_1_mean.csv")
tsr_1_mean.to_csv(path_or_buf=csv_save, index=True)

## Predicted Probability

In [27]:
# Split each probability matrix into one 1-D array per class column.

# Six-class models: columns 0..5.
(svc_predict_0, svc_predict_1, svc_predict_2,
 svc_predict_3, svc_predict_4, svc_predict_5) = (svc_predict[:, k] for k in range(6))

(rf_predict_0, rf_predict_1, rf_predict_2,
 rf_predict_3, rf_predict_4, rf_predict_5) = (rf_predict[:, k] for k in range(6))

(xgb_predict_0, xgb_predict_1, xgb_predict_2,
 xgb_predict_3, xgb_predict_4, xgb_predict_5) = (xgb_predict[:, k] for k in range(6))

# Two-class models: columns 0..1.
svc_predict2_0, svc_predict2_1 = (svc_predict2[:, k] for k in range(2))

rf_predict2_0, rf_predict2_1 = (rf_predict2[:, k] for k in range(2))

xgb_predict2_0, xgb_predict2_1 = (xgb_predict2[:, k] for k in range(2))

In [28]:
# Assemble all per-class probability vectors side by side: one row per sample,
# one named column per (model, class) pair.
tsr_1_pred_prob = pd.DataFrame(
    np.column_stack([
        svc_predict_0, svc_predict_1, svc_predict_2,
        svc_predict_3, svc_predict_4, svc_predict_5,
        rf_predict_0, rf_predict_1, rf_predict_2,
        rf_predict_3, rf_predict_4, rf_predict_5,
        xgb_predict_0, xgb_predict_1, xgb_predict_2,
        xgb_predict_3, xgb_predict_4, xgb_predict_5,
        svc_predict2_0, svc_predict2_1,
        rf_predict2_0, rf_predict2_1,
        xgb_predict2_0, xgb_predict2_1,
    ]),
    columns=[
        "svc_predict_0", "svc_predict_1", "svc_predict_2",
        "svc_predict_3", "svc_predict_4", "svc_predict_5",
        "rf_predict_0", "rf_predict_1", "rf_predict_2",
        "rf_predict_3", "rf_predict_4", "rf_predict_5",
        "xgb_predict_0", "xgb_predict_1", "xgb_predict_2",
        "xgb_predict_3", "xgb_predict_4", "xgb_predict_5",
        "svc_predict2_0", "svc_predict2_1",
        "rf_predict2_0", "rf_predict2_1",
        "xgb_predict2_0", "xgb_predict2_1",
    ],
)

In [29]:
# Write the probability table next to the notebook, without the row index.
csv_save2 = os.path.join(".", "tsr_1_pred_prob.csv")
tsr_1_pred_prob.to_csv(path_or_buf=csv_save2, index=False)