In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [2]:
# Path of the cleaned TSR_1 extract, relative to the current working directory.
csv_path = os.path.join(
    "..", "..", "data", "LINKED_DATA", "TSR_EHR", "TSR_1_CLEANED.csv"
)
tsr_1 = pd.read_csv(filepath_or_buffer=csv_path)
# Show the first five rows as a quick check of the load.
tsr_1.head(n=5)

Unnamed: 0,height_nm,weight_nm,edu_id,pro_id,opc_id,ih_fl,ivtpamg_nm,hospitalised_time,nivtpa_id,nivtpa1_fl,...,nihs_6br_out,nihs_7_out,nihs_8_out,nihs_9_out,nihs_10_out,nihs_11_out,total_out,SexName,Age,mrs_tx_1
0,150.0,49.0,2,1,3,0,0.0,16.0,1,0,...,0,0,0,0,0,0,8,0,66.0,4
1,153.0,62.0,3,1,3,0,0.0,8.0,0,999,...,0,1,1,0,1,0,4,0,67.0,1
2,152.0,62.0,3,1,2,0,0.0,4.0,0,999,...,0,0,1,0,0,0,1,0,69.0,1
3,148.0,56.0,2,1,2,0,0.0,5.0,0,999,...,0,0,1,0,0,0,2,0,71.0,0
4,152.0,56.0,4,1,2,0,0.0,3.0,1,0,...,0,0,0,0,0,0,0,0,71.0,0


In [3]:
# Feature matrix: every column of the loaded frame except the target "mrs_tx_1".
tsr_1_input = tsr_1.drop(columns=["mrs_tx_1"])
# Recode the "N"/"Y" string flags as 0/1 so the frame can be cast to float.
tsr_1_input[tsr_1_input == "N"] = 0
tsr_1_input[tsr_1_input == "Y"] = 1
# Cast to float64 and copy the values out of pandas into a plain NumPy array.
tsr_1_input = np.array(tsr_1_input.astype("float64").values)

# 6 classes

In [4]:
# Target vector: the "mrs_tx_1" column as a float64 NumPy array.
tsr_1_output = np.array(tsr_1["mrs_tx_1"].astype("float64").values)

## SVM

In [5]:
# Linear support-vector regressor, scored with 10-fold cross-validated R^2.
svr = LinearSVR(
    C=1,
    epsilon=0,
    loss="squared_epsilon_insensitive",
    dual=False,
    random_state=19,
)
svr_scores = cross_val_score(svr, tsr_1_input, tsr_1_output, scoring="r2", cv=10)
print(svr_scores)
print("Mean of R^2:", np.mean(svr_scores))
print("Std of R^2:", np.std(svr_scores))

[0.75819834 0.81292278 0.81497893 0.81372612 0.81603806 0.84767392
 0.79382794 0.80927838 0.72774844 0.66216536]
Mean of R^2: 0.7856558275219963
Std of R^2: 0.0520984455211836


In [6]:
# Hold-out accuracy of the SVR over ten 70/30 splits (split seeds 0-9).
# A prediction counts as correct when, after rounding to the nearest integer,
# it equals the true score.
acc_svr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        tsr_1_input, tsr_1_output, test_size=0.3, random_state=i
    )
    svr.fit(X_train, y_train)
    # Round the continuous regression output to the nearest integer label.
    svr_predict = np.round(svr.predict(X_test))
    accuracy = (y_test == svr_predict).sum() / len(svr_predict)
    acc_svr.append(accuracy)

# The labels name the model evaluated in this cell (they used to say "RF").
print("Accuracy of SVR:", acc_svr)
# Divide by the number of collected splits instead of a hard-coded 10.
print("Mean of Accuracy of SVR:", sum(acc_svr) / len(acc_svr))

Accuracy of RF: [0.5930864197530864, 0.5851851851851851, 0.5960493827160493, 0.614320987654321, 0.5930864197530864, 0.6049382716049383, 0.5975308641975309, 0.605925925925926, 0.5965432098765432, 0.5911111111111111]
Mean of Accuracy of RF: 0.5977777777777777


In [7]:
# 10-fold cross-validated SVR predictions, rounded to the nearest integer so
# they can be tabulated against the true scores.
svr_pred = np.round(cross_val_predict(svr, tsr_1_input, tsr_1_output, cv=10))
confusion_matrix(y_true=tsr_1_output, y_pred=svr_pred)

array([[  23,  452,   69,   13,    0,    1,    0],
       [   2,  963,  254,   40,    8,    2,    0],
       [   1,  318,  472,  128,   12,    1,    0],
       [   0,   24,  239,  494,   94,    4,    0],
       [   0,   13,   44,  347,  861,  235,    0],
       [   0,    8,   22,   35,  361, 1177,   32],
       [   0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [8]:
# Clamp the rounded predictions into the 0-5 label range before tabulating.
# np.clip handles every out-of-range value (e.g. -2 or 7), not only the exact
# values -1 and 6, so no stray extra class can appear in the matrix.
svr_pred = np.clip(svr_pred, 0, 5)
confusion_matrix(tsr_1_output, svr_pred)

array([[  23,  452,   69,   13,    0,    1],
       [   2,  963,  254,   40,    8,    2],
       [   1,  318,  472,  128,   12,    1],
       [   0,   24,  239,  494,   94,    4],
       [   0,   13,   44,  347,  861,  235],
       [   0,    8,   22,   35,  361, 1209]], dtype=int64)

## RF

In [9]:
# Random-forest regressor, scored with 10-fold cross-validated R^2.
# `criterion` is deliberately left at its default, which is mean squared error.
# The explicit spelling "mse" was deprecated in scikit-learn 1.0 and removed in
# 1.2 (renamed "squared_error"); omitting it selects the same criterion on
# both old and new releases.
rfr = RandomForestRegressor(
    n_estimators=15,
    bootstrap=True,
    max_features=0.8,
    random_state=19,
)
rfr_scores = cross_val_score(rfr, tsr_1_input, tsr_1_output, cv=10, scoring="r2")
print(rfr_scores)
print("Mean of R^2:", rfr_scores.mean())
print("Std of R^2:", rfr_scores.std())

[0.80253601 0.85442144 0.84661287 0.83234409 0.854438   0.87123276
 0.82762541 0.88558175 0.78763589 0.80643935]
Mean of R^2: 0.8368867559211093
Std of R^2: 0.029832924726383228


In [10]:
# Hold-out accuracy of the random forest over ten 70/30 splits (seeds 0-9);
# predictions are rounded to the nearest integer before comparison.
acc_rfr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        tsr_1_input, tsr_1_output, random_state=i, test_size=0.3
    )
    rfr.fit(X_train, y_train)
    rfr_predict = np.round(rfr.predict(X_test))
    # Fraction of test rows whose rounded prediction matches the true score.
    accuracy = np.mean(y_test == rfr_predict)
    acc_rfr.append(accuracy)

print("Accuracy of RF:", acc_rfr)
print("Mean of Accuracy of RF:", sum(acc_rfr) / len(acc_rfr))

Accuracy of RF: [0.7175308641975309, 0.7219753086419753, 0.7096296296296296, 0.7185185185185186, 0.7288888888888889, 0.7234567901234568, 0.7190123456790124, 0.7234567901234568, 0.7239506172839506, 0.731358024691358]
Mean of Accuracy of RF: 0.7217777777777779


In [11]:
# 10-fold cross-validated random-forest predictions, rounded to the nearest
# integer and tabulated against the true scores.
rfr_pred = np.round(cross_val_predict(rfr, tsr_1_input, tsr_1_output, cv=10))
confusion_matrix(y_true=tsr_1_output, y_pred=rfr_pred)

array([[ 340,  172,   29,   16,    0,    1],
       [  74,  881,  258,   43,   11,    2],
       [   3,  117,  648,  146,   18,    0],
       [   0,   16,  166,  515,  151,    7],
       [   0,   14,   29,  222, 1105,  130],
       [   0,    4,   13,   31,  291, 1296]], dtype=int64)

## XGBoost

In [12]:
# Gradient-boosted regressor (squared-error objective), scored with 10-fold
# cross-validated R^2.
xgbr = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=15,
    random_state=19,
)
xgbr_scores = cross_val_score(xgbr, tsr_1_input, tsr_1_output, scoring="r2", cv=10)
print(xgbr_scores)
print("Mean of R^2:", np.mean(xgbr_scores))
print("Std of R^2:", np.std(xgbr_scores))

[0.81509538 0.85465433 0.85629336 0.84323481 0.85948638 0.87759867
 0.82663422 0.88904219 0.79003615 0.8080127 ]
Mean of R^2: 0.8420088202986703
Std of R^2: 0.029945460047586483


In [13]:
# Hold-out accuracy of the XGBoost regressor over ten 70/30 splits (split
# seeds 0-9). A prediction counts as correct when, after rounding to the
# nearest integer, it equals the true score.
acc_xgbr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        tsr_1_input, tsr_1_output, test_size=0.3, random_state=i
    )
    xgbr.fit(X_train, y_train)
    # Round the continuous regression output to the nearest integer label.
    xgbr_predict = np.round(xgbr.predict(X_test))
    accuracy = (y_test == xgbr_predict).sum() / len(xgbr_predict)
    acc_xgbr.append(accuracy)

# The labels name the model evaluated in this cell (they used to say "RF").
print("Accuracy of XGBoost:", acc_xgbr)
# Divide by the number of collected splits instead of a hard-coded 10.
print("Mean of Accuracy of XGBoost:", sum(acc_xgbr) / len(acc_xgbr))

Accuracy of RF: [0.7125925925925926, 0.7185185185185186, 0.7234567901234568, 0.7140740740740741, 0.7195061728395061, 0.7269135802469135, 0.7219753086419753, 0.731358024691358, 0.7224691358024692, 0.7229629629629629]
Mean of Accuracy of RF: 0.7213827160493826


In [14]:
# 10-fold cross-validated XGBoost predictions, rounded to the nearest integer
# and tabulated against the true scores.
xgbr_pred = np.round(cross_val_predict(xgbr, tsr_1_input, tsr_1_output, cv=10))
confusion_matrix(y_true=tsr_1_output, y_pred=xgbr_pred)

array([[ 350,  159,   34,   14,    0,    1],
       [  72,  881,  259,   46,    9,    2],
       [   3,   92,  682,  139,   16,    0],
       [   0,   14,  167,  518,  149,    7],
       [   0,   12,   36,  183, 1135,  134],
       [   0,    4,   15,   29,  290, 1297]], dtype=int64)

# 2 classes

In [15]:
# Binary target: scores 0-2 become 0.0 and scores 3-5 become 1.0.
# The labels are derived from the loaded "mrs_tx_1" column on every run rather
# than by overwriting the current array in place. The in-place version is not
# idempotent: executing the cell a second time maps every label to 0, because
# the value 1 belongs to the {0, 1, 2} group.
# NOTE(review): assumes every score lies in 0-5 (the only values the original
# mapping handled) -- confirm against the data.
tsr_1_output = (tsr_1["mrs_tx_1"].astype("float64").values >= 3).astype("float64")

## SVM

In [16]:
# Linear support-vector regressor fitted to the current `tsr_1_output`
# target, scored with 10-fold cross-validated R^2.
svr2 = LinearSVR(
    C=1,
    epsilon=0,
    loss="squared_epsilon_insensitive",
    dual=False,
    random_state=19,
)
svr_scores2 = cross_val_score(svr2, tsr_1_input, tsr_1_output, scoring="r2", cv=10)
print(svr_scores2)
print("Mean of R^2:", np.mean(svr_scores2))
print("Std of R^2:", np.std(svr_scores2))

[0.65463265 0.67281146 0.72629149 0.72803586 0.72637818 0.71985612
 0.69138906 0.72825933 0.45705818 0.55765237]
Mean of R^2: 0.6662364692825355
Std of R^2: 0.086093723164523


In [17]:
# Hold-out accuracy of the second SVR over ten 70/30 splits (split seeds 0-9).
# A prediction counts as correct when, after rounding to the nearest integer,
# it equals the true label.
acc_svr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        tsr_1_input, tsr_1_output, test_size=0.3, random_state=i
    )
    svr2.fit(X_train, y_train)
    # Round the continuous regression output to the nearest integer label.
    svr2_predict = np.round(svr2.predict(X_test))
    accuracy = (y_test == svr2_predict).sum() / len(svr2_predict)
    acc_svr2.append(accuracy)

# The labels name the model evaluated in this cell (they used to say "RF").
print("Accuracy of SVR:", acc_svr2)
# Divide by the number of collected splits instead of a hard-coded 10.
print("Mean of Accuracy of SVR:", sum(acc_svr2) / len(acc_svr2))

Accuracy of RF: [0.9091358024691358, 0.9165432098765433, 0.9120987654320988, 0.92, 0.9185185185185185, 0.9234567901234568, 0.9155555555555556, 0.9274074074074075, 0.9165432098765433, 0.9125925925925926]
Mean of Accuracy of RF: 0.9171851851851853


In [18]:
# 10-fold cross-validated predictions of the second SVR, rounded to the
# nearest integer and tabulated against the true labels.
svr_pred2 = np.round(cross_val_predict(svr2, tsr_1_input, tsr_1_output, cv=10))
confusion_matrix(y_true=tsr_1_output, y_pred=svr_pred2)

array([[2523,  236],
       [ 332, 3658]], dtype=int64)

## RF

In [19]:
# Random-forest regressor fitted to the current `tsr_1_output` target, scored
# with 10-fold cross-validated R^2.
# `criterion` is deliberately left at its default, which is mean squared error.
# The explicit spelling "mse" was deprecated in scikit-learn 1.0 and removed in
# 1.2 (renamed "squared_error"); omitting it selects the same criterion on
# both old and new releases.
rfr2 = RandomForestRegressor(
    n_estimators=15,
    bootstrap=True,
    max_features=0.8,
    random_state=19,
)
rfr_scores2 = cross_val_score(rfr2, tsr_1_input, tsr_1_output, cv=10, scoring="r2")
print(rfr_scores2)
print("Mean of R^2:", rfr_scores2.mean())
print("Std of R^2:", rfr_scores2.std())

[0.70414766 0.78598706 0.80979341 0.80851794 0.74737315 0.76380059
 0.76552779 0.78078116 0.65483732 0.72780284]
Mean of R^2: 0.7548568911288809
Std of R^2: 0.04593017505732098


In [20]:
# Hold-out accuracy of the second random forest over ten 70/30 splits
# (seeds 0-9); predictions are rounded to the nearest integer before comparison.
acc_rfr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        tsr_1_input, tsr_1_output, random_state=i, test_size=0.3
    )
    rfr2.fit(X_train, y_train)
    rfr2_predict = np.round(rfr2.predict(X_test))
    # Fraction of test rows whose rounded prediction matches the true label.
    accuracy = np.mean(y_test == rfr2_predict)
    acc_rfr2.append(accuracy)

print("Accuracy of RF:", acc_rfr2)
print("Mean of Accuracy of RF:", sum(acc_rfr2) / len(acc_rfr2))

Accuracy of RF: [0.9323456790123457, 0.9362962962962963, 0.928395061728395, 0.9392592592592592, 0.9377777777777778, 0.9358024691358025, 0.937283950617284, 0.9362962962962963, 0.9328395061728395, 0.934320987654321]
Mean of Accuracy of RF: 0.9350617283950617


In [21]:
# 10-fold cross-validated predictions of the second random forest, rounded to
# the nearest integer and tabulated against the true labels.
rf_pred2 = np.round(cross_val_predict(rfr2, tsr_1_input, tsr_1_output, cv=10))
confusion_matrix(y_true=tsr_1_output, y_pred=rf_pred2)

array([[2495,  264],
       [ 171, 3819]], dtype=int64)

## XGBoost

In [22]:
# Gradient-boosted regressor (squared-error objective) fitted to the current
# `tsr_1_output` target, scored with 10-fold cross-validated R^2.
xgbr2 = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=15,
    random_state=19,
)
xgbr_scores2 = cross_val_score(xgbr2, tsr_1_input, tsr_1_output, scoring="r2", cv=10)
print(xgbr_scores2)
print("Mean of R^2:", np.mean(xgbr_scores2))
print("Std of R^2:", np.std(xgbr_scores2))

[0.72319331 0.79790321 0.80933179 0.80348947 0.7580319  0.78636109
 0.76204846 0.81063156 0.68175723 0.76111351]
Mean of R^2: 0.769386151505116
Std of R^2: 0.03952853476472426


In [23]:
# Hold-out accuracy of the second XGBoost regressor over ten 70/30 splits
# (split seeds 0-9). A prediction counts as correct when, after rounding to
# the nearest integer, it equals the true label.
acc_xgbr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(
        tsr_1_input, tsr_1_output, test_size=0.3, random_state=i
    )
    xgbr2.fit(X_train, y_train)
    # Round the continuous regression output to the nearest integer label.
    xgbr2_predict = np.round(xgbr2.predict(X_test))
    accuracy = (y_test == xgbr2_predict).sum() / len(xgbr2_predict)
    acc_xgbr2.append(accuracy)

# The labels name the model evaluated in this cell (they used to say "RF").
print("Accuracy of XGBoost:", acc_xgbr2)
# Divide by the number of collected splits instead of a hard-coded 10.
print("Mean of Accuracy of XGBoost:", sum(acc_xgbr2) / len(acc_xgbr2))

Accuracy of RF: [0.9387654320987654, 0.9338271604938272, 0.9293827160493827, 0.9412345679012346, 0.9402469135802469, 0.9353086419753086, 0.9323456790123457, 0.9382716049382716, 0.928395061728395, 0.9358024691358025]
Mean of Accuracy of RF: 0.935358024691358


In [24]:
# 10-fold cross-validated predictions of the second XGBoost regressor, rounded
# to the nearest integer and tabulated against the true labels.
xgbr_pred2 = np.round(cross_val_predict(xgbr2, tsr_1_input, tsr_1_output, cv=10))
confusion_matrix(y_true=tsr_1_output, y_pred=xgbr_pred2)

array([[2501,  258],
       [ 161, 3829]], dtype=int64)

# Summary

## Mean & Std

In [25]:
def _summary_column(scores):
    """Return one summary vector for an array of cross-validation scores.

    The result holds the mean, then the standard deviation, then every
    per-fold score in order. Unpacking the array replaces the hand-written
    indices 0..9, so the vector follows however many folds `scores` holds.
    """
    return np.array([scores.mean(), scores.std(), *scores])


# One summary vector per model, built from the R^2 score arrays of the
# earlier cross-validation cells (plain names first, then the "2" variants).
svr_mean = _summary_column(svr_scores)
rfr_mean = _summary_column(rfr_scores)
xgbr_mean = _summary_column(xgbr_scores)
svr_mean2 = _summary_column(svr_scores2)
rfr_mean2 = _summary_column(rfr_scores2)
xgbr_mean2 = _summary_column(xgbr_scores2)

In [26]:
# Summary table: one column per model, one row per statistic
# (mean, std, then the ten per-fold R^2 values).
tsr_1_mean = pd.DataFrame(
    {
        "svr": svr_mean,
        "rfr": rfr_mean,
        "xgbr": xgbr_mean,
        "svr2": svr_mean2,
        "rfr2": rfr_mean2,
        "xgbr2": xgbr_mean2,
    },
    index=["Mean", "Std"] + ["R^2_%d" % fold for fold in range(1, 11)],
)

In [27]:
# Write the summary table, row labels included, to the current working directory.
csv_save = os.path.join(".", "tsr_1_mean_regression.csv")
tsr_1_mean.to_csv(path_or_buf=csv_save, index=True)