In [1]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

In [2]:
csv_path = os.path.join("..", "..", "data","LINKED_DATA", "TSR_EHR", "TSR_6_CLEANED.csv")
tsr_6 = pd.read_csv(csv_path)
tsr_6.head()

Unnamed: 0,height_nm,weight_nm,edu_id,pro_id,opc_id,ih_fl,ivtpamg_nm,hospitalised_time,nivtpa_id,nivtpa1_fl,...,nihs_8_out,nihs_9_out,nihs_10_out,nihs_11_out,total_out,SexName,Age,mrs_tx_1,mrs_tx_3,mrs_tx_6
0,153.0,62.0,3,1,3,0,0.0,8.0,0,999,...,1,0,1,0,4,0,67.0,1,1,1
1,152.0,62.0,3,1,2,0,0.0,4.0,0,999,...,1,0,0,0,1,0,69.0,1,0,0
2,148.0,56.0,2,1,2,0,0.0,5.0,0,999,...,1,0,0,0,2,0,71.0,0,0,0
3,152.0,56.0,4,1,2,0,0.0,3.0,1,0,...,0,0,0,0,0,0,71.0,0,0,0
4,160.0,60.0,2,1,3,0,0.0,4.0,0,999,...,0,0,0,0,4,0,62.0,3,3,3


In [3]:
tsr_6_input = tsr_6.drop(["mrs_tx_6"], axis=1)
tsr_6_input[tsr_6_input == "N"] = 0
tsr_6_input[tsr_6_input == "Y"] = 1
tsr_6_input = tsr_6_input.astype("float64")
tsr_6_input = np.array(tsr_6_input.values)

tsr_6_input_nomrs = tsr_6.drop(["mrs_tx_6", "mrs_tx_3", "mrs_tx_1"], axis=1)
tsr_6_input_nomrs[tsr_6_input_nomrs == "N"] = 0
tsr_6_input_nomrs[tsr_6_input_nomrs == "Y"] = 1
tsr_6_input_nomrs = tsr_6_input_nomrs.astype("float64")
tsr_6_input_nomrs = np.array(tsr_6_input_nomrs.values)

# 6 classes

In [4]:
tsr_6_output = tsr_6.mrs_tx_1
tsr_6_output = tsr_6_output.astype("float64")
tsr_6_output = np.array(tsr_6_output.values)

## SVM

In [5]:
svr = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores = cross_val_score(svr,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(svr_scores)
print("Mean of R^2:", svr_scores.mean())
print("Std of R^2:", svr_scores.std())

[0.97298991 0.97765583 0.9822613  0.9043261  0.99057859 0.97934303
 0.97305357 0.98671394 0.96758493 0.92350859]
Mean of R^2: 0.9658015800762796
Std of R^2: 0.02705812134930812


In [6]:
acc_svr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    svr.fit(X_train,y_train)
    svr_predict = svr.predict(X_test)
    print(svr_predict)
    svr_predict = np.round(svr_predict)
    print(svr_predict)
    accuracy = (y_test == svr_predict).sum() / len(svr_predict)
    acc_svr.append(accuracy)

print("Accuracy of RF:", acc_svr)
print("Mean of Accuracy of RF:", sum(acc_svr)/10)

[ 4.58220739  3.73811773  3.94748385 ... -0.05735297  3.67172496
  1.04913939]
[ 5.  4.  4. ... -0.  4.  1.]
[4.58821536 2.50220275 3.79844501 ... 2.67594207 3.09239123 3.87866723]
[5. 3. 4. ... 3. 3. 4.]
[4.3158863  4.97551555 0.75071333 ... 1.03935331 3.95548251 0.0191575 ]
[4. 5. 1. ... 1. 4. 0.]
[4.0613744  1.07699694 0.26825867 ... 2.00094894 1.0852585  0.89742411]
[4. 1. 0. ... 2. 1. 1.]
[3.00394115 3.80012033 1.18189494 ... 0.01407841 3.01472744 3.11119519]
[3. 4. 1. ... 0. 3. 3.]
[4.03634993 0.89762987 3.83235552 ... 3.33858494 3.89067855 0.16829999]
[4. 1. 4. ... 3. 4. 0.]
[ 3.02256103  4.78775915  3.27112474 ... -0.0063484   5.12908047
  1.00495649]
[ 3.  5.  3. ... -0.  5.  1.]
[1.04139318 4.81234015 4.01386339 ... 5.07539115 0.00904217 0.22240637]
[1. 5. 4. ... 5. 0. 0.]
[1.11100296 3.08622226 4.74131152 ... 2.99334468 4.02754797 3.50037999]
[1. 3. 5. ... 3. 4. 4.]
[4.1687502  2.15434266 4.40581069 ... 4.6742488  0.45265339 3.32409872]
[4. 2. 4. ... 5. 0. 3.]
Accuracy of RF

In [7]:
svr_pred = cross_val_predict(svr,tsr_6_input,tsr_6_output,cv = 10)
svr_pred = np.round(svr_pred)
confusion_matrix(tsr_6_output, svr_pred)

array([[   0,    0,    0,    0,    0,    0,    0,    0],
       [   1,  402,   55,    8,    0,    0,    0,    0],
       [   0,    1, 1010,   55,    1,    0,    0,    0],
       [   0,    1,   21,  642,   19,    0,    0,    0],
       [   0,    0,    0,   32,  556,    8,    0,    0],
       [   0,    0,    0,    0,   57,  750,    9,    0],
       [   0,    0,    0,    0,    5,   47,  469,    2],
       [   0,    0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [8]:
svr_pred[svr_pred == -1] = 0
confusion_matrix(tsr_6_output, svr_pred)

array([[ 403,   55,    8,    0,    0,    0,    0],
       [   1, 1010,   55,    1,    0,    0,    0],
       [   1,   21,  642,   19,    0,    0,    0],
       [   0,    0,   32,  556,    8,    0,    0],
       [   0,    0,    0,   57,  750,    9,    0],
       [   0,    0,    0,    5,   47,  469,    2],
       [   0,    0,    0,    0,    0,    0,    0]], dtype=int64)

In [9]:
svr1 = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores1 = cross_val_score(svr1,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(svr_scores1)
print("Mean of R^2:", svr_scores1.mean())
print("Std of R^2:", svr_scores1.std())

[0.67045177 0.78329337 0.81744363 0.79407382 0.81889027 0.79818859
 0.83967165 0.74176129 0.75511674 0.64201807]
Mean of R^2: 0.7660909189303222
Std of R^2: 0.06180736630616617


In [10]:
acc_svr1 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    svr1.fit(X_train,y_train)
    svr1_predict = svr1.predict(X_test)
    print(svr1_predict)
    svr1_predict = np.round(svr1_predict)
    print(svr1_predict)
    accuracy = (y_test == svr1_predict).sum() / len(svr1_predict)
    acc_svr1.append(accuracy)

print("Accuracy of RF:", acc_svr1)
print("Mean of Accuracy of RF:", sum(acc_svr1)/10)

[4.45932022 2.76426442 3.09596483 ... 0.74628467 2.81999047 0.79129908]
[4. 3. 3. ... 1. 3. 1.]
[4.29347801 1.20399972 3.09447794 ... 2.79997903 2.85946775 3.56198483]
[4. 1. 3. ... 3. 3. 4.]
[3.3844966  4.57711604 0.97995747 ... 1.09125877 3.0692339  0.62530585]
[3. 5. 1. ... 1. 3. 1.]
[3.34352621 0.89151353 1.35281944 ... 1.48908961 1.28731971 1.13965001]
[3. 1. 1. ... 1. 1. 1.]
[3.1505186  2.77783396 2.08617902 ... 0.72240521 3.08936883 3.74337168]
[3. 3. 2. ... 1. 3. 4.]
[3.98965751 0.80443951 3.52794331 ... 3.47670524 3.35431741 0.41131998]
[4. 1. 4. ... 3. 3. 0.]
[2.59532269 4.51222102 2.82965199 ... 0.62721193 5.4104717  1.18552866]
[3. 5. 3. ... 1. 5. 1.]
[0.9694227  4.32900087 3.23871927 ... 5.2193328  0.39016271 0.96111579]
[1. 4. 3. ... 5. 0. 1.]
[1.10267847 2.25786597 3.83311328 ... 2.56532274 3.73536248 2.94308232]
[1. 2. 4. ... 3. 4. 3.]
[4.16935229 2.13884364 4.75830994 ... 4.42590561 0.90226517 2.91602378]
[4. 2. 5. ... 4. 1. 3.]
Accuracy of RF: [0.5184590690208668, 0.5

In [11]:
svr_pred1 = cross_val_predict(svr1,tsr_6_input_nomrs,tsr_6_output,cv = 10)
svr_pred1 = np.round(svr_pred1)
confusion_matrix(tsr_6_output, svr_pred1)

array([[  0,   0,   0,   0,   0,   0,   0,   0],
       [  1,  33, 362,  58,  11,   0,   1,   0],
       [  1,   8, 813, 207,  30,   7,   1,   0],
       [  1,   3, 281, 296,  92,  10,   0,   0],
       [  0,   0,  23, 183, 330,  55,   5,   0],
       [  0,   0,   8,  32, 227, 447, 102,   0],
       [  0,   1,   5,   6,  24, 140, 339,   8],
       [  0,   0,   0,   0,   0,   0,   0,   0]], dtype=int64)

In [12]:
svr_pred1[svr_pred1 == 6] = 5
confusion_matrix(tsr_6_output, svr_pred1)

array([[  0,   0,   0,   0,   0,   0,   0],
       [  1,  33, 362,  58,  11,   0,   1],
       [  1,   8, 813, 207,  30,   7,   1],
       [  1,   3, 281, 296,  92,  10,   0],
       [  0,   0,  23, 183, 330,  55,   5],
       [  0,   0,   8,  32, 227, 447, 102],
       [  0,   1,   5,   6,  24, 140, 347]], dtype=int64)

## RF

In [13]:
rfr = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores = cross_val_score(rfr,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(rfr_scores)
print("Mean of R^2:", rfr_scores.mean())
print("Std of R^2:", rfr_scores.std())

[0.99996182 0.99996745 0.99998849 0.99991848 0.9999752  0.99996659
 0.99991892 0.99985967 0.99998559 0.99999551]
Mean of R^2: 0.9999537729717906
Std of R^2: 4.0187223876986936e-05


In [14]:
acc_rfr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    rfr.fit(X_train,y_train)
    rfr_predict = rfr.predict(X_test)
    print(rfr_predict)
    rfr_predict = np.round(rfr_predict)
    print(rfr_predict)
    accuracy = (y_test == rfr_predict).sum() / len(rfr_predict)
    acc_rfr.append(accuracy)

print("Accuracy of RF:", acc_rfr)
print("Mean of Accuracy of RF:", sum(acc_rfr)/10)

[5. 4. 4. ... 0. 4. 1.]
[5. 4. 4. ... 0. 4. 1.]
[5.         2.93333333 4.         ... 3.         3.         4.        ]
[5. 3. 4. ... 3. 3. 4.]
[4. 5. 1. ... 1. 4. 0.]
[4. 5. 1. ... 1. 4. 0.]
[4. 1. 0. ... 2. 1. 1.]
[4. 1. 0. ... 2. 1. 1.]
[3. 4. 1. ... 0. 3. 4.]
[3. 4. 1. ... 0. 3. 4.]
[4. 1. 4. ... 3. 4. 0.]
[4. 1. 4. ... 3. 4. 0.]
[3. 5. 4. ... 0. 5. 1.]
[3. 5. 4. ... 0. 5. 1.]
[1. 5. 4. ... 5. 0. 0.]
[1. 5. 4. ... 5. 0. 0.]
[1.         3.         4.93333333 ... 3.         4.         4.        ]
[1. 3. 5. ... 3. 4. 4.]
[4. 2. 4. ... 5. 0. 4.]
[4. 2. 4. ... 5. 0. 4.]
Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean of Accuracy of RF: 1.0


In [15]:
rfr_pred = cross_val_predict(rfr,tsr_6_input,tsr_6_output,cv = 10)
rfr_pred = np.round(rfr_pred)
confusion_matrix(tsr_6_output, rfr_pred)

array([[ 466,    0,    0,    0,    0,    0],
       [   0, 1067,    0,    0,    0,    0],
       [   0,    0,  683,    0,    0,    0],
       [   0,    0,    0,  596,    0,    0],
       [   0,    0,    0,    0,  816,    0],
       [   0,    0,    0,    0,    0,  523]], dtype=int64)

In [16]:
rfr1 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores1 = cross_val_score(rfr1,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(rfr_scores1)
print("Mean of R^2:", rfr_scores1.mean())
print("Std of R^2:", rfr_scores1.std())

[0.78461641 0.83812407 0.85364225 0.80021598 0.83401323 0.82829245
 0.88028419 0.75923783 0.8452423  0.79136409]
Mean of R^2: 0.8215032792722285
Std of R^2: 0.03482763312237961


In [17]:
acc_rfr1 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    rfr1.fit(X_train,y_train)
    rfr1_predict = rfr1.predict(X_test)
    print(rfr1_predict)
    rfr1_predict = np.round(rfr1_predict)
    print(rfr1_predict)
    accuracy = (y_test == rfr1_predict).sum() / len(rfr1_predict)
    acc_rfr1.append(accuracy)

print("Accuracy of RF:", acc_rfr1)
print("Mean of Accuracy of RF:", sum(acc_rfr1)/10)

[4.93333333 3.33333333 2.66666667 ... 0.         3.2        0.8       ]
[5. 3. 3. ... 0. 3. 1.]
[4.46666667 0.86666667 4.13333333 ... 2.33333333 3.66666667 3.93333333]
[4. 1. 4. ... 2. 4. 4.]
[3.86666667 4.13333333 1.         ... 1.46666667 3.46666667 0.06666667]
[4. 4. 1. ... 1. 3. 0.]
[2.86666667 0.93333333 1.53333333 ... 1.66666667 1.26666667 0.93333333]
[3. 1. 2. ... 2. 1. 1.]
[3.2        3.8        2.26666667 ... 0.         3.         3.93333333]
[3. 4. 2. ... 0. 3. 4.]
[4.2        1.46666667 4.2        ... 3.86666667 4.         0.06666667]
[4. 1. 4. ... 4. 4. 0.]
[3.         3.86666667 2.8        ... 0.         4.66666667 1.06666667]
[3. 4. 3. ... 0. 5. 1.]
[0.8        4.86666667 4.13333333 ... 4.93333333 0.         1.06666667]
[1. 5. 4. ... 5. 0. 1.]
[1.13333333 3.         4.06666667 ... 2.93333333 3.73333333 2.93333333]
[1. 3. 4. ... 3. 4. 3.]
[4.33333333 1.73333333 4.8        ... 4.06666667 1.         3.6       ]
[4. 2. 5. ... 4. 1. 4.]
Accuracy of RF: [0.666131621187801, 0.66

In [18]:
rfr_pred1 = cross_val_predict(rfr1,tsr_6_input_nomrs,tsr_6_output,cv = 10)
rfr_pred1 = np.round(rfr_pred1)
confusion_matrix(tsr_6_output, rfr_pred1)

array([[282, 140,  26,  17,   0,   1],
       [ 54, 757, 205,  44,   5,   2],
       [  2, 132, 435, 102,  12,   0],
       [  0,  14, 130, 378,  72,   2],
       [  0,   7,  16, 149, 599,  45],
       [  0,   3,   2,  12, 146, 360]], dtype=int64)

## XGBoost

In [19]:
xgbr = XGBRegressor(n_estimators  = 15, objective="reg:squarederror", random_state = 19) 
xgbr_scores = cross_val_score(xgbr,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(xgbr_scores)
print("Mean of R^2:", xgbr_scores.mean())
print("Std of R^2:", xgbr_scores.std())

[0.99993897 0.99995103 0.99994665 0.99995117 0.99994241 0.99994807
 0.99994522 0.99994435 0.99993649 0.99991847]
Mean of R^2: 0.999942283896733
Std of R^2: 9.123852646233461e-06


In [20]:
acc_xgbr = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    xgbr.fit(X_train,y_train)
    xgbr_predict = xgbr.predict(X_test)
    print(xgbr_predict)
    xgbr_predict = np.round(xgbr_predict)
    print(xgbr_predict)
    accuracy = (y_test == xgbr_predict).sum() / len(xgbr_predict)
    acc_xgbr.append(accuracy)

print("Accuracy of RF:", acc_xgbr)
print("Mean of Accuracy of RF:", sum(acc_xgbr)/10)

[4.9782729e+00 3.9831913e+00 3.9831913e+00 ... 2.4217656e-03 3.9831913e+00
 9.9760616e-01]
[5. 4. 4. ... 0. 4. 1.]
[4.9782605 2.9879477 3.9831862 ... 2.9879477 2.9879477 3.9831862]
[5. 3. 4. ... 3. 3. 4.]
[3.9831941e+00 4.9782672e+00 9.9760556e-01 ... 9.9760556e-01 3.9831941e+00
 2.4207328e-03]
[4. 5. 1. ... 1. 4. 0.]
[3.9831975e+00 9.9760628e-01 2.4219174e-03 ... 1.9927814e+00 9.9760628e-01
 9.9760628e-01]
[4. 1. 0. ... 2. 1. 1.]
[2.9879458e+00 3.9831891e+00 9.9760610e-01 ... 2.4217656e-03 2.9879458e+00
 3.9831891e+00]
[3. 4. 1. ... 0. 3. 4.]
[3.9831920e+00 9.9760544e-01 3.9831920e+00 ... 2.9879527e+00 3.9831920e+00
 2.4223770e-03]
[4. 1. 4. ... 3. 4. 0.]
[2.9879479e+00 4.9782534e+00 3.9831984e+00 ... 2.4210233e-03 4.9782534e+00
 9.9760568e-01]
[3. 5. 4. ... 0. 5. 1.]
[9.9760598e-01 4.9782658e+00 3.9831913e+00 ... 4.9782658e+00 2.4216154e-03
 2.4216154e-03]
[1. 5. 4. ... 5. 0. 0.]
[0.99760574 2.9879508  4.9782515  ... 2.9879508  3.9832036  3.9832036 ]
[1. 3. 5. ... 3. 4. 4.]
[3.983192

In [21]:
xgbr_pred = cross_val_predict(xgbr,tsr_6_input,tsr_6_output,cv = 10)
xgbr_pred = np.round(xgbr_pred)
confusion_matrix(tsr_6_output, xgbr_pred)

array([[ 466,    0,    0,    0,    0,    0],
       [   0, 1067,    0,    0,    0,    0],
       [   0,    0,  683,    0,    0,    0],
       [   0,    0,    0,  596,    0,    0],
       [   0,    0,    0,    0,  816,    0],
       [   0,    0,    0,    0,    0,  523]], dtype=int64)

In [22]:
xgbr1 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
xgbr_scores1 = cross_val_score(xgbr1,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(xgbr_scores1)
print("Mean of R^2:", xgbr_scores1.mean())
print("Std of R^2:", xgbr_scores1.std())

[0.78461641 0.83812407 0.85364225 0.80021598 0.83401323 0.82829245
 0.88028419 0.75923783 0.8452423  0.79136409]
Mean of R^2: 0.8215032792722285
Std of R^2: 0.03482763312237961


In [23]:
acc_xgbr1 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    xgbr1.fit(X_train,y_train)
    xgbr1_predict = xgbr1.predict(X_test)
    print(xgbr1_predict)
    xgbr1_predict = np.round(xgbr1_predict)
    print(xgbr1_predict)
    accuracy = (y_test == xgbr1_predict).sum() / len(xgbr1_predict)
    acc_xgbr1.append(accuracy)

print("Accuracy of RF:", acc_xgbr1)
print("Mean of Accuracy of RF:", sum(acc_xgbr1)/10)

[4.93333333 3.33333333 2.66666667 ... 0.         3.2        0.8       ]
[5. 3. 3. ... 0. 3. 1.]
[4.46666667 0.86666667 4.13333333 ... 2.33333333 3.66666667 3.93333333]
[4. 1. 4. ... 2. 4. 4.]
[3.86666667 4.13333333 1.         ... 1.46666667 3.46666667 0.06666667]
[4. 4. 1. ... 1. 3. 0.]
[2.86666667 0.93333333 1.53333333 ... 1.66666667 1.26666667 0.93333333]
[3. 1. 2. ... 2. 1. 1.]
[3.2        3.8        2.26666667 ... 0.         3.         3.93333333]
[3. 4. 2. ... 0. 3. 4.]
[4.2        1.46666667 4.2        ... 3.86666667 4.         0.06666667]
[4. 1. 4. ... 4. 4. 0.]
[3.         3.86666667 2.8        ... 0.         4.66666667 1.06666667]
[3. 4. 3. ... 0. 5. 1.]
[0.8        4.86666667 4.13333333 ... 4.93333333 0.         1.06666667]
[1. 5. 4. ... 5. 0. 1.]
[1.13333333 3.         4.06666667 ... 2.93333333 3.73333333 2.93333333]
[1. 3. 4. ... 3. 4. 3.]
[4.33333333 1.73333333 4.8        ... 4.06666667 1.         3.6       ]
[4. 2. 5. ... 4. 1. 4.]
Accuracy of RF: [0.666131621187801, 0.66

In [24]:
xgbr_pred1 = cross_val_predict(xgbr1,tsr_6_input_nomrs,tsr_6_output,cv = 10)
xgbr_pred1 = np.round(xgbr_pred1)
confusion_matrix(tsr_6_output, xgbr_pred1)

array([[282, 140,  26,  17,   0,   1],
       [ 54, 757, 205,  44,   5,   2],
       [  2, 132, 435, 102,  12,   0],
       [  0,  14, 130, 378,  72,   2],
       [  0,   7,  16, 149, 599,  45],
       [  0,   3,   2,  12, 146, 360]], dtype=int64)

# 2 classes

In [25]:
tsr_6_output[(tsr_6_output == 0)|(tsr_6_output == 1)|(tsr_6_output == 2)] = 0
tsr_6_output[(tsr_6_output == 3)|(tsr_6_output == 4)|(tsr_6_output == 5)] = 1

## SVM

In [26]:
svr2 = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores2 = cross_val_score(svr2,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(svr_scores2)
print("Mean of R^2:", svr_scores2.mean())
print("Std of R^2:", svr_scores2.std())

[0.727862   0.81356089 0.82028994 0.8450266  0.82888114 0.82684768
 0.78140753 0.80205846 0.80881142 0.73299616]
Mean of R^2: 0.7987741823736992
Std of R^2: 0.03779395446884126


In [27]:
acc_svr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    svr2.fit(X_train,y_train)
    svr2_predict = svr2.predict(X_test)
    print(svr2_predict)
    svr2_predict = np.round(svr2_predict)
    print(svr2_predict)
    accuracy = (y_test == svr2_predict).sum() / len(svr2_predict)
    acc_svr2.append(accuracy)

print("Accuracy of RF:", acc_svr2)
print("Mean of Accuracy of RF:", sum(acc_svr2)/10)

[ 0.75781747  0.81620929  0.9585553  ... -0.14269086  0.86187971
 -0.01030904]
[ 1.  1.  1. ... -0.  1. -0.]
[1.23249856 0.34602821 0.8637123  ... 0.68432869 0.70787881 0.84225802]
[1. 0. 1. ... 1. 1. 1.]
[ 1.10548017  1.2155176  -0.08518498 ...  0.0337493   0.8176167
 -0.17534512]
[ 1.  1. -0. ...  0.  1. -0.]
[ 0.95709807  0.02324432 -0.20425835 ...  0.13270699 -0.01337053
 -0.00117275]
[ 1.  0. -0. ...  0. -0. -0.]
[ 0.7035395   0.78637946  0.21106204 ... -0.1184989   0.77296097
  0.85277214]
[ 1.  1.  0. ... -0.  1.  1.]
[ 0.97068853 -0.0828431   0.76397916 ...  0.75267554  0.73937193
 -0.17948684]
[ 1. -0.  1. ...  1.  1. -0.]
[ 0.76490701  1.23007638  0.62510142 ... -0.131473    1.12573497
 -0.03106504]
[ 1.  1.  1. ... -0.  1. -0.]
[ 0.00494504  1.01569223  0.78480079 ...  1.01882287 -0.08616495
 -0.0951651 ]
[ 0.  1.  1. ...  1. -0. -0.]
[0.07127283 0.60552209 1.2002353  ... 0.67190728 0.89788554 0.78208576]
[0. 1. 1. ... 1. 1. 1.]
[ 0.95146268  0.52345125  0.91938696 ...  1.24

In [28]:
svr_pred2 = cross_val_predict(svr2,tsr_6_input,tsr_6_output,cv = 10)
svr_pred2 = np.round(svr_pred2)
confusion_matrix(tsr_6_output, svr_pred2)

array([[   0,    0,    0],
       [   2, 2147,   67],
       [   0,   71, 1864]], dtype=int64)

In [29]:
svr_pred2[svr_pred2 == -1] = 0
svr_pred2[svr_pred2 == 2] = 1
confusion_matrix(tsr_6_output, svr_pred2)

array([[2149,   67],
       [  71, 1864]], dtype=int64)

In [30]:
svr3 = LinearSVR(epsilon = 0, dual=False, loss = "squared_epsilon_insensitive", C = 1, random_state = 19)
svr_scores3 = cross_val_score(svr3,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(svr_scores3)
print("Mean of R^2:", svr_scores3.mean())
print("Std of R^2:", svr_scores3.std())

[0.56391089 0.66895285 0.72511638 0.73974173 0.7378679  0.71606311
 0.71009548 0.66075518 0.735074   0.53413458]
Mean of R^2: 0.6791712103358022
Std of R^2: 0.07030762706798502


In [31]:
acc_svr3 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    svr3.fit(X_train,y_train)
    svr3_predict = svr3.predict(X_test)
    print(svr3_predict)
    svr3_predict = np.round(svr3_predict)
    print(svr3_predict)
    accuracy = (y_test == svr3_predict).sum() / len(svr3_predict)
    acc_svr3.append(accuracy)

print("Accuracy of RF:", acc_svr3)
print("Mean of Accuracy of RF:", sum(acc_svr3)/10)

[ 0.85806713  0.6474175   0.78337678 ... -0.01938124  0.70762786
 -0.0930031 ]
[ 1.  1.  1. ... -0.  1. -0.]
[1.10314371 0.04672379 0.67955114 ... 0.65159814 0.70241887 0.76542962]
[1. 0. 1. ... 1. 1. 1.]
[ 8.93002713e-01  1.14056331e+00 -2.96491330e-02 ... -9.11488045e-04
  5.92487590e-01 -4.45403240e-02]
[ 1.  1. -0. ... -0.  1. -0.]
[ 0.77686294 -0.02845329  0.01532818 ... -0.00499205 -0.02968107
  0.03084538]
[ 1. -0.  0. ... -0. -0.  0.]
[ 0.72116574  0.67637886  0.37895233 ... -0.01416306  0.7718958
  0.99385778]
[ 1.  1.  0. ... -0.  1.  1.]
[ 0.89975054 -0.09084631  0.67827471 ...  0.87074913  0.59467186
 -0.03225209]
[ 1. -0.  1. ...  1.  1. -0.]
[ 0.64167997  1.18813081  0.51550701 ... -0.00613614  1.2587248
  0.03468118]
[ 1.  1.  1. ... -0.  1.  0.]
[-0.02499695  0.95096492  0.66811186 ...  1.02188329 -0.02819109
  0.01776138]
[-0.  1.  1. ...  1. -0.  0.]
[0.09950253 0.40090181 1.03848018 ... 0.62095958 0.86999202 0.65835281]
[0. 0. 1. ... 1. 1. 1.]
[ 0.96650478  0.5139708

In [32]:
svr3.fit(tsr_6_input_nomrs,tsr_6_output)
svr_predict3 =svr3.predict(tsr_6_input_nomrs)
print(svr_predict3)
svr_predict3 = np.round(svr_predict3)
print(svr_predict3)
print("Accuracy of RF:", (tsr_6_output == svr_predict3).sum() / len(svr_predict3))

[ 0.01243191 -0.02560191  0.00387048 ...  1.23686719  0.21027683
  0.10932557]
[ 0. -0.  0. ...  1.  0.  0.]
Accuracy of RF: 0.9159238737653578


In [33]:
svr_pred3 = cross_val_predict(svr3,tsr_6_input_nomrs,tsr_6_output,cv = 10)
svr_pred3 = np.round(svr_pred3)
confusion_matrix(tsr_6_output, svr_pred3)

array([[2034,  182],
       [ 223, 1712]], dtype=int64)

## RF

In [34]:
rfr2 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores2 = cross_val_score(rfr2,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(rfr_scores2)
print("Mean of R^2:", rfr_scores2.mean())
print("Std of R^2:", rfr_scores2.std())

[1.         1.         1.         0.99986738 1.         1.
 1.         1.         1.         1.        ]
Mean of R^2: 0.9999867382481705
Std of R^2: 3.9785255488455905e-05


In [35]:
acc_rfr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    rfr2.fit(X_train,y_train)
    rfr2_predict = rfr2.predict(X_test)
    print(rfr2_predict)
    rfr2_predict = np.round(rfr2_predict)
    print(rfr2_predict)
    accuracy = (y_test == rfr2_predict).sum() / len(rfr2_predict)
    acc_rfr2.append(accuracy)

print("Accuracy of RF:", acc_rfr2)
print("Mean of Accuracy of RF:", sum(acc_rfr2)/10)

[1. 1. 1. ... 0. 1. 0.]
[1. 1. 1. ... 0. 1. 0.]
[1. 1. 1. ... 1. 1. 1.]
[1. 1. 1. ... 1. 1. 1.]
[1. 1. 0. ... 0. 1. 0.]
[1. 1. 0. ... 0. 1. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 0. 0. ... 0. 0. 0.]
[1. 1. 0. ... 0. 1. 1.]
[1. 1. 0. ... 0. 1. 1.]
[1. 0. 1. ... 1. 1. 0.]
[1. 0. 1. ... 1. 1. 0.]
[1. 1. 1. ... 0. 1. 0.]
[1. 1. 1. ... 0. 1. 0.]
[0. 1. 1. ... 1. 0. 0.]
[0. 1. 1. ... 1. 0. 0.]
[0. 1. 1. ... 1. 1. 1.]
[0. 1. 1. ... 1. 1. 1.]
[1. 0. 1. ... 1. 0. 1.]
[1. 0. 1. ... 1. 0. 1.]
Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
Mean of Accuracy of RF: 1.0


In [36]:
rf_pred2 = cross_val_predict(rfr2,tsr_6_input,tsr_6_output,cv = 10)
rf_pred2 = np.round(rf_pred2)
confusion_matrix(tsr_6_output, rf_pred2)

array([[2216,    0],
       [   0, 1935]], dtype=int64)

In [37]:
rfr3 = RandomForestRegressor(criterion = "mse", n_estimators = 15, bootstrap=True, random_state = 19,max_features = 0.8)
rfr_scores3 = cross_val_score(rfr3,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(rfr_scores3)
print("Mean of R^2:", rfr_scores3.mean())
print("Std of R^2:", rfr_scores3.std())

[0.61955993 0.74186447 0.83176085 0.76283567 0.78815204 0.73151756
 0.75826261 0.72771581 0.82543574 0.62756759]
Mean of R^2: 0.7414672274832117
Std of R^2: 0.06800337265622568


In [38]:
acc_rfr3 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    rfr3.fit(X_train,y_train)
    rfr3_predict = rfr3.predict(X_test)
    print(rfr3_predict)
    rfr3_predict = np.round(rfr3_predict)
    print(rfr3_predict)
    accuracy = (y_test == rfr3_predict).sum() / len(rfr3_predict)
    acc_rfr3.append(accuracy)

print("Accuracy of RF:", acc_rfr3)
print("Mean of Accuracy of RF:", sum(acc_rfr3)/10)

[1.         0.66666667 0.86666667 ... 0.         0.86666667 0.        ]
[1. 1. 1. ... 0. 1. 0.]
[1.         0.         1.         ... 0.66666667 1.         1.        ]
[1. 0. 1. ... 1. 1. 1.]
[1.         1.         0.         ... 0.2        0.73333333 0.        ]
[1. 1. 0. ... 0. 1. 0.]
[0.53333333 0.         0.13333333 ... 0.         0.         0.        ]
[1. 0. 0. ... 0. 0. 0.]
[0.66666667 0.86666667 0.46666667 ... 0.         0.86666667 0.86666667]
[1. 1. 0. ... 0. 1. 1.]
[1.         0.06666667 1.         ... 1.         0.73333333 0.        ]
[1. 0. 1. ... 1. 1. 0.]
[0.86666667 1.         0.73333333 ... 0.         1.         0.        ]
[1. 1. 1. ... 0. 1. 0.]
[0.         1.         0.93333333 ... 1.         0.         0.        ]
[0. 1. 1. ... 1. 0. 0.]
[0.         0.8        1.         ... 0.8        0.93333333 0.73333333]
[0. 1. 1. ... 1. 1. 1.]
[1.  0.  1.  ... 1.  0.  0.8]
[1. 0. 1. ... 1. 0. 1.]
Accuracy of RF: [0.9325842696629213, 0.9085072231139647, 0.9237560192616372, 0.909

In [39]:
rfr_pred3 = cross_val_predict(rfr3,tsr_6_input_nomrs,tsr_6_output,cv = 10)
rfr_pred3 = np.round(rfr_pred3)
confusion_matrix(tsr_6_output, rfr_pred3)

array([[2009,  207],
       [ 121, 1814]], dtype=int64)

## XGBoost

In [40]:
xgbr2 = XGBRegressor(n_estimators  = 15, objective="reg:squarederror", random_state = 19) 
xgbr_scores2 = cross_val_score(xgbr2,tsr_6_input,tsr_6_output,cv = 10, scoring='r2')
print(xgbr_scores2)
print("Mean of R^2:", xgbr_scores2.mean())
print("Std of R^2:", xgbr_scores2.std())

[0.99997719 0.99997704 0.99997708 0.99997658 0.99997729 0.99997701
 0.99997684 0.9999771  0.99997714 0.99997599]
Mean of R^2: 0.999976926155173
Std of R^2: 3.6484455271521257e-07


In [41]:
acc_xgbr2 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input, tsr_6_output, test_size=0.3, random_state=i)
    xgbr2.fit(X_train,y_train)
    xgbr2_predict = xgbr2.predict(X_test)
    print(xgbr2_predict)
    xgbr2_predict = np.round(xgbr2_predict)
    print(xgbr2_predict)
    accuracy = (y_test == xgbr2_predict).sum() / len(xgbr2_predict)
    acc_xgbr2.append(accuracy)

print("Accuracy of RF:", acc_xgbr2)
print("Mean of Accuracy of RF:", sum(acc_xgbr2)/10)

[0.9976149  0.9976149  0.9976149  ... 0.00238363 0.9976149  0.00238363]
[1. 1. 1. ... 0. 1. 0.]
[0.99761474 0.99761474 0.99761474 ... 0.99761474 0.99761474 0.99761474]
[1. 1. 1. ... 1. 1. 1.]
[0.9976149  0.9976149  0.00238364 ... 0.00238364 0.9976149  0.00238364]
[1. 1. 0. ... 0. 1. 0.]
[0.99761486 0.00238357 0.00238357 ... 0.00238357 0.00238357 0.00238357]
[1. 0. 0. ... 0. 0. 0.]
[0.99761486 0.99761486 0.00238357 ... 0.00238357 0.99761486 0.99761486]
[1. 1. 0. ... 0. 1. 1.]
[0.99761504 0.00238371 0.99761504 ... 0.99761504 0.99761504 0.00238371]
[1. 0. 1. ... 1. 1. 0.]
[0.997615   0.997615   0.997615   ... 0.00238366 0.997615   0.00238366]
[1. 1. 1. ... 0. 1. 0.]
[0.00238354 0.9976148  0.9976148  ... 0.9976148  0.00238354 0.00238354]
[0. 1. 1. ... 1. 0. 0.]
[0.0023838  0.99761516 0.99761516 ... 0.99761516 0.99761516 0.99761516]
[0. 1. 1. ... 1. 1. 1.]
[0.9976148  0.00238355 0.9976148  ... 0.9976148  0.00238355 0.9976148 ]
[1. 0. 1. ... 1. 0. 1.]
Accuracy of RF: [1.0, 1.0, 1.0, 1.0, 1.0

In [42]:
xgbr_pred2 = cross_val_predict(xgbr2,tsr_6_input,tsr_6_output,cv = 10)
xgbr_pred2 = np.round(xgbr_pred2)
confusion_matrix(tsr_6_output, xgbr_pred2)

array([[2216,    0],
       [   0, 1935]], dtype=int64)

In [43]:
xgbr3 = XGBRegressor(n_estimators  = 15, objective="reg:squarederror", random_state = 19) 
xgbr_scores3 = cross_val_score(xgbr3,tsr_6_input_nomrs,tsr_6_output,cv = 10, scoring='r2')
print(xgbr_scores3)
print("Mean of R^2:", xgbr_scores3.mean())
print("Std of R^2:", xgbr_scores3.std())

[0.66684919 0.7667298  0.81267796 0.76909761 0.77169132 0.7166649
 0.78748053 0.68365637 0.83661724 0.61305075]
Mean of R^2: 0.7424515666316938
Std of R^2: 0.06675556623211351


In [44]:
acc_xgbr3 = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(tsr_6_input_nomrs, tsr_6_output, test_size=0.3, random_state=i)
    xgbr3.fit(X_train,y_train)
    xgbr3_predict = xgbr3.predict(X_test)
    print(xgbr3_predict)
    xgbr3_predict = np.round(xgbr3_predict)
    print(xgbr3_predict)
    accuracy = (y_test == xgbr3_predict).sum() / len(xgbr3_predict)
    acc_xgbr3.append(accuracy)

print("Accuracy of RF:", acc_xgbr3)
print("Mean of Accuracy of RF:", sum(acc_xgbr3)/10)

[ 9.1425282e-01  8.7836862e-01  9.2156327e-01 ... -8.7596243e-04
  9.8789114e-01  4.4615595e-03]
[ 1.  1.  1. ... -0.  1.  0.]
[ 1.0023288 -0.0074449  0.8080402 ...  0.8441419  0.9192201  1.0176051]
[ 1. -0.  1. ...  1.  1.  1.]
[0.8695944  0.9827018  0.0012117  ... 0.4121816  0.9345941  0.00659465]
[1. 1. 0. ... 0. 1. 0.]
[ 0.2781615   0.01376157 -0.00159715 ...  0.01517371 -0.00272453
  0.01114556]
[ 0.  0. -0. ...  0. -0.  0.]
[0.7300358  0.6739242  0.7824044  ... 0.00269532 0.93173903 0.91431457]
[1. 1. 1. ... 0. 1. 1.]
[ 0.98446137  0.01122049  0.8603599  ...  1.0189377   0.7383886
 -0.00197598]
[ 1.  0.  1. ...  1.  1. -0.]
[9.5698106e-01 9.9468923e-01 9.3026048e-01 ... 1.5293702e-04 9.9763811e-01
 3.3327300e-02]
[1. 1. 1. ... 0. 1. 0.]
[ 0.0299752   1.0103089   0.99754614 ...  0.9980083  -0.00706562
  0.0388917 ]
[ 0.  1.  1. ...  1. -0.  0.]
[0.01814644 0.58289903 1.0050956  ... 0.808921   0.9621824  0.90636224]
[0. 1. 1. ... 1. 1. 1.]
[ 1.086577    0.06123479  1.0174953  ...  

In [45]:
xgbr_pred3 = cross_val_predict(xgbr3,tsr_6_input_nomrs,tsr_6_output,cv = 10)
xgbr_pred3 = np.round(xgbr_pred3)
confusion_matrix(tsr_6_output, xgbr_pred3)

array([[2008,  208],
       [ 118, 1817]], dtype=int64)

# Summary

## Mean & Std

In [46]:
svr_mean = np.array([svr_scores.mean(), svr_scores.std(), svr_scores[0], svr_scores[1], svr_scores[2], svr_scores[3],
                     svr_scores[4], svr_scores[5], svr_scores[6], svr_scores[7], svr_scores[8], svr_scores[9]])
rfr_mean = np.array([rfr_scores.mean(), rfr_scores.std(), rfr_scores[0], rfr_scores[1], rfr_scores[2], rfr_scores[3],rfr_scores[4], 
                    rfr_scores[5], rfr_scores[6], rfr_scores[7], rfr_scores[8], rfr_scores[9]])
xgbr_mean = np.array([xgbr_scores.mean(), xgbr_scores.std(), xgbr_scores[0], xgbr_scores[1], xgbr_scores[2], xgbr_scores[3],
                     xgbr_scores[4], xgbr_scores[5], xgbr_scores[6], xgbr_scores[7], xgbr_scores[8], xgbr_scores[9]])
svr_mean2 = np.array([svr_scores2.mean(), svr_scores2.std(), svr_scores2[0], svr_scores2[1], svr_scores2[2], svr_scores2[3],
                     svr_scores2[4], svr_scores2[5], svr_scores2[6], svr_scores2[7], svr_scores2[8], svr_scores2[9]])
rfr_mean2 = np.array([rfr_scores2.mean(), rfr_scores2.std(), rfr_scores2[0], rfr_scores2[1], rfr_scores2[2], rfr_scores2[3],rfr_scores2[4], 
                    rfr_scores2[5], rfr_scores2[6], rfr_scores2[7], rfr_scores2[8], rfr_scores2[9]])
xgbr_mean2 = np.array([xgbr_scores2.mean(), xgbr_scores2.std(), xgbr_scores2[0], xgbr_scores2[1], xgbr_scores2[2], xgbr_scores2[3],
                     xgbr_scores2[4], xgbr_scores2[5], xgbr_scores2[6], xgbr_scores2[7], xgbr_scores2[8], xgbr_scores2[9]])

In [47]:
tsr_6_mean = pd.DataFrame([svr_mean, rfr_mean, xgbr_mean, svr_mean2, rfr_mean2, xgbr_mean2]).T
tsr_6_mean.index = ["Mean", "Std", "R^2_1", "R^2_2", "R^2_3", "R^2_4", "R^2_5", "R^2_6", "R^2_7", "R^2_8", "R^2_9", "R^2_10"]
tsr_6_mean.columns = ["svr", 'rfr', 'xgbr', 'svr2', 'rfr2', 'xgbr2']

In [48]:
csv_save = os.path.join(".", "tsr_6_mean_regression.csv")
tsr_6_mean.to_csv(csv_save, index = True)