In [1]:
import pandas as pd
import numpy as np
from scipy import stats
from tqdm import tqdm

In [2]:
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

In [3]:
data = pd.read_csv("TwoNullDataset.csv")

In [4]:
data

Unnamed: 0,PATIENTID,Site,MORPH_ICD10_O2,BEHAVIOUR_ICD10_O2,T_BEST,N_BEST,M_BEST,GRADE,AGE,SEX,...,CLINICAL_TRIAL,CHEMO_RADIATION,REGIMEN_MOD_TIME_DELAY,REGIMEN_MOD_STOPPED_EARLY,REGIMEN_OUTCOME_SUMMARY,CYCLE_NUMBER,ACTUAL_DOSE_PER_ADMINISTRATION,ADMINISTRATION_ROUTE,DRUG_GROUP,diff
0,40060864,C34,8140.0,MALIGNANT,1a,2.0,0,G1,64,FEMALE,...,Not Taking Part,NO,NO,NO,0.0,3,150.0,2,ERLOTINIB,1598
1,40126475,C34,8230.0,MALIGNANT,3,0,0,GX,70,MALE,...,Not Taking Part,NO,NO,NO,0.0,1,250.0,1,ETOPOSIDE,1673
2,40088098,C34,8070.0,MALIGNANT,2a,0,1b,GX,76,MALE,...,Not Taking Part,NO,NO,NO,0,1,600.0,1,CARBOPLATIN,2005
3,40018498,C34,8070.0,MALIGNANT,3,0,0,GX,47,MALE,...,Not Taking Part,NO,NO,NO,5,5,110.0,2,VINORELBINE,1421
4,120017191,C34,8010.0,MALIGNANT,2a,0,0,GX,73,MALE,...,Not Taking Part,NO,NO,NO,0,1,128.0,1,CISPLATIN,897
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,40019453,C34,8140.0,MALIGNANT,3,3,1b,GX,57,MALE,...,Not Taking Part,NO,YES,NO,0.0,4,8.0,2,NOT CHEMO,174
946,40089637,C34,8070.0,MALIGNANT,4,2,1b,GX,76,MALE,...,Not Taking Part,NO,NO,NO,0.0,4,70.0,2,VINORELBINE,383
947,40109002,C34,8046.0,MALIGNANT,3,0.0,0,GX,59,FEMALE,...,Not Taking Part,NO,YES,NO,0.0,6,40.0,1,NOT CHEMO,250
948,40080704,C34,8070.0,MALIGNANT,4,3,1b,GX,69,MALE,...,Not Taking Part,NO,NO,NO,0.0,1,560.0,1,CARBOPLATIN,162


In [5]:
data= data.rename(columns={"MORPH_ICD10_O2":"Morph","BEHAVIOUR_ICD10_O2":"Behaviour","T_BEST": "T Best",
                          "N_BEST" : "N Best", "M_BEST" : "M Best", "GRADE" : "Grade", "AGE" : "Age",
                          "SEX":"Sex","CANCERCAREPLANINTENT":"Cancer Plan", "NEWVITALSTATUS" : "Vital Status",
                          "HEIGHT_AT_START_OF_REGIMEN":"Height","WEIGHT_AT_START_OF_REGIMEN":"Weight","MAPPED_REGIMEN":"Regimen","CLINICAL_TRIAL":"Clinical Trial",
                          "CHEMO_RADIATION":"Chemo Radiation","REGIMEN_MOD_TIME_DELAY":"Regimen Time Delay","REGIMEN_MOD_STOPPED_EARLY":"Regimen Stopped Early",
                          "REGIMEN_OUTCOME_SUMMARY":"Outcome","CYCLE_NUMBER":"Cycle","ACTUAL_DOSE_PER_ADMINISTRATION":"Dose Administration",
                           "ADMINISTRATION_ROUTE":"Administration Route", "DRUG_GROUP":"Drug Group","ACE27":"ACE"})


In [6]:
from sklearn import preprocessing
import matplotlib.pyplot as plt
plt.rc("font", size = 14)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import seaborn as sns
sns.set(style="white")
sns.set(style="whitegrid",color_codes=True)

from sklearn.preprocessing import OneHotEncoder

In [7]:
data['Vital Status'].value_counts()

D    554
A    396
Name: Vital Status, dtype: int64

In [8]:
data = data.rename(columns={"Vital Status":"Alive"})

In [9]:
cat_vars = data.columns.tolist()
cat_vars.remove("Site")
cat_vars.remove("Height")
cat_vars.remove("Weight")
cat_vars.remove("Morph")
cat_vars.remove("Age")
cat_vars.remove("Cycle")
cat_vars.remove("Dose Administration")
cat_vars.remove("diff")
cat_vars

['PATIENTID',
 'Behaviour',
 'T Best',
 'N Best',
 'M Best',
 'Grade',
 'Sex',
 'Cancer Plan',
 'CNS',
 'ACE',
 'Alive',
 'Regimen',
 'Clinical Trial',
 'Chemo Radiation',
 'Regimen Time Delay',
 'Regimen Stopped Early',
 'Outcome',
 'Administration Route',
 'Drug Group']

In [10]:
num_vars = ["diff","Height","Weight","Morph","Age","Cycle","Dose Administration"]

In [11]:
data_final = data.copy()
dete = pd.get_dummies(data_final)
dete

Unnamed: 0,PATIENTID,Morph,Age,ACE,Height,Weight,Cycle,Dose Administration,Administration Route,diff,...,Drug Group_PAZOPANIB,Drug Group_PEMBROLIZUMAB,Drug Group_PEMETREXED,Drug Group_RITUXIMAB,Drug Group_STEROID,Drug Group_TOPOTECAN,Drug Group_TRASTUZUMAB,Drug Group_TRIAL,Drug Group_VINCRISTINE,Drug Group_VINORELBINE
0,40060864,8140.0,64,9,1.63,92.4,3,150.0,2,1598,...,0,0,0,0,0,0,0,0,0,0
1,40126475,8230.0,70,9,1.64,67.6,1,250.0,1,1673,...,0,0,0,0,0,0,0,0,0,0
2,40088098,8070.0,76,9,1.83,80.2,1,600.0,1,2005,...,0,0,0,0,0,0,0,0,0,0
3,40018498,8070.0,47,9,1.56,68.0,5,110.0,2,1421,...,0,0,0,0,0,0,0,0,0,1
4,120017191,8010.0,73,9,1.72,88.0,1,128.0,1,897,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,40019453,8140.0,57,0,1.65,62.5,4,8.0,2,174,...,0,0,0,0,0,0,0,0,0,0
946,40089637,8070.0,76,9,1.55,82.6,4,70.0,2,383,...,0,0,0,0,0,0,0,0,0,1
947,40109002,8046.0,59,9,1.59,57.1,6,40.0,1,250,...,0,0,0,0,0,0,0,0,0,0
948,40080704,8070.0,69,9,1.75,91.9,1,560.0,1,162,...,0,0,0,0,0,0,0,0,0,0


In [12]:
#X = X.drop(columns=["Site","Alive"], axis = 1)
for var in cat_vars:
    data_final = pd.concat((data_final,pd.get_dummies(data[var], prefix = var)),1)


Ytrain = data_final[["diff"]]
Xtrain = data_final.drop(columns=["Site","diff"])
for var in cat_vars:
    Xtrain = Xtrain.drop(columns=[var])


  data_final = pd.concat((data_final,pd.get_dummies(data[var], prefix = var)),1)


In [13]:
Ytrain

Unnamed: 0,diff
0,1598
1,1673
2,2005
3,1421
4,897
...,...
945,174
946,383
947,250
948,162


In [14]:
Xtrain

Unnamed: 0,Morph,Age,Height,Weight,Cycle,Dose Administration,PATIENTID_10276519,PATIENTID_10277939,PATIENTID_10279100,PATIENTID_10279378,...,Drug Group_PAZOPANIB,Drug Group_PEMBROLIZUMAB,Drug Group_PEMETREXED,Drug Group_RITUXIMAB,Drug Group_STEROID,Drug Group_TOPOTECAN,Drug Group_TRASTUZUMAB,Drug Group_TRIAL,Drug Group_VINCRISTINE,Drug Group_VINORELBINE
0,8140.0,64,1.63,92.4,3,150.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8230.0,70,1.64,67.6,1,250.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8070.0,76,1.83,80.2,1,600.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8070.0,47,1.56,68.0,5,110.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,8010.0,73,1.72,88.0,1,128.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
945,8140.0,57,1.65,62.5,4,8.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
946,8070.0,76,1.55,82.6,4,70.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
947,8046.0,59,1.59,57.1,6,40.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
948,8070.0,69,1.75,91.9,1,560.0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
X_train, X_test, Y_train, Y_test = train_test_split(Xtrain, Ytrain, test_size = 0.3, random_state = 30)

# XGBoost

In [16]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import r2_score


In [17]:
xgb_r = xgb.XGBRegressor(objective ='reg:squarederror', n_estimators = 100, seed = 123)
  
# Fitting the model
xgb_r.fit(X_train, Y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=123,
             reg_alpha=0, reg_lambda=1, ...)

In [18]:
pred = xgb_r.predict(X_test)

In [19]:
# RMSE Computation
rmse = np.sqrt(MSE(Y_test, pred))
print("RMSE : % f" %(rmse))

RMSE :  421.582749


In [20]:
r2_score(Y_test, pred)

0.5603897723765621

# Explainable Boosting Machine (EBM)


In [21]:
from interpret.glassbox import ExplainableBoostingRegressor
from interpret import show

In [22]:
ebm = ExplainableBoostingRegressor()
ebm.fit(X_train, Y_train)

#ebm_global = ebm.explain_global()
#show(emb_global)

#ebm_local = ebm.explain_local(X_test[:5], Y_test[:5])
#show(ebm_local)

ExplainableBoostingRegressor()

In [23]:
preds = ebm.predict(X_test)

In [24]:
# RMSE Computation
rmse = np.sqrt(MSE(Y_test, preds))
print("RMSE : % f" %(rmse))

RMSE :  446.864478


In [25]:
r2_score(Y_test, preds)

0.5060832074791729

# Random Forest

In [26]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
#Create a Gaussian Classifier
clf=RandomForestRegressor(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,Y_train.values.ravel())
pred=clf.predict(X_test)

In [28]:
# RMSE Computation
rmse = np.sqrt(MSE(Y_test, pred))
print("RMSE : % f" %(rmse))

RMSE :  411.575867


In [29]:
r2_score(Y_test, pred)

0.581011668965979

# CatBoost

In [30]:
from catboost import CatBoostRegressor

In [31]:
model = CatBoostRegressor()
# Fit model
model.fit(X_train, Y_train)
# Get predictions
pred = model.predict(X_test)

Learning rate set to 0.038387
0:	learn: 648.5841833	total: 147ms	remaining: 2m 27s
1:	learn: 636.1729538	total: 152ms	remaining: 1m 15s
2:	learn: 624.4401056	total: 156ms	remaining: 51.9s
3:	learn: 613.1949875	total: 160ms	remaining: 39.9s
4:	learn: 603.2908647	total: 164ms	remaining: 32.6s
5:	learn: 593.4481972	total: 168ms	remaining: 27.8s
6:	learn: 583.6563807	total: 171ms	remaining: 24.3s
7:	learn: 574.3785182	total: 175ms	remaining: 21.7s
8:	learn: 565.5850310	total: 179ms	remaining: 19.8s
9:	learn: 557.6912431	total: 184ms	remaining: 18.2s
10:	learn: 549.7645607	total: 188ms	remaining: 16.9s
11:	learn: 542.6887094	total: 192ms	remaining: 15.8s
12:	learn: 535.7550827	total: 196ms	remaining: 14.9s
13:	learn: 529.4265113	total: 201ms	remaining: 14.1s
14:	learn: 523.3801854	total: 205ms	remaining: 13.4s
15:	learn: 517.9153771	total: 209ms	remaining: 12.8s
16:	learn: 512.4996584	total: 212ms	remaining: 12.3s
17:	learn: 507.5958398	total: 217ms	remaining: 11.8s
18:	learn: 502.8700375	t

201:	learn: 348.9389055	total: 917ms	remaining: 3.62s
202:	learn: 348.7580113	total: 921ms	remaining: 3.62s
203:	learn: 348.5902244	total: 925ms	remaining: 3.61s
204:	learn: 348.2994222	total: 929ms	remaining: 3.6s
205:	learn: 348.0309957	total: 933ms	remaining: 3.6s
206:	learn: 347.5062519	total: 937ms	remaining: 3.59s
207:	learn: 347.2884709	total: 940ms	remaining: 3.58s
208:	learn: 347.1277276	total: 945ms	remaining: 3.57s
209:	learn: 346.8164650	total: 948ms	remaining: 3.57s
210:	learn: 346.6474173	total: 952ms	remaining: 3.56s
211:	learn: 346.4641283	total: 956ms	remaining: 3.55s
212:	learn: 346.2565575	total: 960ms	remaining: 3.55s
213:	learn: 346.0721512	total: 964ms	remaining: 3.54s
214:	learn: 345.7664694	total: 967ms	remaining: 3.53s
215:	learn: 345.3084389	total: 971ms	remaining: 3.52s
216:	learn: 345.0511940	total: 975ms	remaining: 3.52s
217:	learn: 344.8716762	total: 978ms	remaining: 3.51s
218:	learn: 344.5267327	total: 982ms	remaining: 3.5s
219:	learn: 344.2981217	total: 

389:	learn: 295.1592480	total: 1.64s	remaining: 2.57s
390:	learn: 295.0503271	total: 1.65s	remaining: 2.56s
391:	learn: 294.9430484	total: 1.65s	remaining: 2.56s
392:	learn: 294.5855168	total: 1.65s	remaining: 2.56s
393:	learn: 294.4777921	total: 1.66s	remaining: 2.55s
394:	learn: 294.3714445	total: 1.66s	remaining: 2.55s
395:	learn: 293.9085490	total: 1.67s	remaining: 2.54s
396:	learn: 293.8059838	total: 1.67s	remaining: 2.54s
397:	learn: 293.1303689	total: 1.67s	remaining: 2.53s
398:	learn: 292.6341477	total: 1.68s	remaining: 2.53s
399:	learn: 292.5376302	total: 1.68s	remaining: 2.52s
400:	learn: 292.2178581	total: 1.69s	remaining: 2.52s
401:	learn: 292.1122716	total: 1.69s	remaining: 2.51s
402:	learn: 292.0061020	total: 1.69s	remaining: 2.51s
403:	learn: 291.9013838	total: 1.7s	remaining: 2.5s
404:	learn: 291.7427693	total: 1.7s	remaining: 2.5s
405:	learn: 290.7967086	total: 1.7s	remaining: 2.49s
406:	learn: 290.6923388	total: 1.71s	remaining: 2.49s
407:	learn: 290.0309254	total: 1.

577:	learn: 258.8875762	total: 2.37s	remaining: 1.73s
578:	learn: 258.8081366	total: 2.38s	remaining: 1.73s
579:	learn: 258.7289405	total: 2.38s	remaining: 1.72s
580:	learn: 258.6186667	total: 2.38s	remaining: 1.72s
581:	learn: 258.5397458	total: 2.39s	remaining: 1.72s
582:	learn: 258.4601420	total: 2.39s	remaining: 1.71s
583:	learn: 258.3816501	total: 2.4s	remaining: 1.71s
584:	learn: 258.3033823	total: 2.4s	remaining: 1.7s
585:	learn: 258.2252691	total: 2.4s	remaining: 1.7s
586:	learn: 258.1477195	total: 2.41s	remaining: 1.69s
587:	learn: 258.0703529	total: 2.41s	remaining: 1.69s
588:	learn: 257.9918497	total: 2.41s	remaining: 1.68s
589:	learn: 257.9144302	total: 2.42s	remaining: 1.68s
590:	learn: 257.8371355	total: 2.42s	remaining: 1.68s
591:	learn: 257.7601837	total: 2.42s	remaining: 1.67s
592:	learn: 257.6831994	total: 2.43s	remaining: 1.67s
593:	learn: 257.6065961	total: 2.43s	remaining: 1.66s
594:	learn: 257.5299727	total: 2.44s	remaining: 1.66s
595:	learn: 257.4535276	total: 2.

767:	learn: 232.6340131	total: 3.1s	remaining: 936ms
768:	learn: 232.5707643	total: 3.1s	remaining: 932ms
769:	learn: 232.5067860	total: 3.11s	remaining: 928ms
770:	learn: 232.0674094	total: 3.11s	remaining: 924ms
771:	learn: 232.0048098	total: 3.12s	remaining: 920ms
772:	learn: 231.9060460	total: 3.12s	remaining: 916ms
773:	learn: 231.8437121	total: 3.12s	remaining: 912ms
774:	learn: 231.7814677	total: 3.13s	remaining: 908ms
775:	learn: 231.7190605	total: 3.13s	remaining: 904ms
776:	learn: 231.5114544	total: 3.13s	remaining: 900ms
777:	learn: 231.4492025	total: 3.14s	remaining: 896ms
778:	learn: 231.3872767	total: 3.14s	remaining: 891ms
779:	learn: 231.2935290	total: 3.15s	remaining: 887ms
780:	learn: 231.2223965	total: 3.15s	remaining: 883ms
781:	learn: 231.1603109	total: 3.15s	remaining: 879ms
782:	learn: 230.8298768	total: 3.16s	remaining: 875ms
783:	learn: 230.7676812	total: 3.16s	remaining: 871ms
784:	learn: 230.4944916	total: 3.16s	remaining: 867ms
785:	learn: 230.2987434	total:

959:	learn: 210.5962787	total: 3.83s	remaining: 159ms
960:	learn: 210.5444176	total: 3.83s	remaining: 155ms
961:	learn: 210.4918132	total: 3.83s	remaining: 151ms
962:	learn: 210.4389724	total: 3.84s	remaining: 148ms
963:	learn: 210.2890361	total: 3.85s	remaining: 144ms
964:	learn: 209.8769215	total: 3.85s	remaining: 140ms
965:	learn: 209.8251884	total: 3.85s	remaining: 136ms
966:	learn: 209.7733330	total: 3.86s	remaining: 132ms
967:	learn: 209.7216204	total: 3.86s	remaining: 128ms
968:	learn: 209.4231575	total: 3.86s	remaining: 124ms
969:	learn: 209.3712771	total: 3.87s	remaining: 120ms
970:	learn: 209.1035143	total: 3.87s	remaining: 116ms
971:	learn: 209.0518746	total: 3.88s	remaining: 112ms
972:	learn: 208.8807284	total: 3.88s	remaining: 108ms
973:	learn: 208.8290633	total: 3.88s	remaining: 104ms
974:	learn: 208.3091467	total: 3.89s	remaining: 99.7ms
975:	learn: 208.2394748	total: 3.89s	remaining: 95.7ms
976:	learn: 208.1758418	total: 3.9s	remaining: 91.7ms
977:	learn: 208.1242294	to

In [32]:
# RMSE Computation
rmse = np.sqrt(MSE(Y_test, pred))
print("RMSE : % f" %(rmse))

RMSE :  417.967999


In [33]:
r2_score(Y_test, pred)

0.5678960975779539

# Artificial Neural Network (ANN)

In [34]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [35]:
torch.manual_seed(1)  # Set seed for reproducibility.
trainloader = torch.utils.data.DataLoader(data, batch_size=10, shuffle=True, num_workers=1)

In [37]:
n_input, n_hidden, n_out, batch_size, learning_rate = 1052, 500, 1, 100, 0.01

input_tensor = torch.from_numpy(X_train.to_numpy()).type(torch.FloatTensor)
label_tensor = torch.from_numpy(Y_train.to_numpy()).type(torch.FloatTensor)
test_input_tensor = torch.from_numpy(X_test.to_numpy()).type(torch.FloatTensor)

model = nn.Sequential(nn.Linear(n_input, n_hidden),
                      nn.ReLU(),
                      nn.Linear(n_hidden, n_out),
                      )
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

losses = []
for epoch in range(5000):
    
    pred = model(input_tensor)
    loss = loss_function(pred, label_tensor)
    losses.append(loss.item())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

In [38]:
new_pred = model(test_input_tensor)
new_pred

tensor([[1243.0995],
        [1144.1577],
        [ 771.1276],
        [1532.4316],
        [2195.6243],
        [ 645.2599],
        [ 523.4536],
        [1481.4688],
        [1403.2515],
        [1773.1488],
        [ 137.5220],
        [1091.6593],
        [ 358.8759],
        [1044.0382],
        [ 335.9128],
        [ 301.1922],
        [ 931.3238],
        [1557.9773],
        [ 548.9955],
        [ 215.1204],
        [ 547.1447],
        [1590.1532],
        [1408.7618],
        [1876.0822],
        [ 488.2521],
        [ 703.4067],
        [ 426.4345],
        [ 593.8884],
        [ 421.6435],
        [ 653.5319],
        [ 400.1796],
        [ 741.6392],
        [1230.3676],
        [ 450.2458],
        [ 814.8197],
        [1256.0426],
        [ 591.4106],
        [1124.8906],
        [ 974.0771],
        [ 255.4327],
        [ 294.6690],
        [1259.8805],
        [ 612.9322],
        [ 353.4044],
        [ 653.8937],
        [ 684.9293],
        [ 407.4479],
        [1166

In [39]:
test_label_tensor = torch.from_numpy(Y_test.to_numpy()).type(torch.FloatTensor)

In [40]:
from torchmetrics.functional import mean_squared_error
from torchmetrics.functional import r2_score

In [41]:
mean_squared_error(new_pred, test_label_tensor, squared = False)

tensor(437.4086, grad_fn=<SqrtBackward0>)

In [42]:
r2_score(new_pred, test_label_tensor)

tensor(0.5268, grad_fn=<MeanBackward0>)