# Load and visualize data

In [46]:
import pandas as pd
import numpy as np
import os

from preprocessor.tabular import tabular_processor_ss as tp

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [68]:
# Location of the HECKTOR 2022 clinical table. The directory can be overridden
# with the HECKTOR_DATA_DIR environment variable so the notebook is runnable on
# machines other than the author's; when the variable is unset (or empty) the
# original location is used, so existing behaviour is unchanged.
data_dir = os.environ.get("HECKTOR_DATA_DIR") or "/home/m143713/research/capstone/data"
data_file = os.path.join(data_dir, "hecktor2022_training_data.csv")
hecktor_processor = tp.TabularProcessor(input_file=data_file, anonymize=False)

# Name of the HPV column as it is passed to the processor before renaming;
# defined once so the imputation and rename steps cannot drift apart.
raw_hpv_column = "hpv_status_(0=-,_1=+)"

# Impute categorical fields (the HPV column still carries its raw name here,
# because the rename happens further down).
hecktor_processor.impute_categorical_data(["tobacco", "alcohol", "performance_status", raw_hpv_column, "surgery"])
# Impute numerical fields
hecktor_processor.impute_weight(["weight"])
# One-hot encode gender; the NaN report below lists a resulting `gender_m` column.
hecktor_processor.one_hot_encoder(['gender'])
# Give the HPV column a short name for the rest of the pipeline.
hecktor_processor.rename_columns({raw_hpv_column: 'hpv_status'})
# Cast the imputed categorical columns to int32. This runs after the rename,
# so the HPV column is addressed by its new name.
convert_dict = {'tobacco': 'int32', 'alcohol': 'int32', 'performance_status': 'int32', 'hpv_status': 'int32', 'surgery': 'int32'}
hecktor_processor.convert_data_types(convert_dict)
# Keep only one of the gender indicator columns (`gender_m`).
hecktor_processor.drop_columns(['gender_f'])
# Print the per-column NaN count as a sanity check after imputation.
hecktor_processor.check_for_nan()


Count total NaN at each column in a DataFrame : 
 patientid             0
task_1                0
task_2                0
centerid              0
age                   0
weight                0
tobacco               0
alcohol               0
performance_status    0
hpv_status            0
surgery               0
chemotherapy          0
relapse               0
rfs                   0
gender_m              0
dtype: int64


In [69]:
# Display the processor's current table to eyeball the result of the
# preprocessing steps above (presumably a pandas DataFrame — confirm in
# TabularProcessor).
hecktor_processor.data

Unnamed: 0,patientid,task_1,task_2,centerid,age,weight,tobacco,alcohol,performance_status,hpv_status,surgery,chemotherapy,relapse,rfs,gender_m
0,chum-001,1,1,1,82,80.000000,-1,-1,-1,-1,-1,1,0,1704,1
1,chum-002,1,1,1,73,55.000000,-1,-1,-1,-1,-1,1,1,439,1
2,chum-006,1,1,1,65,101.000000,-1,-1,-1,-1,-1,1,0,1186,1
3,chum-007,1,1,1,70,80.000000,-1,-1,-1,-1,-1,0,0,1702,0
4,chum-008,1,1,1,67,91.000000,-1,-1,-1,-1,-1,1,0,1499,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
483,hmr-028,1,1,7,73,87.000000,-1,-1,-1,-1,-1,1,0,419,1
484,hmr-029,1,1,7,57,96.090909,-1,-1,-1,1,-1,1,0,1736,1
485,hmr-030,1,1,7,70,96.090909,-1,-1,-1,-1,-1,1,0,1385,1
486,hmr-034,1,1,7,85,53.000000,-1,-1,-1,-1,-1,1,0,1570,0


In [70]:
# Split options: a 0.1 test ratio, a validation ratio of 0, and a fixed
# random_state (presumably so the partition is repeatable — confirm in
# TabularProcessor.random_split).
split_options = {"test_ratio": 0.1, "valid_ratio": 0, "random_state": 1042}
hecktor_processor.random_split(**split_options)

# Upsample the training data to a total of 1200 rows.
upsample_options = {"total_rows": 1200}
hecktor_processor.upsample_train_data(**upsample_options)

# Write the splits to disk; the survival-target cell reads train_data.csv and
# test_data.csv from this same directory.
hecktor_processor.save_splits("../data/task2")

# Calculate nnet-survival output

In [73]:

from preprocessor.nnet_survival import nnet_survival
import pandas as pd
import numpy as np

# Read the train and test splits back from disk (presumably the files written
# by save_splits — confirm) and pull out the follow-up time (`rfs`) and event
# indicator (`relapse`) for both groups.
train_data = pd.read_csv("../data/task2/train_data.csv")
test_data = pd.read_csv("../data/task2/test_data.csv")

time, event = train_data['rfs'], train_data['relapse']
timeTest, eventTest = test_data['rfs'], test_data['relapse']

# Columns used as model inputs, in the order they appear in the feature matrix.
feature_columns = [
    'centerid',
    'age',
    'weight',
    'tobacco',
    'alcohol',
    'performance_status',
    'hpv_status',
    'surgery',
    'chemotherapy',
    'gender_m',
]
x_train = train_data[feature_columns].values


# Convert event data to array format.
#
# The interval boundaries are the times at which an exponential decay with the
# given half-life has lost 0%, 5%, ..., 95% of its initial value, so intervals
# get wider as time increases. (The half-life is 4 * 365; assumes `rfs` is
# measured in days — TODO confirm.)
halflife = 365. * 4
cumulative_fractions = np.arange(0.0, 0.96, 0.05)
breaks = -np.log(1 - cumulative_fractions) * halflife / np.log(2)
# Alternative, hand-picked boundaries kept for reference:
#breaks=np.concatenate((np.arange(0,200,10),np.arange(200,1001,25)))

n_intervals = breaks.size - 1
# Width of each interval (difference between consecutive boundaries).
timegap = np.diff(breaks)
# Build the training target array from time, event flag and boundaries.
y_train_array = nnet_survival.make_surv_array(time, event, breaks)


In [72]:
# Inspect the training feature matrix built from the selected columns.
x_train

array([[ 5., 66., 97., ...,  0.,  1.,  1.],
       [ 5., 52., 77., ...,  0.,  0.,  1.],
       [ 5., 46., 72., ...,  0.,  1.,  1.],
       ...,
       [ 5., 58., 83., ...,  0.,  0.,  1.],
       [ 5., 56., 93., ...,  0.,  1.,  1.],
       [ 5., 51., 90., ...,  1.,  1.,  0.]])

In [25]:
# Wrap the survival target array in a DataFrame so it can be viewed as a table.
df = pd.DataFrame(y_train_array)

In [26]:
# Display the survival target table (one row per training sample).
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,37
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1196,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1197,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1198,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Inspect the training feature matrix again.
# NOTE(review): the output saved under this cell shows 'm'/'f' strings and
# dtype=object, which does not match the numeric columns selected when x_train
# is built; together with the out-of-order execution counts this suggests the
# saved output is stale. Restart the kernel and run all cells to refresh it.
x_train

array([[1, 'm', 59, ..., -1, -1, 1],
       [5, 'm', 41, ..., 1, 0, 1],
       [5, 'm', 55, ..., 1, 0, 1],
       ...,
       [5, 'f', 47, ..., 1, 0, 0],
       [5, 'm', 53, ..., 1, 0, 1],
       [1, 'm', 66, ..., -1, -1, 1]], dtype=object)