# Final Test RFS
- input: test file
- output: ID | RFS

## Process
- Replace 999 to Nan
- SimpleImputer
- OrdinalEncoder
- StandardScaler
- RandomForestRegressor -> get selected features
- load each model
- Voting

## Data Preprocessing

In [13]:
import pandas as pd
import numpy as np

### Read data

In [14]:
test_df = pd.read_excel('./data/TestDatasetExample.xls')
id = test_df['ID']
id

0    TRG002728
1    TRG002649
2    TRG002628
Name: ID, dtype: object

In [15]:
test_df = test_df.drop(['ID'], axis=1)
test_df

Unnamed: 0,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,TumourStage,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,56.881588,0,0,0,1,3,3,999,0,2,...,0.194591,0.194591,2.846439,0.001281,4168474.0,131.044541,0.002335,0.109755,0.013383,0.002051
1,60.0,0,0,1,0,2,1,1,0,3,...,0.309999,0.309996,2.975317,0.007253,173658.5,23.967478,0.011285,0.05589,0.003163,0.009553
2,58.234086,0,0,0,1,3,3,1,1,4,...,0.328377,0.328377,3.785966,0.003185,3607821.0,223.279556,0.001334,0.101628,0.010844,0.001194


### Load Components for Preprocessing

In [16]:
import pickle

imputer = pickle.load(open('./regression_pkl/imp_reg.pkl', 'rb'))
ordinal_encoder = pickle.load(open('./regression_pkl/OrdinalEncoder_reg.pkl', 'rb'))
standard_scaler = pickle.load(open('./regression_pkl/scale_reg.pkl', 'rb'))
rf_select = pickle.load(open('./regression_pkl/SelectFromModel.pkl', 'rb'))

### Define Columns

In [17]:
binary_only_col = ['ChemoGrade', 'Proliferation', 'TumourStage']
cate_only_col = [
    'pCR (outcome)',
    'ER',
    'PgR',
    'HER2',
    'TrippleNegative',
    'HistologyType',
    'LNStatus'
]

cate_col = binary_only_col + cate_only_col

con_col = [
    'RelapseFreeSurvival (outcome)',
    'Age', 
    'original_shape_Elongation',
    'original_shape_Flatness',
    'original_shape_LeastAxisLength',
    'original_shape_MajorAxisLength',
    'original_shape_Maximum2DDiameterColumn',
    'original_shape_Maximum2DDiameterRow',
    'original_shape_Maximum2DDiameterSlice',
    'original_shape_Maximum3DDiameter',
    'original_shape_MeshVolume',
    'original_shape_MinorAxisLength',
    'original_shape_Sphericity',
    'original_shape_SurfaceArea',
    'original_shape_SurfaceVolumeRatio',
    'original_shape_VoxelVolume',
    'original_firstorder_10Percentile',
    'original_firstorder_90Percentile',
    'original_firstorder_Energy',
    'original_firstorder_Entropy',
    'original_firstorder_InterquartileRange',
    'original_firstorder_Kurtosis',
    'original_firstorder_Maximum',
    'original_firstorder_MeanAbsoluteDeviation',
    'original_firstorder_Mean',
    'original_firstorder_Median',
    'original_firstorder_Minimum',
    'original_firstorder_Range',
    'original_firstorder_RobustMeanAbsoluteDeviation',
    'original_firstorder_RootMeanSquared',
    'original_firstorder_Skewness',
    'original_firstorder_TotalEnergy',
    'original_firstorder_Uniformity',
    'original_firstorder_Variance',
    'original_glcm_Autocorrelation',
    'original_glcm_ClusterProminence',
    'original_glcm_ClusterShade',
    'original_glcm_ClusterTendency',
    'original_glcm_Contrast',
    'original_glcm_Correlation',
    'original_glcm_DifferenceAverage',
    'original_glcm_DifferenceEntropy',
    'original_glcm_DifferenceVariance',
    'original_glcm_Id',
    'original_glcm_Idm',
    'original_glcm_Idmn',
    'original_glcm_Idn',
    'original_glcm_Imc1',
    'original_glcm_Imc2',
    'original_glcm_InverseVariance',
    'original_glcm_JointAverage',
    'original_glcm_JointEnergy',
    'original_glcm_JointEntropy',
    'original_glcm_MCC',
    'original_glcm_MaximumProbability',
    'original_glcm_SumAverage',
    'original_glcm_SumEntropy',
    'original_glcm_SumSquares',
    'original_gldm_DependenceEntropy',
    'original_gldm_DependenceNonUniformity',
    'original_gldm_DependenceNonUniformityNormalized',
    'original_gldm_DependenceVariance',
    'original_gldm_GrayLevelNonUniformity',
    'original_gldm_GrayLevelVariance',
    'original_gldm_HighGrayLevelEmphasis',
    'original_gldm_LargeDependenceEmphasis',
    'original_gldm_LargeDependenceHighGrayLevelEmphasis',
    'original_gldm_LargeDependenceLowGrayLevelEmphasis',
    'original_gldm_LowGrayLevelEmphasis',
    'original_gldm_SmallDependenceEmphasis',
    'original_gldm_SmallDependenceHighGrayLevelEmphasis',
    'original_gldm_SmallDependenceLowGrayLevelEmphasis',
    'original_glrlm_GrayLevelNonUniformity',
    'original_glrlm_GrayLevelNonUniformityNormalized',
    'original_glrlm_GrayLevelVariance',
    'original_glrlm_HighGrayLevelRunEmphasis',
    'original_glrlm_LongRunEmphasis',
    'original_glrlm_LongRunHighGrayLevelEmphasis',
    'original_glrlm_LongRunLowGrayLevelEmphasis',
    'original_glrlm_LowGrayLevelRunEmphasis',
    'original_glrlm_RunEntropy',
    'original_glrlm_RunLengthNonUniformity',
    'original_glrlm_RunLengthNonUniformityNormalized',
    'original_glrlm_RunPercentage',
    'original_glrlm_RunVariance',
    'original_glrlm_ShortRunEmphasis',
    'original_glrlm_ShortRunHighGrayLevelEmphasis',
    'original_glrlm_ShortRunLowGrayLevelEmphasis',
    'original_glszm_GrayLevelNonUniformity',
    'original_glszm_GrayLevelNonUniformityNormalized',
    'original_glszm_GrayLevelVariance',
    'original_glszm_HighGrayLevelZoneEmphasis',
    'original_glszm_LargeAreaEmphasis',
    'original_glszm_LargeAreaHighGrayLevelEmphasis',
    'original_glszm_LargeAreaLowGrayLevelEmphasis',
    'original_glszm_LowGrayLevelZoneEmphasis',
    'original_glszm_SizeZoneNonUniformity',
    'original_glszm_SizeZoneNonUniformityNormalized',
    'original_glszm_SmallAreaEmphasis',
    'original_glszm_SmallAreaHighGrayLevelEmphasis',
    'original_glszm_SmallAreaLowGrayLevelEmphasis',
    'original_glszm_ZoneEntropy',
    'original_glszm_ZonePercentage',
    'original_glszm_ZoneVariance',
    'original_ngtdm_Busyness',
    'original_ngtdm_Coarseness',
    'original_ngtdm_Complexity',
    'original_ngtdm_Contrast',
    'original_ngtdm_Strength'
]

cate_col_no_target = [ele for ele in cate_col if ele != 'pCR (outcome)']
con_col_no_target = [ele for ele in con_col if ele != 'RelapseFreeSurvival (outcome)']

### Run preprocess

In [18]:
# replace 999 with nan
test_df = test_df.replace(999, np.nan)

# data imputation
test_df_fake_target = test_df.copy()

test_df_fake_target.insert(0, 'RelapseFreeSurvival (outcome)', 0)
test_df_fake_target.insert(0, 'pCR (outcome)', 0)
df_imp = pd.DataFrame(imputer.transform(test_df_fake_target), columns=test_df_fake_target.columns)

# encode categorical data
df_cate_col_encode = df_imp.copy()
df_cate_col_encode[cate_col] = ordinal_encoder.transform(df_imp[cate_col])

# z-normalization
znorm_df = pd.DataFrame(standard_scaler.fit_transform(df_cate_col_encode[con_col_no_target]), columns=con_col_no_target)
df_normalized = pd.concat([znorm_df, df_cate_col_encode[cate_col_no_target]], axis=1)

# znorm_df = df_cate_col_encode.copy()
# znorm_df[con_col_no_target] = standard_scaler.transform(df_cate_col_encode[con_col_no_target])

# cate_col_with_two_target = cate_col + ["RelapseFreeSurvival (outcome)"]
# df_cate_col_encode_norm = pd.concat([df_cate_col_encode[cate_col_with_two_target], znorm_df], axis=1)

# drop fake target
# znorm_df.drop(['RelapseFreeSurvival (outcome)', 'pCR (outcome)'], axis=1, inplace=True)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dt

In [19]:
df_normalized

Unnamed: 0,Age,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,...,original_ngtdm_Strength,ChemoGrade,Proliferation,TumourStage,ER,PgR,HER2,TrippleNegative,HistologyType,LNStatus
0,-1.167208,-1.36729,1.314862,-0.537659,-0.660853,-0.578504,-1.3054,-0.21803,-0.686934,-0.158879,...,-0.589836,2.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.275138,0.996517,-0.206483,-0.863951,-0.752373,-0.828334,0.181587,-1.101087,-0.72709,-1.137552,...,1.408053,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.107929,0.370773,-1.108379,1.40161,1.413226,1.406839,1.123813,1.319117,1.414024,1.296431,...,-0.818217,2.0,2.0,3.0,0.0,0.0,0.0,1.0,0.0,1.0


### Select Features

In [20]:
# features selected from random forest feature selection
selected_features = ['Age', 'original_shape_Elongation', 'original_shape_Flatness',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_Maximum3DDiameter', 'original_shape_MinorAxisLength',
       'original_shape_Sphericity', 'original_shape_SurfaceVolumeRatio',
       'original_firstorder_10Percentile', 'original_firstorder_90Percentile',
       'original_firstorder_InterquartileRange',
       'original_firstorder_Kurtosis', 'original_firstorder_Maximum',
       'original_firstorder_MeanAbsoluteDeviation',
       'original_firstorder_Minimum', 'original_firstorder_Range',
       'original_firstorder_RobustMeanAbsoluteDeviation',
       'original_firstorder_RootMeanSquared', 'original_firstorder_Skewness',
       'original_firstorder_Variance', 'original_glcm_Imc1',
       'original_gldm_LargeDependenceLowGrayLevelEmphasis',
       'original_gldm_SmallDependenceEmphasis',
       'original_gldm_SmallDependenceHighGrayLevelEmphasis',
       'original_gldm_SmallDependenceLowGrayLevelEmphasis',
       'original_glrlm_RunEntropy', 'original_glrlm_RunLengthNonUniformity',
       'original_glrlm_RunVariance',
       'original_glrlm_ShortRunHighGrayLevelEmphasis',
       'original_glszm_GrayLevelNonUniformity',
       'original_glszm_SizeZoneNonUniformity',
       'original_glszm_SizeZoneNonUniformityNormalized',
       'original_glszm_SmallAreaEmphasis',
       'original_glszm_SmallAreaHighGrayLevelEmphasis',
       'original_glszm_ZoneEntropy', 'original_glszm_ZonePercentage']


X = znorm_df[selected_features]
X

Unnamed: 0,Age,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MinorAxisLength,original_shape_Sphericity,...,original_glrlm_RunLengthNonUniformity,original_glrlm_RunVariance,original_glrlm_ShortRunHighGrayLevelEmphasis,original_glszm_GrayLevelNonUniformity,original_glszm_SizeZoneNonUniformity,original_glszm_SizeZoneNonUniformityNormalized,original_glszm_SmallAreaEmphasis,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage
0,-1.167208,-1.36729,1.314862,-0.537659,-0.660853,-1.3054,-0.21803,-0.686934,-0.981456,1.398612,...,-0.017683,0.713997,-1.004683,-0.991523,-1.334243,0.146855,-1.402812,-1.402814,-0.856377,-1.053922
1,1.275138,0.996517,-0.206483,-0.863951,-0.752373,0.181587,-1.101087,-0.72709,-0.391064,-0.880729,...,-1.215807,-1.414191,1.364292,-0.377543,0.261108,1.144696,0.546198,0.546217,-0.546473,1.343623
2,-0.107929,0.370773,-1.108379,1.40161,1.413226,1.123813,1.319117,1.414024,1.37252,-0.517883,...,1.233491,0.700195,-0.359609,1.369066,1.073135,-1.291551,0.856614,0.856597,1.402849,-0.2897


## Model Prediction

### Load Model

In [21]:
estimators = pickle.load(open('./regression_pkl/estimators_reg.pkl', 'rb'))
estimators

[BaggingRegressor(estimator=RandomForestRegressor(ccp_alpha=0.01,
                                                  criterion='friedman_mse',
                                                  max_depth=20, n_jobs=-1,
                                                  random_state=5555),
                  random_state=42),
 GradientBoostingRegressor(alpha=0.2, ccp_alpha=0.01, criterion='squared_error',
                           learning_rate=0.02, random_state=0),
 BaggingRegressor(estimator=GradientBoostingRegressor(alpha=0.2, ccp_alpha=0.01,
                                                      criterion='squared_error',
                                                      learning_rate=0.02,
                                                      random_state=5555),
                  random_state=42),
 BaggingRegressor(random_state=42),
 BaggingRegressor(random_state=42)]

### Voting

In [22]:
from sklearn.ensemble import VotingRegressor

Voting_Rergressor = VotingRegressor(estimators=estimators)
Voting_Rergressor.estimators_ = estimators
pred = Voting_Rergressor.predict(X)
# score = Voting_Rergressor(y, pred)
pred

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([58.08898484, 53.06121701, 49.64488352])

## Export to csv

In [23]:
result_df = pd.DataFrame()
result_df['ID'] = id
result_df['predicted RFS'] = pred

result_df

Unnamed: 0,ID,predicted RFS
0,TRG002728,58.088985
1,TRG002649,53.061217
2,TRG002628,49.644884


In [24]:
result_df.to_csv('./FinalTestRFS.csv', index=False)