# Final Test RFS
- input: test file
- output: ID | RFS

## Process
- Replace 999 to Nan
- SimpleImputer
- OrdinalEncoder
- StandardScaler
- RandomForestRegressor -> get selected features
- load each model
- Voting

## Data Preprocessing

In [4]:
import pandas as pd
import numpy as np

### Read data

In [5]:
test_df = pd.read_excel('TrainDataset2023.xls')
id = test_df['ID']
id

0      TRG002174
1      TRG002178
2      TRG002204
3      TRG002206
4      TRG002210
         ...    
395    TRG002948
396    TRG002954
397    TRG002958
398    TRG002961
399    TRG002962
Name: ID, Length: 400, dtype: object

In [6]:
test_df = test_df.drop(['ID'], axis=1)
test_df

Unnamed: 0,pCR (outcome),RelapseFreeSurvival (outcome),Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,1,144.000000,41.0,0,0,0,1,3,3,1,...,0.517172,0.375126,3.325332,0.002314,3.880772e+06,473.464852,0.000768,0.182615,0.030508,0.000758
1,0,142.000000,39.0,1,1,0,0,3,3,1,...,0.444391,0.444391,3.032144,0.005612,2.372010e+06,59.459710,0.004383,0.032012,0.001006,0.003685
2,1,135.000000,31.0,0,0,0,1,2,1,1,...,0.534549,0.534549,2.485848,0.006752,1.540027e+06,33.935384,0.007584,0.024062,0.000529,0.006447
3,0,12.000000,35.0,0,0,0,1,3,3,1,...,0.506185,0.506185,2.606255,0.003755,6.936741e+06,46.859265,0.005424,0.013707,0.000178,0.004543
4,0,109.000000,61.0,1,0,0,0,2,1,1,...,0.462282,0.462282,2.809279,0.006521,1.265399e+06,39.621023,0.006585,0.034148,0.001083,0.005626
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0,54.500000,58.5,1,0,1,0,3,2,1,...,0.476493,0.476493,2.453583,0.003229,2.327038e+06,18.562377,0.013766,0.018042,0.000288,0.012257
396,0,49.250000,34.3,0,0,0,1,3,3,1,...,0.418382,0.418382,2.995603,0.004243,1.005061e+06,156.627179,0.002228,0.136015,0.022148,0.002098
397,0,48.500000,53.3,0,0,0,1,2,1,1,...,0.527779,0.527778,1.500000,0.003728,2.132007e+05,0.996746,0.252582,0.007380,0.000037,0.231059
398,0,47.500000,68.8,1,0,0,0,3,3,1,...,0.313693,0.313693,3.573557,0.001112,2.008034e+07,204.864200,0.001372,0.054063,0.003697,0.001368


### Load Components for Preprocessing

In [7]:
import pickle

imputer = pickle.load(open('./regression_pkl/imp_reg.pkl', 'rb'))
ordinal_encoder = pickle.load(open('./regression_pkl/OrdinalEncoder_reg.pkl', 'rb'))
standard_scaler = pickle.load(open('./regression_pkl/scale_reg.pkl', 'rb'))
rf_select = pickle.load(open('./regression_pkl/SelectFromModel.pkl', 'rb'))

### Define Columns

In [8]:
binary_only_col = ['ChemoGrade', 'Proliferation', 'TumourStage']
cate_only_col = [
    'pCR (outcome)',
    'ER',
    'PgR',
    'HER2',
    'TrippleNegative',
    'HistologyType',
    'LNStatus'
]

cate_col = binary_only_col + cate_only_col

con_col = [
    'RelapseFreeSurvival (outcome)',
    'Age', 
    'original_shape_Elongation',
    'original_shape_Flatness',
    'original_shape_LeastAxisLength',
    'original_shape_MajorAxisLength',
    'original_shape_Maximum2DDiameterColumn',
    'original_shape_Maximum2DDiameterRow',
    'original_shape_Maximum2DDiameterSlice',
    'original_shape_Maximum3DDiameter',
    'original_shape_MeshVolume',
    'original_shape_MinorAxisLength',
    'original_shape_Sphericity',
    'original_shape_SurfaceArea',
    'original_shape_SurfaceVolumeRatio',
    'original_shape_VoxelVolume',
    'original_firstorder_10Percentile',
    'original_firstorder_90Percentile',
    'original_firstorder_Energy',
    'original_firstorder_Entropy',
    'original_firstorder_InterquartileRange',
    'original_firstorder_Kurtosis',
    'original_firstorder_Maximum',
    'original_firstorder_MeanAbsoluteDeviation',
    'original_firstorder_Mean',
    'original_firstorder_Median',
    'original_firstorder_Minimum',
    'original_firstorder_Range',
    'original_firstorder_RobustMeanAbsoluteDeviation',
    'original_firstorder_RootMeanSquared',
    'original_firstorder_Skewness',
    'original_firstorder_TotalEnergy',
    'original_firstorder_Uniformity',
    'original_firstorder_Variance',
    'original_glcm_Autocorrelation',
    'original_glcm_ClusterProminence',
    'original_glcm_ClusterShade',
    'original_glcm_ClusterTendency',
    'original_glcm_Contrast',
    'original_glcm_Correlation',
    'original_glcm_DifferenceAverage',
    'original_glcm_DifferenceEntropy',
    'original_glcm_DifferenceVariance',
    'original_glcm_Id',
    'original_glcm_Idm',
    'original_glcm_Idmn',
    'original_glcm_Idn',
    'original_glcm_Imc1',
    'original_glcm_Imc2',
    'original_glcm_InverseVariance',
    'original_glcm_JointAverage',
    'original_glcm_JointEnergy',
    'original_glcm_JointEntropy',
    'original_glcm_MCC',
    'original_glcm_MaximumProbability',
    'original_glcm_SumAverage',
    'original_glcm_SumEntropy',
    'original_glcm_SumSquares',
    'original_gldm_DependenceEntropy',
    'original_gldm_DependenceNonUniformity',
    'original_gldm_DependenceNonUniformityNormalized',
    'original_gldm_DependenceVariance',
    'original_gldm_GrayLevelNonUniformity',
    'original_gldm_GrayLevelVariance',
    'original_gldm_HighGrayLevelEmphasis',
    'original_gldm_LargeDependenceEmphasis',
    'original_gldm_LargeDependenceHighGrayLevelEmphasis',
    'original_gldm_LargeDependenceLowGrayLevelEmphasis',
    'original_gldm_LowGrayLevelEmphasis',
    'original_gldm_SmallDependenceEmphasis',
    'original_gldm_SmallDependenceHighGrayLevelEmphasis',
    'original_gldm_SmallDependenceLowGrayLevelEmphasis',
    'original_glrlm_GrayLevelNonUniformity',
    'original_glrlm_GrayLevelNonUniformityNormalized',
    'original_glrlm_GrayLevelVariance',
    'original_glrlm_HighGrayLevelRunEmphasis',
    'original_glrlm_LongRunEmphasis',
    'original_glrlm_LongRunHighGrayLevelEmphasis',
    'original_glrlm_LongRunLowGrayLevelEmphasis',
    'original_glrlm_LowGrayLevelRunEmphasis',
    'original_glrlm_RunEntropy',
    'original_glrlm_RunLengthNonUniformity',
    'original_glrlm_RunLengthNonUniformityNormalized',
    'original_glrlm_RunPercentage',
    'original_glrlm_RunVariance',
    'original_glrlm_ShortRunEmphasis',
    'original_glrlm_ShortRunHighGrayLevelEmphasis',
    'original_glrlm_ShortRunLowGrayLevelEmphasis',
    'original_glszm_GrayLevelNonUniformity',
    'original_glszm_GrayLevelNonUniformityNormalized',
    'original_glszm_GrayLevelVariance',
    'original_glszm_HighGrayLevelZoneEmphasis',
    'original_glszm_LargeAreaEmphasis',
    'original_glszm_LargeAreaHighGrayLevelEmphasis',
    'original_glszm_LargeAreaLowGrayLevelEmphasis',
    'original_glszm_LowGrayLevelZoneEmphasis',
    'original_glszm_SizeZoneNonUniformity',
    'original_glszm_SizeZoneNonUniformityNormalized',
    'original_glszm_SmallAreaEmphasis',
    'original_glszm_SmallAreaHighGrayLevelEmphasis',
    'original_glszm_SmallAreaLowGrayLevelEmphasis',
    'original_glszm_ZoneEntropy',
    'original_glszm_ZonePercentage',
    'original_glszm_ZoneVariance',
    'original_ngtdm_Busyness',
    'original_ngtdm_Coarseness',
    'original_ngtdm_Complexity',
    'original_ngtdm_Contrast',
    'original_ngtdm_Strength'
]

cate_col_no_target = [ele for ele in cate_col if ele != 'pCR (outcome)']
con_col_no_target = [ele for ele in con_col if ele != 'RelapseFreeSurvival (outcome)']

### Run preprocess

In [9]:
# replace 999 with nan
test_df = test_df.replace(999, np.nan)

# data imputation
test_df_fake_target = test_df.copy()

# test_df_fake_target.insert(0, 'RelapseFreeSurvival (outcome)', 0)
# test_df_fake_target.insert(0, 'pCR (outcome)', 0)
y = test_df_fake_target['RelapseFreeSurvival (outcome)']
df_imp = pd.DataFrame(imputer.transform(test_df_fake_target), columns=test_df_fake_target.columns)

# encode categorical data
df_cate_col_encode = df_imp.copy()
df_cate_col_encode[cate_col] = ordinal_encoder.transform(df_imp[cate_col])

# z-normalization
znorm_df = pd.DataFrame(standard_scaler.fit_transform(df_cate_col_encode[con_col_no_target]), columns=con_col_no_target)
df_normalized = pd.concat([znorm_df, df_cate_col_encode[cate_col_no_target]], axis=1)

# znorm_df = df_cate_col_encode.copy()
# znorm_df[con_col_no_target] = standard_scaler.transform(df_cate_col_encode[con_col_no_target])

# cate_col_with_two_target = cate_col + ["RelapseFreeSurvival (outcome)"]
# df_cate_col_encode_norm = pd.concat([df_cate_col_encode[cate_col_with_two_target], znorm_df], axis=1)

# drop fake target
# znorm_df.drop(['RelapseFreeSurvival (outcome)', 'pCR (outcome)'], axis=1, inplace=True)


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dt

In [10]:
df_normalized

Unnamed: 0,Age,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,...,original_ngtdm_Strength,ChemoGrade,Proliferation,TumourStage,ER,PgR,HER2,TrippleNegative,HistologyType,LNStatus
0,-0.988097,0.592892,1.028945,0.071441,-0.478742,-0.426401,-0.235863,-0.468358,-0.524498,-0.175535,...,-0.246730,2.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
1,-1.170999,-0.309105,-0.434837,-0.237188,-0.121351,0.165771,-0.113947,-0.406647,-0.084778,-0.175734,...,-0.221445,2.0,2.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
2,-1.902606,-0.437485,0.263658,-0.142142,-0.358341,-0.621841,-0.165039,-0.187999,-0.370476,-0.274125,...,-0.197586,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-1.536803,0.330031,-0.286897,0.354112,0.198721,0.785498,0.009849,0.703661,0.417418,0.193360,...,-0.214034,2.0,2.0,2.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.840921,0.880483,1.183572,-0.263244,-0.665779,-0.611192,-0.548245,-0.590082,-0.669492,-0.333314,...,-0.204678,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.612294,-1.997698,-1.646952,-0.913706,0.152046,0.004921,-0.830994,0.342524,0.069186,-0.358710,...,-0.147402,2.0,1.0,3.0,1.0,0.0,1.0,0.0,0.0,1.0
396,-1.600818,1.213763,1.043835,-0.453749,-0.723344,-0.703509,-0.583732,-0.546625,-0.663571,-0.401790,...,-0.235149,2.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
397,0.136749,-0.363316,-0.605847,-1.475340,-0.949172,-1.280079,-1.090803,-1.065504,-1.005808,-0.562863,...,1.742570,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
398,1.554238,1.058942,1.698358,0.848396,-0.318760,-0.245396,-0.040189,-0.069956,-0.324000,0.197194,...,-0.241459,2.0,2.0,2.0,1.0,0.0,0.0,0.0,0.0,1.0


### Select Features

In [11]:
# features selected from random forest feature selection
selected_features = ['Age', 'original_shape_Elongation', 'original_shape_Flatness',
       'original_shape_LeastAxisLength', 'original_shape_MajorAxisLength',
       'original_shape_Maximum2DDiameterRow',
       'original_shape_Maximum2DDiameterSlice',
       'original_shape_Maximum3DDiameter', 'original_shape_MinorAxisLength',
       'original_shape_Sphericity', 'original_shape_SurfaceVolumeRatio',
       'original_firstorder_10Percentile', 'original_firstorder_90Percentile',
       'original_firstorder_InterquartileRange',
       'original_firstorder_Kurtosis', 'original_firstorder_Maximum',
       'original_firstorder_MeanAbsoluteDeviation',
       'original_firstorder_Minimum', 'original_firstorder_Range',
       'original_firstorder_RobustMeanAbsoluteDeviation',
       'original_firstorder_RootMeanSquared', 'original_firstorder_Skewness',
       'original_firstorder_Variance', 'original_glcm_Imc1',
       'original_gldm_LargeDependenceLowGrayLevelEmphasis',
       'original_gldm_SmallDependenceEmphasis',
       'original_gldm_SmallDependenceHighGrayLevelEmphasis',
       'original_gldm_SmallDependenceLowGrayLevelEmphasis',
       'original_glrlm_RunEntropy', 'original_glrlm_RunLengthNonUniformity',
       'original_glrlm_RunVariance',
       'original_glrlm_ShortRunHighGrayLevelEmphasis',
       'original_glszm_GrayLevelNonUniformity',
       'original_glszm_SizeZoneNonUniformity',
       'original_glszm_SizeZoneNonUniformityNormalized',
       'original_glszm_SmallAreaEmphasis',
       'original_glszm_SmallAreaHighGrayLevelEmphasis',
       'original_glszm_ZoneEntropy', 'original_glszm_ZonePercentage']


X = znorm_df[selected_features]
X

Unnamed: 0,Age,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MinorAxisLength,original_shape_Sphericity,...,original_glrlm_RunLengthNonUniformity,original_glrlm_RunVariance,original_glrlm_ShortRunHighGrayLevelEmphasis,original_glszm_GrayLevelNonUniformity,original_glszm_SizeZoneNonUniformity,original_glszm_SizeZoneNonUniformityNormalized,original_glszm_SmallAreaEmphasis,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage
0,-0.988097,0.592892,1.028945,0.071441,-0.478742,-0.235863,-0.468358,-0.524498,-0.308443,1.036049,...,0.709184,-0.628974,1.769572,-0.304063,-0.326868,-0.439871,0.071207,0.729513,0.789528,-0.427579
1,-1.170999,-0.309105,-0.434837,-0.237188,-0.121351,-0.113947,-0.406647,-0.084778,-0.157788,0.026216,...,-0.381065,0.496965,-0.765192,0.420447,0.359933,-0.309800,0.324132,0.292187,0.405739,0.937740
2,-1.902606,-0.437485,0.263658,-0.142142,-0.358341,-0.165039,-0.187999,-0.370476,-0.524960,0.054451,...,-0.477137,0.227782,-0.322210,0.334361,0.676589,0.276355,0.882281,0.833929,-0.309375,1.409308
3,-1.536803,0.330031,-0.286897,0.354112,0.198721,0.009849,0.703661,0.417418,0.641270,-0.748127,...,-0.269805,0.658673,-1.074165,0.721776,1.034510,0.104683,0.706687,0.663496,-0.151761,0.168875
4,0.840921,0.880483,1.183572,-0.263244,-0.665779,-0.548245,-0.590082,-0.669492,-0.526783,0.943305,...,-0.504823,0.344312,-0.539216,0.104828,0.130663,-0.171733,0.434892,0.399691,0.114003,1.313876
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,0.612294,-1.997698,-1.646952,-0.913706,0.152046,-0.830994,0.342524,0.069186,-0.752043,-0.150642,...,-0.510325,-0.314583,-0.214369,-0.353845,-0.298026,0.048409,0.522870,0.485083,-0.351611,-0.048902
396,-1.600818,1.213763,1.043835,-0.453749,-0.723344,-0.583732,-0.546625,-0.663571,-0.532363,0.014594,...,-0.157886,-0.858311,1.184655,-0.325219,-0.369448,-0.361137,0.163122,0.135911,0.357905,0.370968
397,0.136749,-0.363316,-0.605847,-1.475340,-0.949172,-1.090803,-1.065504,-1.005808,-1.272004,0.426541,...,-0.646080,-1.158545,1.671971,-0.663529,-0.637566,1.025826,0.840364,0.793247,-1.599874,0.157733
398,1.554238,1.058942,1.698358,0.848396,-0.318760,-0.040189,-0.069956,-0.324000,0.144656,0.958425,...,-0.282059,0.428674,-1.720087,-0.296582,-0.496224,-0.916521,-0.484987,-0.493146,1.114461,-0.924759


## Model Prediction

### Load Model

In [12]:
estimators = pickle.load(open('./regression_pkl/estimators_reg.pkl', 'rb'))
estimators

[BaggingRegressor(estimator=RandomForestRegressor(ccp_alpha=0.01,
                                                  criterion='friedman_mse',
                                                  max_depth=20, n_jobs=-1,
                                                  random_state=5555),
                  random_state=42),
 GradientBoostingRegressor(alpha=0.2, ccp_alpha=0.01, criterion='squared_error',
                           learning_rate=0.02, random_state=0),
 BaggingRegressor(estimator=GradientBoostingRegressor(alpha=0.2, ccp_alpha=0.01,
                                                      criterion='squared_error',
                                                      learning_rate=0.02,
                                                      random_state=5555),
                  random_state=42),
 BaggingRegressor(random_state=42),
 BaggingRegressor(random_state=42)]

### Voting

In [14]:
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_absolute_error

Voting_Rergressor = VotingRegressor(estimators=estimators)
Voting_Rergressor.estimators_ = estimators
pred = Voting_Rergressor.predict(X)
score = mean_absolute_error(y, pred)
score

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


12.63683264318389

## Export to csv

In [54]:
result_df = pd.DataFrame()
result_df['ID'] = id
result_df['predicted RFS'] = pred

result_df

Unnamed: 0,ID,predicted RFS
0,TRG002728,58.088985
1,TRG002649,53.061217
2,TRG002628,49.644884


In [55]:
result_df.to_csv('./FinalTestRFS.csv', index=False)