# Final Test PCR
- input: test file
- output: ID | PCR

## Process
- Replace 999 to Nan
- SimpleImputer
- OrdinalEncoder
- StandardScaler
- PCA
- StandardScaler_2
- Take out selected columns: HER2, PgR, ER, PCA_1, Proliferation, LNStatus
- load each model
- Voting

## Data Preprocessing

In [103]:
import pandas as pd
import numpy as np

### Read data

In [104]:
test_df = pd.read_excel('./data/TestDatasetExample.xls')
id = test_df['ID']
id

0    TRG002728
1    TRG002649
2    TRG002628
Name: ID, dtype: object

In [105]:
test_df = test_df.drop(['ID'], axis=1)
test_df

Unnamed: 0,Age,ER,PgR,HER2,TrippleNegative,ChemoGrade,Proliferation,HistologyType,LNStatus,TumourStage,...,original_glszm_SmallAreaHighGrayLevelEmphasis,original_glszm_SmallAreaLowGrayLevelEmphasis,original_glszm_ZoneEntropy,original_glszm_ZonePercentage,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength
0,56.881588,0,0,0,1,3,3,999,0,2,...,0.194591,0.194591,2.846439,0.001281,4168474.0,131.044541,0.002335,0.109755,0.013383,0.002051
1,60.0,0,0,1,0,2,1,1,0,3,...,0.309999,0.309996,2.975317,0.007253,173658.5,23.967478,0.011285,0.05589,0.003163,0.009553
2,58.234086,0,0,0,1,3,3,1,1,4,...,0.328377,0.328377,3.785966,0.003185,3607821.0,223.279556,0.001334,0.101628,0.010844,0.001194


### Load Components for Preprocessing

In [106]:
import pickle

imputer = pickle.load(open('./classification_pkl/imp.pkl', 'rb'))
ordinal_encoder = pickle.load(open('./classification_pkl/OrdinalEncoder.pkl', 'rb'))
standard_scaler_1 = pickle.load(open('./classification_pkl/scaler.pkl', 'rb'))
pca = pickle.load(open('./classification_pkl/pca.pkl', 'rb'))
standard_scaler_2 = pickle.load(open('./classification_pkl/scaler_after_pca.pkl', 'rb'))

### Define Columns

In [107]:
binary_only_col = ['ChemoGrade', 'Proliferation', 'TumourStage']
cate_only_col = [
    'pCR (outcome)',
    'ER',
    'PgR',
    'HER2',
    'TrippleNegative',
    'HistologyType',
    'LNStatus'
]

cate_col = binary_only_col + cate_only_col

con_col = [
    'RelapseFreeSurvival (outcome)',
    'Age', 
    'original_shape_Elongation',
    'original_shape_Flatness',
    'original_shape_LeastAxisLength',
    'original_shape_MajorAxisLength',
    'original_shape_Maximum2DDiameterColumn',
    'original_shape_Maximum2DDiameterRow',
    'original_shape_Maximum2DDiameterSlice',
    'original_shape_Maximum3DDiameter',
    'original_shape_MeshVolume',
    'original_shape_MinorAxisLength',
    'original_shape_Sphericity',
    'original_shape_SurfaceArea',
    'original_shape_SurfaceVolumeRatio',
    'original_shape_VoxelVolume',
    'original_firstorder_10Percentile',
    'original_firstorder_90Percentile',
    'original_firstorder_Energy',
    'original_firstorder_Entropy',
    'original_firstorder_InterquartileRange',
    'original_firstorder_Kurtosis',
    'original_firstorder_Maximum',
    'original_firstorder_MeanAbsoluteDeviation',
    'original_firstorder_Mean',
    'original_firstorder_Median',
    'original_firstorder_Minimum',
    'original_firstorder_Range',
    'original_firstorder_RobustMeanAbsoluteDeviation',
    'original_firstorder_RootMeanSquared',
    'original_firstorder_Skewness',
    'original_firstorder_TotalEnergy',
    'original_firstorder_Uniformity',
    'original_firstorder_Variance',
    'original_glcm_Autocorrelation',
    'original_glcm_ClusterProminence',
    'original_glcm_ClusterShade',
    'original_glcm_ClusterTendency',
    'original_glcm_Contrast',
    'original_glcm_Correlation',
    'original_glcm_DifferenceAverage',
    'original_glcm_DifferenceEntropy',
    'original_glcm_DifferenceVariance',
    'original_glcm_Id',
    'original_glcm_Idm',
    'original_glcm_Idmn',
    'original_glcm_Idn',
    'original_glcm_Imc1',
    'original_glcm_Imc2',
    'original_glcm_InverseVariance',
    'original_glcm_JointAverage',
    'original_glcm_JointEnergy',
    'original_glcm_JointEntropy',
    'original_glcm_MCC',
    'original_glcm_MaximumProbability',
    'original_glcm_SumAverage',
    'original_glcm_SumEntropy',
    'original_glcm_SumSquares',
    'original_gldm_DependenceEntropy',
    'original_gldm_DependenceNonUniformity',
    'original_gldm_DependenceNonUniformityNormalized',
    'original_gldm_DependenceVariance',
    'original_gldm_GrayLevelNonUniformity',
    'original_gldm_GrayLevelVariance',
    'original_gldm_HighGrayLevelEmphasis',
    'original_gldm_LargeDependenceEmphasis',
    'original_gldm_LargeDependenceHighGrayLevelEmphasis',
    'original_gldm_LargeDependenceLowGrayLevelEmphasis',
    'original_gldm_LowGrayLevelEmphasis',
    'original_gldm_SmallDependenceEmphasis',
    'original_gldm_SmallDependenceHighGrayLevelEmphasis',
    'original_gldm_SmallDependenceLowGrayLevelEmphasis',
    'original_glrlm_GrayLevelNonUniformity',
    'original_glrlm_GrayLevelNonUniformityNormalized',
    'original_glrlm_GrayLevelVariance',
    'original_glrlm_HighGrayLevelRunEmphasis',
    'original_glrlm_LongRunEmphasis',
    'original_glrlm_LongRunHighGrayLevelEmphasis',
    'original_glrlm_LongRunLowGrayLevelEmphasis',
    'original_glrlm_LowGrayLevelRunEmphasis',
    'original_glrlm_RunEntropy',
    'original_glrlm_RunLengthNonUniformity',
    'original_glrlm_RunLengthNonUniformityNormalized',
    'original_glrlm_RunPercentage',
    'original_glrlm_RunVariance',
    'original_glrlm_ShortRunEmphasis',
    'original_glrlm_ShortRunHighGrayLevelEmphasis',
    'original_glrlm_ShortRunLowGrayLevelEmphasis',
    'original_glszm_GrayLevelNonUniformity',
    'original_glszm_GrayLevelNonUniformityNormalized',
    'original_glszm_GrayLevelVariance',
    'original_glszm_HighGrayLevelZoneEmphasis',
    'original_glszm_LargeAreaEmphasis',
    'original_glszm_LargeAreaHighGrayLevelEmphasis',
    'original_glszm_LargeAreaLowGrayLevelEmphasis',
    'original_glszm_LowGrayLevelZoneEmphasis',
    'original_glszm_SizeZoneNonUniformity',
    'original_glszm_SizeZoneNonUniformityNormalized',
    'original_glszm_SmallAreaEmphasis',
    'original_glszm_SmallAreaHighGrayLevelEmphasis',
    'original_glszm_SmallAreaLowGrayLevelEmphasis',
    'original_glszm_ZoneEntropy',
    'original_glszm_ZonePercentage',
    'original_glszm_ZoneVariance',
    'original_ngtdm_Busyness',
    'original_ngtdm_Coarseness',
    'original_ngtdm_Complexity',
    'original_ngtdm_Contrast',
    'original_ngtdm_Strength'
]

cate_col_no_target = [ele for ele in cate_col if ele != 'pCR (outcome)']
con_col_no_target = [ele for ele in con_col if ele != 'RelapseFreeSurvival (outcome)']

### Run preprocess

In [108]:
# replace 999 with nan
test_df = test_df.replace(999, np.nan)

# data imputation
test_df_fake_target = test_df.copy()

test_df_fake_target.insert(0, 'RelapseFreeSurvival (outcome)', 0)
test_df_fake_target.insert(0, 'pCR (outcome)', 0)
df_imp = pd.DataFrame(imputer.transform(test_df_fake_target), columns=test_df_fake_target.columns)

# encode categorical data
df_cate_col_encode = df_imp.copy()
df_cate_col_encode[cate_col] = ordinal_encoder.transform(df_imp[cate_col])

# z-normalization
znorm_df = df_cate_col_encode.copy()
znorm_df[con_col_no_target] = standard_scaler_1.transform(df_cate_col_encode[con_col_no_target])
# cate_col_with_two_target = cate_col + ["RelapseFreeSurvival (outcome)"]
# df_cate_col_encode_norm = pd.concat([df_cate_col_encode[cate_col_with_two_target], znorm_df], axis=1)

# pca
df_cate_col_encode_norm_no_age=[ele for ele in con_col_no_target if ele!='Age']
pca_transform = pca.transform(znorm_df[df_cate_col_encode_norm_no_age])
PCA_df = pd.DataFrame()
for i in range(pca_transform.shape[1]):
    PCA_df[f'PCA_{i+1}'] = pca_transform[:,i]

# select 5 PC
choose_num_of_pca = 5
pca_col = PCA_df.columns[:choose_num_of_pca]
col_for_norm_after_pca = list(pca_col)
cate_col_with_two_target_and_age = cate_col + ["RelapseFreeSurvival (outcome)","Age"]
# df_cate_col_encode_with_pca = pd.concat([znorm_df[cate_col_with_two_target_and_age], PCA_df.iloc[:,:choose_num_of_pca]], axis=1)

# normalize after pca
PCA_df_norm = pd.DataFrame(standard_scaler_2.fit_transform(PCA_df[col_for_norm_after_pca]), columns=col_for_norm_after_pca)
df_pca_norm_target = pd.concat([znorm_df[cate_col_with_two_target_and_age], PCA_df_norm], axis=1)

# drop targets
df_pca_norm_target.drop(['RelapseFreeSurvival (outcome)', 'pCR (outcome)'], axis=1, inplace=True)
df_pca_norm_target

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dt

Unnamed: 0,ChemoGrade,Proliferation,TumourStage,ER,PgR,HER2,TrippleNegative,HistologyType,LNStatus,Age,PCA_1,PCA_2,PCA_3,PCA_4,PCA_5
0,2.0,2.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.464288,1.06,-0.140435,1.402276,1.102389,1.059411
1,1.0,0.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.74947,-1.34074,-1.148474,-0.859934,-1.318367,0.281612
2,2.0,2.0,3.0,0.0,0.0,0.0,1.0,0.0,1.0,0.587976,0.28074,1.288909,-0.542342,0.215979,-1.341023


### Select Features

In [109]:
selected_feature_li = ['HER2', 'PgR', 'ER', 'PCA_1', 'Proliferation', 'LNStatus']

X = df_pca_norm_target[selected_feature_li]
X

Unnamed: 0,HER2,PgR,ER,PCA_1,Proliferation,LNStatus
0,0.0,0.0,0.0,1.06,2.0,0.0
1,1.0,0.0,0.0,-1.34074,0.0,0.0
2,0.0,0.0,0.0,0.28074,2.0,1.0


## Model Prediction

### Load Model

In [110]:
estimators = pickle.load(open('./classification_pkl/estimator.pkl', 'rb'))
estimators

[Pipeline(steps=[('over', SMOTE(sampling_strategy=0.9)),
                 ('under', RandomUnderSampler(sampling_strategy=1.0)),
                 ('model',
                  LogisticRegression(C=1, penalty='l1', solver='liblinear',
                                     tol=1e-06))]),
 Pipeline(steps=[('over', SMOTE(sampling_strategy=0.9)),
                 ('under', RandomUnderSampler(sampling_strategy=1.0)),
                 ('model',
                  RandomForestClassifier(max_depth=9, min_samples_leaf=10,
                                         n_estimators=10))]),
 Pipeline(steps=[('over', SMOTE(sampling_strategy=0.9)),
                 ('under', RandomUnderSampler(sampling_strategy=1.0)),
                 ('model',
                  XGBClassifier(base_score=None, booster='gblinear',
                                callbacks=None, colsample_bylevel=None,
                                colsample_bynode=None, colsample_bytree=None,
                                device=None, early_

### Voting

In [111]:
#Function of Voting

def voting(estimaters, weights=None, type='hard'):

    if type == 'hard':
        pred = np.asarray([e.predict(X) for e in estimaters]).T
        pred = np.apply_along_axis(lambda x:
                                np.argmax(np.bincount(x, weights=weights)),
                                axis=1,
                                arr=pred.astype('int'))
    elif type == 'soft':
        pred = np.asarray([e.predict_proba(X) for e in estimaters])
        pred = np.average(pred, axis=0, weights=weights)
        pred = np.argmax(pred, axis=1)

    return pred

In [112]:
pred = voting(estimators, type='soft')
pred

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([1, 1, 1], dtype=int64)

## Export to csv

In [113]:
result_df = pd.DataFrame()
result_df['ID'] = id
result_df['predicted PCR'] = pred

result_df

Unnamed: 0,ID,predicted PCR
0,TRG002728,1
1,TRG002649,1
2,TRG002628,1


In [114]:
result_df.to_csv('./FinalTestPCR.csv', index=False)