### Imports

In [24]:
import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath('../'))  # Adjust the path as needed

from my_util import df_to_corr_matrix, remove_outliers

import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.impute import KNNImputer

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from joblib import Parallel, delayed

import xgboost as xgb
from xgboost import XGBClassifier

from pickle import dump , load

import warnings

### Parameter


In [27]:
params = []
with open("pkl/best_params_15.pkl", 'rb') as file:
  params.append(load(file))
with open("pkl/best_params_20.pkl", 'rb') as file:
  params.append(load(file))
with open("pkl/best_params_25.pkl", 'rb') as file:
  params.append(load(file))
with open("pkl/best_params_30.pkl", 'rb') as file:
  params.append(load(file))
with open("pkl/best_params_35.pkl", 'rb') as file:
  params.append(load(file))

In [3]:
test_file_path = '../TestDatasetExample.xls'

### Load data

In [4]:
X = pd.read_excel(test_file_path)

X.replace(999, np.nan, inplace=True)

NUM_OF_SELECTED_FEATURES = "corr_25"

with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")

X = X[selected_features]
print('Loaded selected_features to X')

Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature
Loaded selected_features to X


### Load model

In [5]:
model = XGBClassifier()
model.load_model("model.ubj")

print(selected_features)
y_pred = model.predict(X)

['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'HistologyType', 'ChemoGrade', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterColumn', 'original_shape_SurfaceVolumeRatio', 'original_shape_LeastAxisLength', 'original_glcm_Autocorrelation', 'original_shape_Sphericity']


In [6]:
y_pred

array([0, 1, 1])

### Retrain the model with different data and evaluate the model

In [90]:
NUM_OF_SELECTED_FEATURES = [25, 30, 35]

files = [("../train_data.xls", "../test_data.xls"), ("../train_data_2.xls", "../test_data_2.xls"), ("../train_data_3.xls", "../test_data_3.xls")]

ba = []

for index, (train_file, test_file) in enumerate(files):
    data = pd.read_excel(train_file)
    data.replace(999, np.nan, inplace=True)

    data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
    data.dropna(subset=["pCR (outcome)"], inplace=True)

    X = data.drop(columns='pCR (outcome)', axis=1)
    y = data["pCR (outcome)"]
    # print(X.shape, y.shape)

    testdata = pd.read_excel(test_file)
    testdata.replace(999, np.nan, inplace=True)

    testdata.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
    testdata.dropna(subset=["pCR (outcome)"], inplace=True)

    X_test = testdata.drop(columns='pCR (outcome)', axis=1)
    y_test = testdata["pCR (outcome)"]
    # print(X_test.shape, y_test.shape)

    models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]

    selected_features = []

    for i in NUM_OF_SELECTED_FEATURES:
        FEATURES_FILE_PREFIX = F"corr_{i}"
        with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:
            selected_features.append(load(file))
            # print(f"Loaded '{file.name}' to selected_feature")

    y_pred = []

    for i, model in enumerate(models):
        X_train_temp = X[selected_features[i]]
        X_test_temp = X_test[selected_features[i]]
        model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])
        model.fit(X, y)
        y_pred.append(model.predict(X_test))

    y_pred = np.array(y_pred)

    yp = np.round(np.average(y_pred, axis=0))

    print(f"File {index}:")
    print(confusion_matrix(y_test, yp))
    ba.append(balanced_accuracy_score(y_test, yp))
    print(ba[-1])

print(f"Averaged balanced accuracy: {np.mean(ba)}")


File 0:
[[43 19]
 [ 5 12]]
0.6997153700189753
File 1:
[[31 31]
 [ 0 17]]
0.75
File 2:
[[40 22]
 [ 3 14]]
0.7343453510436433
Averaged balanced accuracy: 0.7280202403542062


In [48]:
for p in params:
  print(p)

{'gamma': 0.01, 'learning_rate': 0.041870422386972375, 'max_bin': 13, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 2.8986170391945087, 'n_estimators': 180, 'num_parallel_tree': 2, 'scale_pos_weight': 4.5}
{'gamma': 0.3, 'learning_rate': 0.20282080851348258, 'max_bin': 7, 'max_depth': 1, 'max_leaves': 3, 'min_child_weight': 6.566615599070236, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}
{'gamma': 0.01, 'learning_rate': 0.0385027190573996, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 5, 'min_child_weight': 0.7459049556420737, 'n_estimators': 199, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}
{'gamma': 0, 'learning_rate': 0.03650742884883299, 'max_bin': 2, 'max_depth': 5, 'max_leaves': 3, 'min_child_weight': 0.0, 'n_estimators': 110, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}
{'gamma': 0.1, 'learning_rate': 0.0011667187886546136, 'max_bin': 14, 'max_depth': 2, 'max_leaves': 4, 'min_child_weight': 0.0, 'n_estimators': 200, 'num_parallel_tree': 1, 'sc