### Parameter


In [284]:
# grid = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0.001, 'n_estimators': 75, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}
grid = {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}


In [355]:
import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath('../'))  # Adjust the path as needed

from my_util import df_to_corr_matrix, remove_outliers

import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.impute import KNNImputer

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from joblib import Parallel, delayed

import xgboost as xgb
from xgboost import XGBClassifier

from pickle import dump , load

import warnings

In [356]:
test_file_path = '../TestDatasetExample.xls'

### Load data

In [357]:
X = pd.read_excel(test_file_path)

X.replace(999, np.nan, inplace=True)

NUM_OF_SELECTED_FEATURES = "corr_25"

with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")

X = X[selected_features]
print('Loaded selected_features to X')

Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature
Loaded selected_features to X


### Load model

In [358]:
model = XGBClassifier()
model.load_model("model.ubj")

print(selected_features)
y_pred = model.predict(X)

['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'HistologyType', 'ChemoGrade', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterColumn', 'original_shape_SurfaceVolumeRatio', 'original_shape_LeastAxisLength', 'original_glcm_Autocorrelation', 'original_shape_Sphericity']


In [359]:
y_pred

array([0, 1, 1])

### Retrain the model with different data and evaluate the model

In [371]:
NUM_OF_SELECTED_FEATURES = "corr_25"

data = pd.read_excel("../TrainDataset2024.xls")
data.replace(999, np.nan, inplace=True)

data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
data.dropna(subset=["pCR (outcome)"], inplace=True)

with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")

X = data[selected_features]
y = data["pCR (outcome)"]
print(X.shape, y.shape)

while True:  
    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0
    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.12, random_state=None)

    X_train_full.reset_index(drop=True, inplace=True)
    X_test_reserved.reset_index(drop=True, inplace=True)
    y_train_full.reset_index(drop=True, inplace=True)
    y_test_reserved.reset_index(drop=True, inplace=True)

    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)
    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)

    if abs(ratio_train - ratio_test) < 0.03:
        break

print("Splited the data into train and test. The test will not be used in the training, but just for test the xgb. ")
print(f"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. ")
print(f"Positive ratio: \n\tTrain: {ratio_train:.5f}\n\tTest: {ratio_test:.5f}")

stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True)

model.set_params(**grid)

print(X_train_full.shape)

y_pred_cv = cross_val_predict(model, X_train_full, y_train_full, cv=stratified_kfold)
print(confusion_matrix(y_train_full, y_pred_cv))
print(classification_report(y_train_full, y_pred_cv))
print(f"Balanced accuracy score: {balanced_accuracy_score(y_train_full, y_pred_cv)}")
print(f"F1 Score: {f1_score(y_train_full, y_pred_cv)}")
print(f"Precision: {precision_score(y_train_full, y_pred_cv)}")
print(f"Recall: {recall_score(y_train_full, y_pred_cv)}")
print(f"Specificity: {recall_score(y_train_full, y_pred_cv, pos_label=0)}")
print()

y_pred = model.predict(X_test_reserved)

print(X_test_reserved.shape)

print(confusion_matrix(y_test_reserved, y_pred))
print(classification_report(y_test_reserved, y_pred))
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}")
print(f"F1 Score: {f1_score(y_test_reserved, y_pred)}")
print(f"Precision: {precision_score(y_test_reserved, y_pred)}")
print(f"Recall: {recall_score(y_test_reserved, y_pred)}")
print(f"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}")


print("\nUse the whole data to train and do CV")
y_pred_cv = cross_val_predict(model, X, y, cv=stratified_kfold)
print(confusion_matrix(y, y_pred_cv))
print(classification_report(y, y_pred_cv))
print(f"Balanced accuracy score: {balanced_accuracy_score(y, y_pred_cv)}")
print(f"F1 Score: {f1_score(y, y_pred_cv)}")
print(f"Precision: {precision_score(y, y_pred_cv)}")
print(f"Recall: {recall_score(y, y_pred_cv)}")
print(f"Specificity: {recall_score(y, y_pred_cv, pos_label=0)}")
print()

Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature
(395, 25) (395,)
Splited the data into train and test. The test will not be used in the training, but just for test the xgb. 
The training data has 347 data. The testing data has 48 data. 
Positive ratio: 
	Train: 0.21037
	Test: 0.22917
(347, 25)
[[189  85]
 [ 14  59]]
              precision    recall  f1-score   support

         0.0       0.93      0.69      0.79       274
         1.0       0.41      0.81      0.54        73

    accuracy                           0.71       347
   macro avg       0.67      0.75      0.67       347
weighted avg       0.82      0.71      0.74       347

Balanced accuracy score: 0.7490000999900011
F1 Score: 0.543778801843318
Precision: 0.4097222222222222
Recall: 0.8082191780821918
Specificity: 0.6897810218978102

(48, 25)
[[23 14]
 [ 0 11]]
              precision    recall  f1-score   support

         0.0       1.00      0.62      0.77        37
         1.0       0.