Install `xlrd` for reading the `xls` file

In [2]:
# %conda install xlrd==2.0.1
# $ conda install -c conda-forge py-xgboost-gpu


Set the path to the `xls` file

In [3]:
training_file = "../TrainDataset2024.xls"
# training_file = "/kaggle/input/dataset/TrainDataset2024.xls"

Import libraries

In [272]:
import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath('../'))  # Adjust the path as needed

from my_util import df_to_corr_matrix

import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.impute import KNNImputer

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from joblib import Parallel, delayed

import xgboost as xgb
from xgboost import XGBClassifier

from pickle import dump , load

import warnings

### Read the data into X and y

In [273]:
NUM_OF_SELECTED_FEATURES = 20

data = pd.read_excel(training_file)
data.replace(999, np.nan, inplace=True)

data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
data.dropna(subset=["pCR (outcome)"], inplace=True)

with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")

X = data[selected_features]
y = data["pCR (outcome)"]
print(X.shape, y.shape)

Loaded '../FeatureSelection/pkl/20_selected_features.pkl' to selected_feature
(395, 20) (395,)


### Split the data into train_full and test_reserved (untouch)

In [274]:
# Close ratio random_state
# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]

while True:  
    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0
    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)

    X_train_full.reset_index(drop=True, inplace=True)
    X_test_reserved.reset_index(drop=True, inplace=True)
    y_train_full.reset_index(drop=True, inplace=True)
    y_test_reserved.reset_index(drop=True, inplace=True)

    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)
    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)

    if abs(ratio_train - ratio_test) < 0.1:
        break

print("Splited the data into train and test. The test will not be used in the training, but just for test the xgb. ")
print(f"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. ")
print(f"Positive ratio: \n\tTrain: {ratio_train:.5f}\n\tTest: {ratio_test:.5f}")

Splited the data into train and test. The test will not be used in the training, but just for test the xgb. 
The training data has 316 data. The testing data has 79 data. 
Positive ratio: 
	Train: 0.21203
	Test: 0.21519


### XGBoost

In [275]:
print(X_train_full.shape)
print(y_train_full.shape)

(316, 20)
(316,)


In [276]:
data_no_na = data.dropna()
X_no_na = data_no_na.drop('pCR (outcome)', axis=1)[selected_features]
y_no_na = data_no_na['pCR (outcome)']

pca = PCA(10)
pca.fit(X_no_na)

In [278]:
model = XGBClassifier(objective="binary:logistic")

param_grid = {
    "gamma": [0.2, 0.3],
    "learning_rate": [0.3, 0.5],
    "max_bin": [2, 5, 10, 20],
    "max_depth": [1, 2, 3],
    "max_leaves": [1, 2, 3, 4],
    "n_estimators": [5, 10, 20, 30, 40, 50],
    "scale_pos_weight": [4.5],  # imbalanced data
}


# Set up the GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=["f1", "recall"],
    cv=5,
    verbose=1,
    n_jobs=-1,
    return_train_score=True,
    refit="f1",
)

# Fit the model
grid_search.fit(X_train_full, y_train_full)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_index = grid_search.best_index_

print(f"Best Parameters at Index {best_index} :", best_params)
print(f"Best {str(grid_search.scorer_).split('(')[1].split(',')[0]} Score:", best_score)

pd.DataFrame(grid_search.cv_results_).to_csv("output.csv")

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits


Best Parameters at Index 370 : {'gamma': 0.2, 'learning_rate': 0.5, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'n_estimators': 40, 'scale_pos_weight': 4.5}
Best f1_score Score: 0.5711130991618797


In [279]:
results = pd.DataFrame(grid_search.cv_results_)
result_start = 4
print(list(results.keys())[result_start:result_start+len(param_grid)])


['param_gamma', 'param_learning_rate', 'param_max_bin', 'param_max_depth', 'param_max_leaves', 'param_n_estimators', 'param_scale_pos_weight']


In [280]:
results = pd.DataFrame(grid_search.cv_results_)

filter = results['mean_train_recall'] == 1
filter = pd.Series([True] * len(results)) # include all data

fig = go.Figure()
# Add mean train score trace
fig.add_trace(go.Scatter(
    x=list(range(len(results["mean_train_f1"][filter]))),
    y=results["mean_train_f1"][filter],
    mode='lines+markers',
    name='Mean Train F1',
    text=list(results['params'][filter]),  # Display parameter values on hover
    hoverinfo='text+y+x',  # Show parameter values and y value
))
fig.add_trace(go.Scatter(
    x=list(range(len(results["mean_train_recall"][filter]))),
    y=results["mean_train_recall"][filter],
    mode='lines+markers',
    name='Mean Train Recall',
    text=list(results['params'][filter]),  # Display parameter values on hover
    hoverinfo='text+y+x',  # Show parameter values and y value
    visible="legendonly",
))
# Add mean test score trace
fig.add_trace(go.Scatter(
    x=list(range(len(results["mean_test_f1"][filter]))),
    y=results["mean_test_f1"][filter],
    mode='lines+markers',
    name='Mean Test F1',
    text=list(results['params'][filter]),  # Display parameter values on hover
    hoverinfo='text+y+x',  # Show parameter values and y value
))
fig.add_trace(go.Scatter(
    x=list(range(len(results["mean_test_recall"][filter]))),
    y=results["mean_test_recall"][filter],
    mode='lines+markers',
    name='Mean Test Recall',
    text=list(results['params'][filter]),  # Display parameter values on hover
    hoverinfo='text+y+x',  # Show parameter values and y value
    visible="legendonly",
))

# Update layout
fig.update_layout(
    title='Grid Search Mean Train and Test Scores',
    xaxis_title='Parameter Combinations (Index)',
    yaxis_title='Score',
    legend_title='Scores',
    hovermode='closest'
)
fig.show()

In [302]:
model = grid_search.best_estimator_

while True:  
    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)

    X_train_full.reset_index(drop=True, inplace=True)
    X_test_reserved.reset_index(drop=True, inplace=True)
    y_train_full.reset_index(drop=True, inplace=True)
    y_test_reserved.reset_index(drop=True, inplace=True)

    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)
    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)

    if abs(ratio_train - ratio_test) < 0.1:
        break

print("Splited the data into train and test. The test will not be used in the training, but just for test the xgb. ")
print(f"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. ")
print(f"Positive ratio: \n\tTrain: {ratio_train:.5f}\n\tTest: {ratio_test:.5f}")

# print(f"TARGET_NUM_OF_FEATURES: {TARGET_NUM_OF_FEATURES}, scaler: {SCALER}, num_of_features: {num_of_features}")

print(f"Best Parameters at Index {best_index} :", best_params)
print("Best Cross-Validation Score:", best_score)

y_pred = model.predict(X_train_full)
report = classification_report(y_train_full, y_pred)
cm = confusion_matrix(y_train_full, y_pred)

print("\nTraining set:")
print(report)
print(cm)

X_test = X_test_reserved

y_pred = model.predict(X_test)
report = classification_report(y_test_reserved, y_pred)
cm = confusion_matrix(y_test_reserved, y_pred)

print("Untouched testing set:")
print(report)
print(cm)



Splited the data into train and test. The test will not be used in the training, but just for test the xgb. 
The training data has 316 data. The testing data has 79 data. 
Positive ratio: 
	Train: 0.22785
	Test: 0.15190
Best Parameters at Index 370 : {'gamma': 0.2, 'learning_rate': 0.5, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'n_estimators': 40, 'scale_pos_weight': 4.5}
Best Cross-Validation Score: 0.5711130991618797

Training set:
              precision    recall  f1-score   support

         0.0       0.96      0.70      0.81       244
         1.0       0.46      0.89      0.61        72

    accuracy                           0.74       316
   macro avg       0.71      0.79      0.71       316
weighted avg       0.84      0.74      0.76       316

[[170  74]
 [  8  64]]
Untouched testing set:
              precision    recall  f1-score   support

         0.0       1.00      0.70      0.82        67
         1.0       0.38      1.00      0.55        12

    accuracy        