In [1]:
# 1. Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from xgboost import XGBClassifier

# 2. Load data
data = pd.read_csv("../Datasets/ovariantotal.csv")

# 3. Handle missing values
print("Missing values:\n", data.isnull().sum())
data = data.fillna(data.mean(numeric_only=True))

# 4. Split into features and target
X = data.drop('TYPE', axis=1)
y = data['TYPE']

# 5. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# 6. Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 1],
    'min_child_weight': [1, 3]
}

# 7. Setup GridSearchCV
xgb = XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=42
)

grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1
)

# 8. Fit the model
grid_search.fit(X_train, y_train)

# 9. Get best model and hyperparameters
best_model = grid_search.best_estimator_
print("Best Hyperparameters:\n", grid_search.best_params_)

# 10. Make predictions
y_pred = best_model.predict(X_test)

# 11. Evaluate model
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Missing values:
 AFP          0
AG           0
Age          0
ALB          0
ALP          0
ALT          0
AST          0
BASO#        0
BASO%        0
BUN          0
Ca           0
CA125        0
CA19-9       0
CA72-4       0
CEA          0
CL           0
CO2CP        0
CREA         0
DBIL         0
EO#          0
EO%          0
GGT          0
GLO          0
GLU.         0
HCT          0
HE4          0
HGB          0
IBIL         0
K            0
LYM#         0
LYM%         0
MCH          0
MCV          0
Menopause    0
Mg           0
MONO#        0
MONO%        0
MPV          0
Na           0
NEU          0
PCT          0
PDW          0
PHOS         0
PLT          0
RBC          0
RDW          0
TBIL         0
TP           0
UA           0
TYPE         0
dtype: int64
Fitting 5 folds for each of 192 candidates, totalling 960 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best Hyperparameters:
 {'colsample_bytree': 0.8, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 200, 'subsample': 1.0}
Test Accuracy: 0.9047619047619048
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.84      0.90        51
           1       0.87      0.96      0.91        54

    accuracy                           0.90       105
   macro avg       0.91      0.90      0.90       105
weighted avg       0.91      0.90      0.90       105

Confusion Matrix:
 [[43  8]
 [ 2 52]]
