In [1]:
# 📦 1. Import libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# ✅ If XGBoost isn't installed, run in a Jupyter cell:
# !pip install xgboost

from xgboost import XGBClassifier

# 🔁 2. Load dataset
data = pd.read_csv("../Datasets/ovariantotal.csv")
print("Number of columns:", len(data.columns))

# 🔍 3. Check missing values
print("Missing values:\n", data.isnull().sum())

# 🧹 4. Fill missing values with column-wise mean
data = data.fillna(data.mean(numeric_only=True))

# 🎯 5. Split features and target
X = data.drop('TYPE', axis=1)
y = data['TYPE']

# 🔀 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42)

# 🚀 7. Train XGBoost model
model = XGBClassifier(
    objective='binary:logistic',  # Binary classification
    use_label_encoder=False,      # Required in recent versions
    eval_metric='logloss',        # Avoid warning
    random_state=42
)
model.fit(X_train, y_train)

# 🔮 8. Predict on test set
y_pred = model.predict(X_test)

# 📊 9. Evaluation
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred))
print("\n📉 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Number of columns: 50
Missing values:
 AFP          0
AG           0
Age          0
ALB          0
ALP          0
ALT          0
AST          0
BASO#        0
BASO%        0
BUN          0
Ca           0
CA125        0
CA19-9       0
CA72-4       0
CEA          0
CL           0
CO2CP        0
CREA         0
DBIL         0
EO#          0
EO%          0
GGT          0
GLO          0
GLU.         0
HCT          0
HE4          0
HGB          0
IBIL         0
K            0
LYM#         0
LYM%         0
MCH          0
MCV          0
Menopause    0
Mg           0
MONO#        0
MONO%        0
MPV          0
Na           0
NEU          0
PCT          0
PDW          0
PHOS         0
PLT          0
RBC          0
RDW          0
TBIL         0
TP           0
UA           0
TYPE         0
dtype: int64
✅ Accuracy: 0.8952380952380953

📄 Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.84      0.89        51
           1       0.86      0.

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
