# Import libs

In [None]:
import os
import cv2
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
import joblib
import pandas as pd

# Load data and define functions to preprocess

In [None]:
def load_images_from_folder(folder, label, size=(90, 90)):
    """Load every readable image in ``folder`` as a flattened feature vector.

    Entries are visited in sorted filename order so that the row order of the
    resulting dataset is reproducible. ``os.listdir`` on its own returns
    entries in arbitrary order, which would make any index-based split (for
    example a seeded K-fold) pick different images from one machine to the
    next.

    Args:
        folder: Directory to scan. The scan is not recursive; entries that
            are not regular files are skipped.
        label: Label attached to every image loaded from ``folder``.
        size: Target size passed to ``cv2.resize``. Defaults to ``(90, 90)``,
            a small size chosen to keep memory usage low.

    Returns:
        A ``(images, labels)`` tuple of two equally long lists: the flattened
        resized images, and one copy of ``label`` per loaded image.
    """
    images = []
    labels = []
    for filename in sorted(os.listdir(folder)):
        img_path = os.path.join(folder, filename)
        if not os.path.isfile(img_path):
            continue

        img = cv2.imread(img_path)
        if img is None:
            # imread produced no image for this file; leave it out.
            continue

        images.append(cv2.resize(img, size).flatten())
        labels.append(label)

    return images, labels



# Load dataset function
def load_dataset(base_path):
    """Assemble the train and test arrays from the per-class sub-folders.

    Images are read from ``<base_path>/<split>/<class>`` for the splits
    ``train`` and ``test`` and the classes ``benign`` and ``malignant``.
    Within each split the benign samples come first, followed by the
    malignant ones.

    Args:
        base_path: Root directory containing the ``train`` and ``test``
            folders.

    Returns:
        ``(X_train, y_train, X_test, y_test)`` as NumPy arrays, where the
        ``y`` arrays hold the class-name strings.
    """
    def read_split(split):
        # Collect both classes of one split, benign first.
        features, targets = [], []
        for class_name in ('benign', 'malignant'):
            class_dir = os.path.join(base_path, split, class_name)
            class_images, class_labels = load_images_from_folder(class_dir, class_name)
            features += class_images
            targets += class_labels
        return np.array(features), np.array(targets)

    X_train, y_train = read_split('train')
    X_test, y_test = read_split('test')

    return X_train, y_train, X_test, y_test

# Root folder that holds the 'train' and 'test' sub-folders
base_path = 'data/'

# Read every image once and unpack the four arrays returned by load_dataset
(X_train, y_train,
 X_test, y_test) = load_dataset(base_path)

# Preprocess data

In [None]:
# Encode the string labels as integers. The encoder is fitted on the training
# labels only and then reused unchanged for the test labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Scale pixel values by 1/255 (maps 8-bit intensities into [0, 1]).
# Casting to float32 first keeps the result in float32; dividing the raw
# array directly would promote it to float64 and double the memory footprint
# of the feature matrices.
X_train = X_train.astype(np.float32) / 255.0
X_test = X_test.astype(np.float32) / 255.0

# Set up cross-validation

In [None]:
# Per-fold validation accuracies are appended here by the training loop
cv_accuracies = []

# Stratified 5-fold splitter; the shuffle is seeded so that, for a given
# sample order, the same folds are produced on every run
skf = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=1234,
)

# Training loop with 5-fold cross-validation (each fold trains on 80% of the training set and validates on the remaining 20%)

In [None]:
# Hyperparameters shared by the model trained in every fold (GPU training)
cv_model_params = {
    'n_estimators': 200,
    'max_depth': 5,
    'learning_rate': 0.1,
    'tree_method': 'hist',
    'device': 'cuda',
}

for train_index, val_index in skf.split(X_train, y_train):
    # Slice out this fold's training and validation subsets
    X_train_fold = X_train[train_index]
    X_val_fold = X_train[val_index]
    y_train_fold = y_train[train_index]
    y_val_fold = y_train[val_index]

    # Fit a fresh classifier on the fold's training portion
    model = xgb.XGBClassifier(**cv_model_params)
    model.fit(X_train_fold, y_train_fold)

    # Score it on the held-out portion and record the result
    y_val_pred = model.predict(X_val_fold)
    val_accuracy = accuracy_score(y_val_fold, y_val_pred)
    cv_accuracies.append(val_accuracy)
    print(f'Fold Validation Accuracy: {val_accuracy * 100:.2f}%')

# Average the fold accuracies into a single cross-validation score
mean_cv_accuracy = np.mean(cv_accuracies)
print(f'Mean Cross-Validation Accuracy: {mean_cv_accuracy * 100:.2f}%')

# Train the final model on the entire training set

In [None]:
# Fit the final model on the full training set with the same hyperparameters
# that the cross-validation loop evaluates (n_estimators=200). The earlier
# value of 100 meant the reported CV accuracy described a different model
# from the one that is actually tested.
final_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    tree_method='hist',
    device='cuda',
)
final_model.fit(X_train, y_train)

# Check accuracy on test data

In [None]:
# Predict the held-out test set with the final model
y_pred = final_model.predict(X_test)

# Fraction of test samples whose predicted class matches the encoded label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Test Accuracy: {accuracy * 100:.2f}%')