In [1]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from keras.utils import to_categorical

In [2]:
# Load preprocessed data
# Directory that holds the pickled artefacts. It is defined once so the location
# only has to be edited in a single place when the notebook runs elsewhere.
SAVED_DATA_DIR = "C:/Users/syafi/Desktop/syafiq-project/new classification task/model/saved_data"

# NOTE(review): pickle.load can execute arbitrary code embedded in the file, so
# only load pickles that were produced by a trusted preprocessing run.
with open(f"{SAVED_DATA_DIR}/preprocessed_data.pkl", "rb") as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

with open(f"{SAVED_DATA_DIR}/label_mapping.pkl", "rb") as f:
    label_mapping = pickle.load(f)

# evaluate_xgb_model looks names up as class_labels[label], so this list relies
# on label_mapping's key order matching the integer label encoding.
# TODO confirm against the preprocessing step that wrote label_mapping.pkl.
class_labels = list(label_mapping.keys())

# Number of classes
num_classes = len(class_labels)

# Display train and test sizes
print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

Train size: 1344719
Test size: 336180


In [3]:
# Define test set expansion function
def expand_test_set(X_test, y_test, repeat_factor):
    """Return the test features and labels with every entry repeated in place.

    Each row of ``X_test`` (axis 0) is repeated ``repeat_factor`` times
    consecutively, and each element of ``y_test`` is repeated the same number
    of times, so row ``i`` of the expanded features still pairs with element
    ``i`` of the expanded labels.

    Args:
        X_test: Array-like of test features; repeated along axis 0.
        y_test: Array-like of test labels. It is repeated without an ``axis``
            argument, which NumPy applies to the flattened input.
        repeat_factor: Number of copies of each entry.

    Returns:
        Tuple ``(X_test_expanded, y_test_expanded)``.
    """
    repeated_features = np.repeat(X_test, repeat_factor, axis=0)
    repeated_labels = np.repeat(y_test, repeat_factor)
    return repeated_features, repeated_labels

In [4]:
# Evaluate the XGBoost model
def evaluate_xgb_model(model, X_test_subset, y_test_subset, class_labels):
    """Score a model on a test subset and build a per-class report.

    Args:
        model: Object whose ``predict`` accepts an ``xgb.DMatrix``. Its output
            is reduced with ``argmax`` over axis 1, so it is expected to be
            two-dimensional (one column per class).
        X_test_subset: Test features, wrapped in an ``xgb.DMatrix``.
        y_test_subset: True labels for ``X_test_subset``; their unique values
            are used as indices into ``class_labels``.
        class_labels: Sequence of display names indexed by label value.

    Returns:
        Tuple ``(accuracy, report, predictions, dynamic_labels)`` where
        ``report`` is the ``classification_report`` dictionary restricted to
        the classes present in ``y_test_subset``, ``predictions`` holds the
        argmax class indices and ``dynamic_labels`` the names of those classes.
    """
    # Highest-scoring column per row is taken as the predicted class.
    raw_scores = model.predict(xgb.DMatrix(X_test_subset))
    predicted_classes = np.argmax(raw_scores, axis=1)

    accuracy = accuracy_score(y_test_subset, predicted_classes)

    # Report only on classes that occur in the true labels, and keep the
    # display names in the same order as those label values.
    present_classes = list(np.unique(y_test_subset))
    present_names = [class_labels[class_id] for class_id in present_classes]

    report = classification_report(
        y_test_subset,
        predicted_classes,
        labels=present_classes,
        target_names=present_names,
        zero_division=0,
        output_dict=True,
    )
    return accuracy, report, predicted_classes, present_names

In [5]:
# Define sizes for experiments
# Every sample size is split 80/20 into a training size and a testing size.
# All sample sizes are multiples of 5, so the integer arithmetic is exact.
sample_sizes = [
    25, 50, 75, 100,
    250, 500, 750, 1000,
    2500, 5000, 7500, 10000,
    20000, 30000, 40000, 50000,
]
training_sizes = [size * 4 // 5 for size in sample_sizes]
testing_sizes = [size - train for size, train in zip(sample_sizes, training_sizes)]

# Accumulators filled in by the experiment loop
xgb_results = []       # one flattened metrics dict per run
best_accuracy = 0      # highest accuracy seen so far
best_model = None      # model that achieved best_accuracy
best_sample_size = 0   # sample size of that model

In [6]:
# Iterate through sample, training and testing sizes

# Start from a clean slate so that re-running this cell neither appends
# duplicate rows to xgb_results nor compares against a model from an earlier run.
xgb_results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

# Settings that are identical for every run are defined once, outside the loop.
repeat_factor = 11  # number of copies of each test row passed to expand_test_set
params = {
    "objective": "multi:softprob",
    "num_class": num_classes,
    "eval_metric": "mlogloss",
    "learning_rate": 0.1,
    "max_depth": 6,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42,
}

for sample_size, train_size, test_size in zip(sample_sizes, training_sizes, testing_sizes):
    # Select subset of training and testing data (the leading rows, as stored)
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]

    # Expand the test set for alignment
    X_test_expanded, y_test_expanded = expand_test_set(X_test_subset, y_test_subset, repeat_factor)

    # Train the booster. No DMatrix is built for the test data here because
    # evaluate_xgb_model wraps X_test_expanded itself; verbose_eval is omitted
    # because no evaluation sets are passed to xgb.train.
    dtrain = xgb.DMatrix(X_train_subset, label=y_train_subset)
    xgb_model = xgb.train(params, dtrain, num_boost_round=100)

    # Evaluate the XGBoost model
    accuracy, report, predictions, dynamic_labels = evaluate_xgb_model(
        xgb_model, X_test_expanded, y_test_expanded, class_labels
    )

    # Print results for the current iteration
    print(f"\nXGBoost sample size {sample_size} - Accuracy {accuracy:.4f}")
    print(classification_report(
        y_test_expanded,
        predictions,
        labels=np.unique(y_test_expanded),
        target_names=dynamic_labels,
        zero_division=0
    ))

    # Store results: flatten every nested entry of the report dictionary into
    # "<entry>_<metric>" columns and add the run's sizes and accuracy.
    # Entries of the report that are not dictionaries are skipped.
    report_flattened = {
        **{
            f"{label}_{metric}": value
            for label, metrics in report.items()
            if isinstance(metrics, dict)
            for metric, value in metrics.items()
        },
        "sample_size": sample_size,
        "train_size": train_size,
        "test_size": test_size,
        "accuracy": accuracy
    }
    xgb_results.append(report_flattened)

    # Update the best model (strictly better accuracy only, so ties keep the
    # earlier, smaller run)
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = xgb_model
        best_sample_size = sample_size
        print(f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}")


XGBoost sample size 25 - Accuracy 0.4000
                                      precision    recall  f1-score   support

               Autonomic Dysfunction       0.33      1.00      0.50        11
      Chronic Fatigue Syndrome (CFS)       0.00      0.00      0.00        11
                             Healthy       0.00      0.00      0.00        11
                        Hypertension       1.00      1.00      1.00        11
Respiratory Disease (COPD or Asthma)       0.00      0.00      0.00        11

                           micro avg       0.50      0.40      0.44        55
                           macro avg       0.27      0.40      0.30        55
                        weighted avg       0.27      0.40      0.30        55

New best model found for sample size 25 with accuracy 0.4000

XGBoost sample size 50 - Accuracy 0.4000
                                      precision    recall  f1-score   support

                         Arrhythmias       0.50      1.00      0.67    

In [7]:
# Persist the best-scoring model with pickle, if any run produced one.
# The file name records the sample size the model was trained at.
if best_model is not None:
    best_model_file = f"best_xgb_model_sample_size_{best_sample_size}.pkl"
    with open(best_model_file, mode="wb") as model_file:
        pickle.dump(best_model, model_file)
    print(f"\nBest model saved as {best_model_file} with accuracy {best_accuracy:.4f}")


Best model saved as best_xgb_model_sample_size_30000.pkl with accuracy 0.9968


In [8]:
# Write the per-run metrics (one row per entry of xgb_results) to a CSV file
# for further analysis; the DataFrame index is not written.
results_df = pd.DataFrame(data=xgb_results)
results_df.to_csv("xgb_results.csv", index=False)
print("\nResults saved to 'xgb_results.csv'")


Results saved to 'xgb_results.csv'
