In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from xgboost import XGBClassifier

In [2]:
# Load preprocessed data
with open(
    "C:/Users/syafi/Desktop/syafiq-project/new classification task/model/saved_data/preprocessed_data.pkl",
    "rb",
) as f:
    X_train, X_test, y_train, y_test = pickle.load(f)

with open(
    "C:/Users/syafi/Desktop/syafiq-project/new classification task/model/saved_data/label_mapping.pkl",
    "rb",
) as f:
    label_mapping = pickle.load(f)
class_labels = list(label_mapping.keys())

In [3]:
# Display train and test sizes
print(f"Train size: {len(X_train)}")
print(f"Test size: {len(X_test)}")

# Valid labels
valid_labels = [0, 1, 2, 3, 4]

Train size: 1344719
Test size: 336180


In [4]:
# Define a function to filter data by valid labels
def filter_and_remap_labels(X, y, valid_labels):
    mask = np.isin(y, valid_labels)
    X_filtered = X[mask]
    y_filtered = y[mask]
    return X_filtered, y_filtered

In [5]:
# Define test set expansion function
def expand_test_set(X_test, y_test, repeat_factor):
    X_test_expanded = np.repeat(X_test, repeats=repeat_factor, axis=0)
    y_test_expanded = np.repeat(y_test, repeats=repeat_factor)
    return X_test_expanded, y_test_expanded

In [6]:
# Evaluate the XGBoost model
def evaluate_xgb_model(model, X_test_subset, y_test_subset, class_labels):
    predictions = model.predict(X_test_subset)
    accuracy = accuracy_score(y_test_subset, predictions)

    unique_classes = np.unique(y_test_subset)
    labels = list(unique_classes)
    dynamic_labels = [class_labels[label] for label in labels]

    report = classification_report(
        y_test_subset,
        predictions,
        labels=labels,
        target_names=dynamic_labels,
        zero_division=0,
        output_dict=True,
    )
    return accuracy, report, predictions, dynamic_labels

In [7]:
# Define sizes for experiments
sample_sizes = [25, 50, 75, 100, 250, 500, 750, 1000, 2500, 5000, 7500, 10000, 20000, 30000, 40000, 50000]
training_sizes = [20, 40, 60, 80, 200, 400, 600, 800, 2000, 4000, 6000, 8000, 16000, 24000, 32000, 40000]
testing_sizes = [5, 10, 15, 20, 50, 100, 150, 200, 500, 1000, 1500, 2000, 4000, 6000, 8000, 10000]

# Container for results
xgb_results = []
best_accuracy = 0
best_model = None
best_sample_size = 0

In [8]:
# Iterate through sample, training, and testing sizes
for sample_size, train_size, test_size in zip(
    sample_sizes, training_sizes, testing_sizes
):
    # Filter training data
    X_train_subset, y_train_subset = X_train[:train_size], y_train[:train_size]
    X_train_subset, y_train_subset = filter_and_remap_labels(
        X_train_subset, y_train_subset, valid_labels
    )

    # Encode labels using only valid labels
    label_encoder = LabelEncoder()
    label_encoder.fit(valid_labels)
    y_train_encoded = label_encoder.transform(y_train_subset)

    # Expand and filter testing data
    X_test_subset, y_test_subset = X_test[:test_size], y_test[:test_size]
    repeat_factor = 11
    X_test_expanded, y_test_expanded = expand_test_set(
        X_test_subset, y_test_subset, repeat_factor
    )
    X_test_expanded, y_test_expanded = filter_and_remap_labels(
        X_test_expanded, y_test_expanded, valid_labels
    )
    y_test_encoded = label_encoder.transform(y_test_expanded)

    # Build and train the XGB model
    xgb_model = XGBClassifier(
        use_label_encoder=False,
        eval_metric="mlogloss",
        random_state=42,
        num_class=len(np.unique(y_train_encoded)),
    )
    xgb_model.fit(X_train_subset, y_train_encoded)

    # Evaluate the model
    accuracy, report, predictions, dynamic_labels = evaluate_xgb_model(
        xgb_model, X_test_expanded, y_test_encoded, class_labels
    )

    # Print results for the current iteration
    print(f"\nXGB Sample size {sample_size} - Accuracy: {accuracy:.4f}")
    print(
        classification_report(
            y_test_encoded,
            predictions,
            labels=np.unique(y_test_encoded),
            target_names=dynamic_labels,
            zero_division=0,
        )
    )

    # Store results
    report_flattened = {
        **{
            f"{label}_{metric}": value
            for label, metrics in report.items()
            if isinstance(metrics, dict)
            for metric, value in metrics.items()
        },
        "sample_size": sample_size,
        "train_size": train_size,
        "test_size": test_size,
        "accuracy": accuracy,
    }
    xgb_results.append(report_flattened)

    # Update the best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = xgb_model
        best_sample_size = sample_size
        print(
            f"New best model found for sample size {sample_size} with accuracy {accuracy:.4f}"
        )

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3], got [0 1 3 4]

In [None]:
# Save the best model
if best_model is not None:
    best_model_file = f"best_xgb_model_sample_size_{best_sample_size}.pkl"
    with open(best_model_file, "wb") as model_file:
        pickle.dump(best_model, model_file)
    print(f"\nBest model saved as {best_model_file} with accuracy {best_accuracy:.4f}")

In [None]:
# Save results to a CSV file for further analysis
results_df = pd.DataFrame(xgb_results)
results_df.to_csv("xgb_results.csv", index=False)
print("\nResults saved to 'xgb_results.csv'")