# MISS on Real-World Data

In [1]:

import os
os.environ["PYTHONWARNINGS"] = "ignore"

import sys
sys.path.append('../logistic_regression')

import numpy as np
from joblib import Parallel, delayed
import matplotlib.pyplot as plt

from actual import actual_effect
from IWLS import IWLS, adaptive_IWLS
from first_order import first_order, adaptive_first_order
from margin import margin

from target import target_value

from sklearn.preprocessing import LabelEncoder

from sklearn.datasets import fetch_openml
from ucimlrepo import fetch_ucirepo

In [2]:
# ---- Experiment configuration ----

# Subset sizes k evaluated for every method (also the x-axis of the plots).
ks = [1, 5, 10, 20]

# Fraction of the filtered dataset sampled for each experiment.
ratio = 0.8

# Target functions; one subplot is drawn per entry.
targets = [
    "probability",
    "abs_probability",
    "test_loss",
    "abs_test_loss",
    "avg_abs_test_loss",
    "abs_avg_test_loss",
]

# Display names of the compared methods, in the order score() returns them.
methods = [
    "IWLS",
    "Adaptive IWLS",
    "Margin-based",
    "First-order",
    "Adaptive First-order",
]

# Data source: "MNIST" or "UCI" (UCI_id is the id passed to fetch_ucirepo).
dataset = "UCI"
UCI_id = 159

# Labels kept when filtering the raw dataset.
class_combo = [(0, 1)]

# Derived sizes; num_experiments is the number of repetitions per (target, k).
num_methods = len(methods)
num_experiments = 100
num_ks = len(ks)

In [3]:
# Load the raw features and labels for the configured dataset.
if dataset == "MNIST":
    mnist = fetch_openml('mnist_784')

    # Extract features and labels
    X, y = mnist['data'], mnist['target']

    # Convert labels to integers
    y = y.astype(int)
elif dataset == "UCI":
    # fetch dataset
    UCI_dataset = fetch_ucirepo(id=UCI_id)

    # data (as pandas dataframes)
    X = UCI_dataset.data.features
    y = UCI_dataset.data.targets

    # LabelEncoder expects a 1-D label vector. Flatten a single-column
    # targets frame explicitly so sklearn does not have to do it implicitly
    # (which emits a DataConversionWarning). Any other shape is passed
    # through unchanged and left for LabelEncoder to reject.
    y = np.asarray(y)
    if y.ndim == 2 and y.shape[1] == 1:
        y = y.ravel()

    # Convert string labels to integers
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
else:
    # Fail early with a clear message instead of a NameError on X below.
    raise ValueError(f"Unknown dataset: {dataset!r} (expected 'MNIST' or 'UCI')")

# Convert both to np.array from pandas dataframe
X_raw = np.array(X)
y_raw = np.array(y)

# Print the shape of features and labels
print("Shape of features:", X_raw.shape)
print("Shape of labels:", y_raw.shape)

# Find unique labels
unique_labels = np.unique(y_raw)
print("Unique labels:", unique_labels)

# Keep only the samples whose label appears in class_combo. The mask is
# computed once, from the unfiltered labels, and applied to both arrays.
class_mask = np.isin(y_raw, class_combo).flatten()
X_raw = X_raw[class_mask]
y_raw = y_raw[class_mask]

# Reorder the dataset so samples are grouped by label in ascending order
# (label 0 first). The same permutation is applied to features and labels.
label_order = np.argsort(y_raw)
X_raw = X_raw[label_order]
y_raw = y_raw[label_order]

def data_generation(ratio, target="probability"):
    """Draw a random train/test split from the module-level X_raw / y_raw.

    A random subset containing ``int(len(y_raw) * ratio)`` samples is drawn
    first; half of it (rounded down) becomes the training set and the rest
    the pool of test candidates.

    Args:
        ratio: Fraction of the available samples to use.
        target: Name of the target function; it decides how many test
            points are returned.

    Returns:
        Tuple ``(X_train, y_train, X_test, y_test)``. For the single-point
        targets the test arrays hold one randomly chosen candidate, each
        reshaped to ``(1, -1)``; for the averaged targets they hold every
        candidate; for any other target both are ``None``.
    """
    total = len(y_raw)
    n = int(total * ratio)

    # Random subsample (without replacement) of the available rows.
    subset = np.random.permutation(total)[:n]
    X_sub, y_sub = X_raw[subset], y_raw[subset]

    n_train = n // 2
    n_test = n - n_train

    # Random choice of training rows within the subsample; everything that
    # is not selected stays, in its subsample order, as a test candidate.
    train_idx = np.random.permutation(n)[:n_train]
    in_train = np.zeros(n, dtype=bool)
    in_train[train_idx] = True

    X_train, y_train = X_sub[train_idx], y_sub[train_idx]
    X_test_full, y_test_full = X_sub[~in_train], y_sub[~in_train]

    single_point_targets = ("probability", "abs_probability", "test_loss", "abs_test_loss")
    all_points_targets = ("avg_abs_test_loss", "abs_avg_test_loss")

    if target in single_point_targets:
        # Only one test point is needed.
        pick = np.random.choice(n_test)
        X_test = X_test_full[pick].reshape(1, -1)
        y_test = y_test_full[pick].reshape(1, -1)
        return X_train, y_train, X_test, y_test

    if target in all_points_targets:
        # Use every remaining candidate as a test point.
        return X_train, y_train, X_test_full, y_test_full

    # No test point needed.
    return X_train, y_train, None, None

Shape of features: (19020, 10)
Shape of labels: (19020,)
Unique labels: [0 1]


  y = column_or_1d(y, warn=True)


In [4]:
def score(ratio, k, target):
    """Evaluate every selection method once on a freshly drawn data split.

    Args:
        ratio: Fraction of the dataset to sample (passed to data_generation).
        k: Size of the subset each method selects.
        target: Name of the target function being evaluated.

    Returns:
        1-D array with one actual_effect value per method, in the order
        IWLS, adaptive IWLS, margin-based, first-order, adaptive first-order.
    """
    X_train, y_train, X_test, y_test = data_generation(ratio, target=target)
    data = (X_train, y_train, X_test, y_test)
    original_value = target_value(*data, target=target)

    ind_n, ind_p = margin(X_train, y_train)

    def effect(indices):
        # NOTE(review): presumably the effect on the target of the given
        # training indices, relative to original_value - confirm in actual.py.
        return actual_effect(*data, indices, original_value, target=target)

    # Non-adaptive methods return a full ordering that is truncated to k;
    # adaptive methods receive k directly.
    iwls_effect = effect(IWLS(*data, target=target)[:k])
    adaptive_iwls_effect = effect(adaptive_IWLS(*data, k=k, target=target))
    # The margin baseline yields two index lists; the better one is kept.
    margin_effect = max(effect(ind_n[:k]), effect(ind_p[:k]))
    first_order_effect = effect(first_order(*data, target=target)[:k])
    adaptive_first_order_effect = effect(adaptive_first_order(*data, k=k, target=target))

    return np.array([
        iwls_effect,
        adaptive_iwls_effect,
        margin_effect,
        first_order_effect,
        adaptive_first_order_effect,
    ])

In [5]:
# ranks.shape = (num_methods, num_experiments)
def Borda_count(ranks, weights=(5, 4, 3, 2, 1)):
    """Return the total weighted Borda count of each method.

    Within every experiment (column) the methods are ranked by their value,
    highest first. Tied methods share the best rank of their group
    ("min" ranking), so they all receive the same, higher weight.

    Args:
        ranks: Array of shape (num_methods, num_experiments) holding the
            value achieved by each method in each experiment (larger is
            better).
        weights: Sequence whose entry ``i`` is the number of points awarded
            for finishing at rank ``i`` (0 = best). It must be long enough
            to be indexed by every rank that occurs.

    Returns:
        Integer array of shape (num_methods,) with the points of each
        method summed over all experiments.
    """
    num_methods, num_experiments = ranks.shape
    weights = np.asarray(weights)

    # tie-handling. ref: https://stackoverflow.com/questions/39059371/can-numpys-argsort-give-equal-element-the-same-rank
    def rankmin(x):
        # The rank of a value is the number of strictly smaller elements.
        u, inv, counts = np.unique(x, return_inverse=True, return_counts=True)
        csum = np.zeros_like(counts)
        csum[1:] = counts[:-1].cumsum()
        return csum[inv]

    weighted_borda_count = np.zeros((num_methods, num_experiments), dtype=int)

    for experiment_idx in range(num_experiments):
        # Negate so that the largest value receives rank 0.
        method_ranks = rankmin(-1 * ranks[:, experiment_idx])
        weighted_borda_count[:, experiment_idx] = weights[method_ranks]

    total_weighted_borda_count = weighted_borda_count.sum(axis=1)

    return total_weighted_borda_count

In [6]:
fig, axs = plt.subplots(2, 3, figsize=(15, 10))  # Create a 2x3 grid of subplots

for target_idx, target in enumerate(targets):
    # Every (experiment, k) pair is an independent call to score(). The
    # generator iterates experiments in the outer loop and ks in the inner
    # one, so the flat result list reshapes to (experiment, k, method).
    scores_array = np.array(Parallel(n_jobs=50)(delayed(score)(ratio, k, target) for _ in range(num_experiments) for k in ks))
    scores_array = scores_array.reshape((num_experiments, num_ks, -1))

    scores_method_ks_combo = scores_array.swapaxes(0, 2) # method, k, combo
    scores_ks_method_combo = scores_method_ks_combo.swapaxes(0, 1) # k, method, combo

    # One Borda count per k -> shape (num_ks, num_methods). These few calls
    # are cheap, so they run inline rather than through a worker pool.
    Borda_result = np.array([Borda_count(scores_ks_method_combo[k_idx]) for k_idx in range(num_ks)])

    # Plot in the corresponding subplot
    row_idx, col_idx = divmod(target_idx, 3)  # Calculate subplot index
    ax = axs[row_idx, col_idx]
    for method_idx, method_name in enumerate(methods):
        ax.plot(ks, Borda_result[:, method_idx], label=method_name)

    ax.set_title(f'Target={target}')
    ax.set_xlabel('k')
    ax.set_ylabel('Borda Count')
    ax.legend(methods)

    ax.set_xticks(ks)

# Leave a margin at the top of the figure for the overall title.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

# Name the dataset that was actually loaded instead of hard-coding "UCI".
plt.suptitle(f'MISS with {dataset} Dataset', fontsize=16)

plt.show()