In [1]:
import warnings
warnings.filterwarnings("ignore")
from joblib import Parallel, delayed
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from itertools import combinations

from IWLS import IWLS, adaptive_IWLS
from first_order import first_order
from margin import margin

In [2]:
# --- experiment configuration ---
n, k = 50, 5          # total training points; largest removal-subset size searched
job_n = 50            # passed to joblib.Parallel as n_jobs by the exhaustive search

seed = 22
cov = 0.5             # scale of the identity covariance shared by both classes

target = "test_loss"
np.random.seed(seed)

# --- synthetic two-class data ---
# Two 2-D Gaussians with the same covariance, centred at (-1, 0) and (1, 0).
mean_n = np.array([-1, 0])
mean_p = np.array([1, 0])
covariance = cov * np.eye(2)

n_per_class = n // 2
x_n = np.random.multivariate_normal(mean_n, covariance, n_per_class)
x_p = np.random.multivariate_normal(mean_p, covariance, n_per_class)

y_n = np.zeros(n_per_class)  # 0 labels
y_p = np.ones(n_per_class)   # 1 labels

# Negative-class rows first, positive-class rows second.
X_train = np.concatenate([x_n, x_p], axis=0)
y_train = np.concatenate([y_n, y_p])

# Single test point: pick its class with probability 1/2, then sample from that
# class. The label is drawn before the point, so the RNG call order is fixed.
y_test = 0 if np.random.rand() < 0.5 else 1
x_test = np.random.multivariate_normal(mean_p if y_test else mean_n, covariance)

In [3]:
def actual_effect(X_train, y_train, x_test, y_test, subset_to_remove, original_score, target="probability"):
    """Refit without the given training rows and return the change in the target score.

    Parameters
    ----------
    X_train, y_train : array-like
        Full training features and labels. The rows listed in
        ``subset_to_remove`` are dropped along axis 0 before refitting.
    x_test : ndarray
        A single test point; it is reshaped to ``(1, -1)`` before prediction.
    y_test : int
        Label of the test point. Only used when ``target == "test_loss"``.
    subset_to_remove : sequence of int
        Indices of the training rows to remove.
    original_score : float
        Score of the model fitted on the full training set, for the same target.
    target : {"probability", "train_loss", "test_loss"}
        - "probability": predicted probability of class 1 at ``x_test``.
        - "train_loss": log loss over the *remaining* training rows.
        - "test_loss": log loss at ``(x_test, y_test)``.

    Returns
    -------
    float
        ``reduced_score - original_score``: a probability difference for
        "probability", a log-loss difference for the two loss targets.

    Raises
    ------
    ValueError
        If ``target`` is not one of the supported names.
    """
    # Validate before refitting so a typo in `target` fails immediately with a
    # clear message instead of an UnboundLocalError after the fit.
    if target not in ("probability", "train_loss", "test_loss"):
        raise ValueError(
            f"Unsupported target {target!r}; expected 'probability', 'train_loss' or 'test_loss'."
        )

    # Train a Logistic Regression classifier on the reduced training set
    reduced_X_train = np.delete(X_train, subset_to_remove, axis=0)
    reduced_y_train = np.delete(y_train, subset_to_remove, axis=0)
    reduced_lr = LogisticRegression(penalty=None).fit(reduced_X_train, reduced_y_train)

    # Score the refitted model under the requested target
    if target == "probability":
        reduced_score = reduced_lr.predict_proba(x_test.reshape(1, -1))[0][1]
    elif target == "train_loss":
        reduced_score = log_loss(reduced_y_train, reduced_lr.predict_proba(reduced_X_train), labels=[0, 1])
    else:  # target == "test_loss"
        reduced_score = log_loss([y_test], reduced_lr.predict_proba(x_test.reshape(1, -1)), labels=[0, 1])

    # Change in score caused by removing the subset
    return reduced_score - original_score

# Exhaustive ground truth: subsets are ranked separately for each exact size s
# (size == s, not size <= s).
def actual(X_train, y_train, x_test, y_test, k=10, job_n=50, target="probability"):
    """Brute-force the effect of removing every training subset of size 1..k.

    For each size ``s`` in ``1..k`` every combination of ``s`` training rows is
    removed in turn, a model is refitted (see ``actual_effect``), and the
    subsets are ranked by the resulting score change, largest first.

    This refits one model per subset, i.e. sum over s of C(n_train, s) fits,
    so it is only feasible for small ``n_train`` and ``k``.

    Parameters
    ----------
    X_train, y_train : ndarray
        Training features and labels.
    x_test : ndarray
        A single test point; it is reshaped to ``(1, -1)`` before prediction.
    y_test : int
        Label of the test point. Only used when ``target == "test_loss"``.
    k : int
        Largest subset size to enumerate.
    job_n : int
        Forwarded to ``joblib.Parallel`` as ``n_jobs``.
    target : {"probability", "train_loss", "test_loss"}
        Quantity whose change is measured.

    Returns
    -------
    list
        ``[scores, best_subset_combination, best_subset]`` where

        - ``scores`` holds the score changes for the subsets of size ``k`` only
          (the last size processed), in enumeration order, i.e. *not* sorted;
          it is ``[]`` when ``k < 1``.
        - ``best_subset_combination[s - 1]`` is the array of all size-``s``
          subsets sorted by descending score change.
        - ``best_subset[s - 1]`` is the top-ranked subset of size ``s``.

    Raises
    ------
    ValueError
        If ``target`` is not one of the supported names.
    """
    # Reject bad targets before fitting anything or launching workers.
    if target not in ("probability", "train_loss", "test_loss"):
        raise ValueError(
            f"Unsupported target {target!r}; expected 'probability', 'train_loss' or 'test_loss'."
        )

    # Fit the reference model on the full training set
    original_lr = LogisticRegression(penalty=None).fit(X_train, y_train)

    # Per-size winners and per-size full rankings
    best_subset = np.full((k), None)
    best_subset_combination = []
    # Defined up front so that k < 1 returns an empty result instead of crashing.
    scores = []

    if target == "probability":
        # Predicted probability of the positive class at the test point
        original_score = original_lr.predict_proba(x_test.reshape(1, -1))[0][1]
    elif target == "train_loss":
        original_score = log_loss(y_train, original_lr.predict_proba(X_train), labels=[0, 1])
    else:  # target == "test_loss"
        original_score = log_loss([y_test], original_lr.predict_proba(x_test.reshape(1, -1)), labels=[0, 1])

    # Loop over different subset sizes from 1 to k
    for subset_size in range(1, k + 1):
        # Enumerate the subsets once; the same list feeds the workers and the
        # ranking step, so scores[i] always belongs to combinations_list[i].
        combinations_list = list(combinations(range(X_train.shape[0]), subset_size))

        scores = Parallel(n_jobs=job_n)(
            delayed(actual_effect)(X_train, y_train, x_test, y_test, subset_to_remove, original_score, target)
            for subset_to_remove in combinations_list
        )

        # Rank subsets by descending score change
        sort_subset_combinations = np.array(combinations_list)[np.argsort(scores)[::-1]]
        best_subset_combination.append(sort_subset_combinations)
        best_subset[subset_size - 1] = sort_subset_combinations[0]

    return [scores, best_subset_combination, best_subset]

In [4]:
# Brute-force ground truth: rank every removal subset of size 1..k by its effect
# on `target`. `scores` only covers the subsets of size k (the last size run).
scores, best_subset_combination, best_subset = actual(
    X_train,
    y_train,
    x_test,
    y_test,
    k=k,
    job_n=job_n,
    target=target,
)

# Top-ranked subset among those of size exactly k (last slot of the per-size winners).
best_k_subset = best_subset[-1]

In [5]:
# Run the approximation methods imported from the local IWLS, first_order and
# margin modules on the same data and target as the exhaustive search.
# NOTE(review): their return formats are not visible in this notebook — confirm
# against the modules before comparing with `best_k_subset`.
IWLS_best = IWLS(X_train, y_train, x_test, y_test, target=target)
adaptive_IWLS_best_k = adaptive_IWLS(X_train, y_train, x_test, y_test, k=k, target=target)
# presumably per-class index sets (negative / positive) — verify against margin()
ind_n, ind_p = margin(X_train, y_train)
FO_best = first_order(X_train, y_train, x_test, y_test, target=target)
# Full ranking of the size-k subsets from the exhaustive search (last entry of
# the per-size list returned by actual()).
best_k_subset_combination = best_subset_combination[-1]

In [6]:
# Baseline: refit on the full training set and score it under `target`.
# Fail loudly on an unsupported target; otherwise `original_score` would be left
# undefined, or silently keep a stale value from an earlier run of this cell.
if target not in ("probability", "train_loss", "test_loss"):
    raise ValueError(
        f"Unsupported target {target!r}; expected 'probability', 'train_loss' or 'test_loss'."
    )

original_lr = LogisticRegression(penalty=None).fit(X_train, y_train)

if target == "probability":
    original_score = original_lr.predict_proba(x_test.reshape(1, -1))[0][1] # We're looking at the predicted probability of the positive class
elif target == "train_loss":
    original_score = log_loss(y_train, original_lr.predict_proba(X_train), labels=[0, 1])
elif target == "test_loss":
    original_score = log_loss([y_test], original_lr.predict_proba(x_test.reshape(1, -1)), labels=[0, 1])