In [None]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.kernel_ridge import KernelRidge
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.model_selection import train_test_split

In [None]:
def kernel_regression(X_train, y_train, x_test, kernel, alpha):
    """Fit a kernel ridge model and predict at one query point.

    Parameters
    ----------
    X_train, y_train
        Training inputs and targets, handed unchanged to ``KernelRidge.fit``.
    x_test
        A single query point; it is reshaped to one row before prediction.
    kernel
        Kernel specification forwarded to ``KernelRidge``.
    alpha
        Regularization strength forwarded to ``KernelRidge``.

    Returns
    -------
    The result of ``KernelRidge.predict`` for the single reshaped query row.
    """
    model = KernelRidge(kernel=kernel, alpha=alpha)
    model.fit(X_train, y_train)
    query_row = x_test.reshape(1, -1)
    return model.predict(query_row)

In [None]:
def brute_force_removal(X_train, y_train, x_test, max_size, kernel, alpha):
    """Exhaustively find, for each subset size, the training rows whose removal
    raises the prediction at ``x_test`` the most.

    For every size ``k`` in ``1..max_size``, each combination of ``k`` row
    indices is deleted from the training data, the model is refit through
    ``kernel_regression`` and the new prediction is compared with the
    prediction obtained from the full training set. The combination with the
    largest signed difference (reduced minus original) wins.

    Note that the number of refits grows combinatorially with ``max_size``.

    Parameters
    ----------
    X_train, y_train
        Training inputs and targets; rows are removed along axis 0.
    x_test
        The single query point at which predictions are compared.
    max_size
        Largest subset size to search.
    kernel, alpha
        Forwarded to ``kernel_regression``.

    Returns
    -------
    NumPy array of length ``max_size`` initialised with ``None``; entry
    ``k - 1`` holds the winning tuple of row indices for size ``k`` and stays
    ``None`` when no combination of that size was selected.
    """
    n_rows = X_train.shape[0]
    winners = np.full((max_size), None)

    # Prediction with nothing removed: the reference for every comparison.
    baseline = kernel_regression(X_train, y_train, x_test, kernel, alpha)

    for k in range(1, max_size + 1):
        slot = k - 1
        top_gain = -float("inf")

        for dropped in combinations(range(n_rows), k):
            # Refit on the data that remains once the candidate rows are gone.
            kept_X = np.delete(X_train, dropped, axis=0)
            kept_y = np.delete(y_train, dropped, axis=0)
            gain = kernel_regression(kept_X, kept_y, x_test, kernel, alpha) - baseline

            # Keep only strict improvements over the best candidate so far.
            if not gain > top_gain:
                continue
            top_gain = gain
            winners[slot] = dropped

        print(f"Best subset of size {k}: {winners[slot]}")

    return winners

In [None]:
def influence(X_train, y_train, x_test, kernel, alpha):
    """Estimate how removing each training point changes the prediction at ``x_test``.

    Kernel ridge regression predicts ``f(x) = k(x)^T (K + alpha I)^{-1} y``.
    With ``w = (K + alpha I)^{-1} k(x_test)``, hat matrix
    ``H = (K + alpha I)^{-1} K`` and training residuals ``r = y - H y``, the
    leave-one-out identity gives the change in the prediction when sample
    ``i`` is dropped and the model refit:

        f_without_i(x_test) - f(x_test) = -w_i * r_i / (1 - H_ii)

    This is the quantity ``brute_force_removal`` maximizes for subsets of
    size 1, so the largest entries of the returned array are the points whose
    removal is expected to raise the prediction the most.

    The kernel matrices are built with ``pairwise_kernels(..., metric=kernel)``
    using that function's default kernel parameters, which corresponds to a
    ``KernelRidge(alpha=alpha, kernel=kernel)`` constructed without explicit
    ``gamma`` / ``degree`` / ``coef0``.

    As a side effect the function prints the largest off-diagonal entry of
    ``H`` and shows histograms of the diagonal (leverage scores) and of the
    off-diagonal entries of ``H``.

    Parameters
    ----------
    X_train
        Training inputs, one row per sample.
    y_train
        Training targets; assumed to be one value per training row.
    x_test
        A single query point; reshaped to one row.
    kernel
        Kernel metric accepted by ``sklearn.metrics.pairwise.pairwise_kernels``
        (for example ``'rbf'``, ``'linear'``, ``'poly'``).
    alpha
        Scalar ridge penalty; must be strictly positive so that every
        ``1 - H_ii`` is non-zero.

    Returns
    -------
    numpy.ndarray
        One signed influence value per training row.

    Raises
    ------
    ValueError
        If ``alpha`` is not strictly positive.
    """
    if not alpha > 0:
        raise ValueError("alpha must be strictly positive to compute leave-one-out influences")

    X_train = np.asarray(X_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float)
    query_row = np.asarray(x_test, dtype=float).reshape(1, -1)
    n_samples = X_train.shape[0]

    # Gram matrix of the training set and kernel vector between it and the query.
    K = pairwise_kernels(X_train, metric=kernel)
    k_test = pairwise_kernels(X_train, query_row, metric=kernel).ravel()

    regularized_K = K + alpha * np.identity(n_samples)

    # Solve linear systems instead of forming an explicit inverse.
    weights = np.linalg.solve(regularized_K, k_test)  # w = (K + alpha I)^{-1} k(x_test)
    H = np.linalg.solve(regularized_K, K)             # hat matrix: fitted values are H @ y

    leverage = np.diagonal(H)
    residuals = y_train - H @ y_train

    # Exact leave-one-out change of the prediction at x_test for each sample.
    influences = -weights * residuals / (1.0 - leverage)

    off_diagonal = H[~np.eye(*H.shape, dtype=bool)]
    if off_diagonal.size:  # a single training sample has no off-diagonal entries
        print(f"Largest off-diagonal entry of H: {off_diagonal.max()}")

    # Empirical distributions of the leverage scores and of the off-diagonal entries.
    fig, (ax_leverage, ax_off_diag) = plt.subplots(1, 2, figsize=(12, 6))

    ax_leverage.hist(leverage, bins=30, color='blue', alpha=0.7)
    ax_leverage.set_xlabel('Leverage Score')
    ax_leverage.set_ylabel('Frequency')
    ax_leverage.set_title('Leverage Scores Empirical Distribution')

    ax_off_diag.hist(off_diagonal, bins=30, color='green', alpha=0.7)
    ax_off_diag.set_xlabel('Off-Diagonal Entry of H')
    ax_off_diag.set_ylabel('Frequency')
    ax_off_diag.set_title('Off-Diagonal Entries of H Empirical Distribution')

    fig.tight_layout()
    plt.show()

    return influences

In [None]:
def kernel_regression_and_influence(X, y, n, max_size, kernel, alpha):
    """Compare brute-force subset removal with the influence estimate on one test point.

    The data is split so that ``n`` rows are used for training; the first row
    of the held-out part is the query point. ``brute_force_removal`` then
    searches subsets of size ``1..max_size`` and ``influence`` produces a
    per-sample estimate, whose ``max_size + 3`` largest entries are printed.
    All reported indices refer to rows of the training split, not of ``X``.

    Parameters
    ----------
    X, y
        Full dataset; ``X`` must be two-dimensional (both ``X.shape[0]`` and
        ``X.shape[1]`` are printed).
    n
        Number of training rows to keep; must leave at least one held-out row.
    max_size
        Largest removal-subset size passed to ``brute_force_removal``.
    kernel, alpha
        Forwarded to ``brute_force_removal`` and ``influence``.

    Returns
    -------
    None
        Results are reported through ``print`` and the plots drawn by ``influence``.
    """
    print(f"Dataset size: {X.shape[0]} samples, {X.shape[1]} features")

    # Ask for exactly n training rows. Passing the float proportion
    # 1 - n / N as test_size can be off by one row after floating-point
    # rounding, because the test size is obtained by rounding up.
    X_train, X_test, y_train, _ = train_test_split(
        X, y, train_size=int(n), random_state=42
    )
    query_point = X_test[0]

    best_subset = brute_force_removal(X_train, y_train, query_point, max_size, kernel, alpha)
    print(f"Best subset: {best_subset}")

    influences = influence(X_train, y_train, query_point, kernel, alpha)
    # Largest influence values first; a few more than max_size are shown for context.
    top_indices = np.argsort(influences)[-(max_size+3):][::-1]
    print(f"Top {max_size+3} Influential Data Points Estimate: {top_indices}")

In [None]:
# Synthetic two-feature regression problem; the fixed seed makes the data reproducible.
X, y = datasets.make_regression(n_samples=100, n_features=2, noise=0.1, random_state=42)

# Kernel ridge settings: kernel type (other options include 'linear', 'poly',
# 'sigmoid', ...) and the regularization parameter.
kernel, alpha = 'rbf', 1.0

# n: number of training samples; max_size: largest subset size searched by brute force.
n, max_size = 50, 5

# Run the brute-force search and the influence analysis.
kernel_regression_and_influence(X, y, n=n, max_size=max_size, kernel=kernel, alpha=alpha)