# Classification on the Digits Dataset

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time

import multiprocessing as mp
from ebc.sequential.iterative_with_convexification import SensitivityBasedFW
from splitting import split_based_on_ML, split_randomly, distribute
from parallelization import parallelize

from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.datasets import load_digits

import warnings
warnings.filterwarnings(action = "ignore")

import pickle

## Load data

In [None]:
# Fix the global NumPy seed so the train/test split below is reproducible.
np.random.seed(123)

# Load the digits data once; return_X_y yields the (data, target) pair directly.
X, y = load_digits(return_X_y = True)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
print(X.shape, len(X_train), len(X_test))

# Indicator (one-vs-all) encoding of the labels. The binarizer is fitted on the
# training labels only; the test labels are merely transformed so that both
# matrices share the same column layout even if a class is absent from the test split.
lb = LabelBinarizer()
y_train_bin = lb.fit_transform(y_train)
y_test_bin = lb.transform(y_test)

## Log-likelihood Definitions

In [None]:
def log_likelihood(params, X, y, weights):
    '''
    Per-observation log-likelihood of a softmax (multinomial) model.

    The class scores of an observation are ``-(X @ beta)`` and are normalized
    over the classes of that observation, so each row of class probabilities
    sums to one. Probabilities are floored at 1e-15 before they enter the
    log-likelihood.

    Parameters:
    ----------
    params: np.ndarray with d * c entries, reshaped to (d, c)
    X: np.ndarray(shape = (N, d))
    y: np.ndarray(shape = (N, c)), indicator matrix of the labels
    weights: not used here; kept so that all likelihood helpers share one signature

    Returns:
    ----------
    log_lik: np.ndarray(shape = (X.shape[0], 1))
    '''
    _, d = X.shape
    c = y.shape[1]
    beta = params.reshape(d, c)

    scores = -(X @ beta)
    # Subtracting the row maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing or underflowing to an all-zero row.
    scores = scores - np.max(scores, axis = 1, keepdims = True)
    log_norm = np.log(np.sum(np.exp(scores), axis = 1, keepdims = True))
    log_probs = scores - log_norm

    # Floor at log(1e-15) so a vanishing probability cannot dominate the sum.
    log_probs = np.maximum(log_probs, np.log(1e-15))

    ll = np.sum(y * log_probs, axis = 1)
    return ll.reshape(-1, 1)

def summed_log_likelihood(params, X, y, weights):
    '''
    Total log-likelihood: the per-observation values of log_likelihood summed
    over all observations.
    '''
    per_observation = log_likelihood(params, X, y, weights)
    return per_observation.sum()

def negative_summed_log_likelihood(params, X, y, weights):
    '''
    Sign-flipped summed_log_likelihood, i.e. a quantity to minimize.
    '''
    total = summed_log_likelihood(params, X, y, weights)
    return -total

def log_posterior(params, X, y, weights):
    '''
    Weighted log-likelihood: the product of the transposed weights with the
    per-observation values returned by log_likelihood.
    '''
    per_observation = log_likelihood(params, X, y, weights)
    return weights.T @ per_observation

## Test

In [None]:
# Coreset sizes to evaluate: 100, 110, ..., 300.
coreset_sizes = np.arange(10, 31) * 10
len(coreset_sizes)

In [None]:
# Sequential baseline: every coreset is built in one run over the full training set.
# Outer lists hold one entry per repetition, inner lists one entry per coreset size.
acc_sequential = []
time_sequential = []

# Length of the flattened (features x classes) parameter vector.
n_params = X_train.shape[1] * y_train_bin.shape[1]

# Settings passed to SensitivityBasedFW.run through `norm_attributes`.
na = {
    "log_likelihood": log_likelihood,
    "log_likelihood_start_value": np.ones((n_params, 1)),
    "S": int(0.3 * len(X_train)),
    "log_likelihood_gradient": None,
    "approx": "MCMC",
    "MCMC_subs_size": int(0.7 * len(X_train)),
    "log_posterior": log_posterior,
    "log_posterior_start_value": np.ones((n_params, 1)),
}

for k in range(10):
    # A distinct, fixed seed per repetition.
    np.random.seed(120 + k)
    acc_sequential_k = []
    time_sequential_k = []

    for i in coreset_sizes:
        print(k, i)

        # Only the coreset construction is timed.
        t0 = time.time()
        sbfw = SensitivityBasedFW(X_train, y_train_bin)
        w, I = sbfw.run(
            k = i,
            likelihood_gram_matrix = None,
            norm = "2",
            norm_attributes = na,
        )
        elapsed = time.time() - t0
        time_sequential_k.append(elapsed)

        # Fit a classifier on the weight-scaled training data and score it on the test set.
        lr = LogisticRegression()
        lr.fit(X_train * w, y_train)
        test_accuracy = accuracy_score(y_test, lr.predict(X_test))
        acc_sequential_k.append(test_accuracy)

    acc_sequential.append(acc_sequential_k)
    time_sequential.append(time_sequential_k)

print(f"Accuracy: {acc_sequential}")
print(f"Time: {time_sequential}")

In [None]:
# Parallel variant: the training indices are split first, then the coreset
# algorithm is run through `parallelize`.
# Result nesting: repetition -> coreset size -> [random split, ML split].
acc_parallel = []
time_parallel = []

# Length of the flattened (features x classes) parameter vector.
n_params = X_train.shape[1] * y_train_bin.shape[1]

# Settings forwarded to the algorithm; the subsample sizes are a tenth of the
# fractions used in the sequential cell.
na = {
    "log_likelihood": log_likelihood,
    "log_likelihood_start_value": np.ones((n_params, 1)),
    "S": int(0.3 * 0.1 * len(X_train)),
    "log_likelihood_gradient": None,
    "approx": "MCMC",
    "MCMC_subs_size": int(0.1 * len(X_train)),
    "log_posterior": log_posterior,
    "log_posterior_start_value": np.ones((n_params, 1)),
}

for k in range(10):
    # A distinct, fixed seed per repetition.
    np.random.seed(120 + k)
    acc_parallel_k = []
    time_parallel_k = []

    for i in coreset_sizes:
        print(k, i)
        acc_parallel_i = []
        time_parallel_i = []

        for ind, strat in enumerate([split_randomly, split_based_on_ML]):
            # The timer covers both the splitting step and the parallel run.
            t0 = time.time()

            # Step 1: distribute the training indices.
            if ind == 0:
                full_inds = strat(X_train)
            else:
                # ML split: take the flattened means of a Gaussian mixture fitted
                # to the least-squares coefficients as parameters, then distribute
                # by the normalized absolute log-likelihood of each observation.
                gm = GaussianMixture(X_train.shape[1])
                gm.fit(np.linalg.pinv(X_train) @ y_train_bin)
                params = gm.means_.flatten()
                log_liks = log_likelihood(params, X_train, y_train_bin, None)
                abs_log_liks = np.abs(log_liks)
                probs = (abs_log_liks / np.sum(abs_log_liks)).flatten()
                full_inds = distribute(probs)

            print("running")
            # Step 2: run with the requested coreset size divided by the CPU count.
            w = parallelize(
                alg = SensitivityBasedFW,
                x = X_train,
                k = int(i // mp.cpu_count()),
                norm = "2",
                na = na,
                distributed_indices = full_inds,
                y = y_train_bin,
            )

            time_parallel_i.append(time.time() - t0)
            print("finished running")

            # Compute the test accuracy of a classifier fitted on the weight-scaled data.
            lr = LogisticRegression()
            lr.fit(X_train * w, y_train)
            test_accuracy = accuracy_score(y_test, lr.predict(X_test))
            acc_parallel_i.append(test_accuracy)

        acc_parallel_k.append(acc_parallel_i)
        time_parallel_k.append(time_parallel_i)

    acc_parallel.append(acc_parallel_k)
    time_parallel.append(time_parallel_k)

print(f"Acc: {acc_parallel}")
print(f"Time: {time_parallel}")

In [None]:
import os

# Bundle the raw accuracy and timing results of both experiments.
data = {
    "acc_sequential": acc_sequential,
    "time_sequential": time_sequential,
    "acc_parallel": acc_parallel,
    "time_parallel": time_parallel
}

# Create the output folder first: open() does not create missing directories,
# and failing here would happen only after the long experiment runs above.
os.makedirs("data", exist_ok = True)

with open('data/digits.pickle', 'wb') as file:
    pickle.dump(data, file, protocol = pickle.HIGHEST_PROTOCOL)

## Plot

In [None]:
# Read back the results written by the save cell (a locally produced file).
with open("data/digits.pickle", 'rb') as file:
    data = pickle.load(file)

# Convert each nested result list to an array for the plotting cell.
acc_sequential, time_sequential, acc_parallel, time_parallel = (
    np.array(data[key])
    for key in ("acc_sequential", "time_sequential", "acc_parallel", "time_parallel")
)

In [None]:
plt.rcParams.update({'font.size': 22})

fig = plt.figure(figsize = (20, 7))

# Left half: accuracy. Right half: two stacked timing axes sharing the x-axis,
# drawn as one axis with a break between them.
ax12 = fig.add_subplot(121)
ax13 = fig.add_subplot(222)
ax14 = fig.add_subplot(224, sharex = ax13)

# Medians over the repetitions (axis 0).
median_acc_sequential = np.median(acc_sequential, axis = 0)
median_acc_parallel = np.median(acc_parallel, axis = 0)
median_time_sequential = np.median(time_sequential, axis = 0)
median_time_parallel = np.median(time_parallel, axis = 0)

# One line style per method, shared by the accuracy and timing curves.
sequential_style = dict(label = 'Sequential', linestyle = "solid", linewidth = 2, color = 'black')
random_style = dict(label = 'Random split', linestyle = "dashed", linewidth = 2, color = 'dimgray')
ml_style = dict(label = 'ML split', linestyle = "solid", marker = "o", linewidth = 2, color = 'maroon')

# Accuracy curves; column 0 of the parallel results is the random split, column 1 the ML split.
ax12.plot(coreset_sizes, median_acc_sequential, **sequential_style)
ax12.plot(coreset_sizes, median_acc_parallel[:, 0], **random_style)
ax12.plot(coreset_sizes, median_acc_parallel[:, 1], **ml_style)

# Hide the facing spines of the two timing axes to suggest a broken y-axis.
ax13.spines['bottom'].set_visible(False)
ax13.xaxis.tick_top()
ax13.tick_params(labeltop = False)
ax14.spines['top'].set_visible(False)
ax14.ticklabel_format(useOffset=False)

ax12.set_xlabel("Coreset size")
ax14.set_xlabel("Coreset size")

fig.text(0.06, 0.5, 'Accuracy', va='center', rotation='vertical')
fig.text(0.49, 0.5, 'Seconds', va='center', rotation='vertical')

# Short diagonal marks at the break, drawn in axes coordinates.
d = .015
kwargs = dict(transform=ax13.transAxes, color='k', clip_on=False)
ax13.plot((-d, +d), (-d, +d), **kwargs)        # lower-left corner of the upper axes
ax13.plot((1 - d, 1 + d), (-d, +d), **kwargs)  # lower-right corner of the upper axes

kwargs.update(transform=ax14.transAxes)  # switch to the lower axes
ax14.plot((-d, +d), (1 - d, 1 + d), **kwargs)  # upper-left corner of the lower axes
ax14.plot((1 - d, 1 + d), (1 - d, 1 + d), **kwargs)  # upper-right corner of the lower axes

# The legend is created before the timing curves are added, so it is built
# while only the accuracy curves carry labels.
fig.legend()
fig.suptitle('Classification')

# Timing curves: sequential on the upper axes, both parallel variants on the lower axes.
ax13.plot(coreset_sizes, median_time_sequential, **sequential_style)
ax14.plot(coreset_sizes, median_time_parallel[:, 0], **random_style)
ax14.plot(coreset_sizes, median_time_parallel[:, 1], **ml_style)

for axis in (ax12, ax13, ax14):
    axis.grid()

plt.savefig("plots/digits.eps")

plt.show()