In [1]:
import matplotlib.pyplot as plt
import numpy as np
from SALib.analyze import sobol
from SALib.sample import saltelli
from matplotlib import offsetbox
from sklearn import manifold
from sklearn.neighbors import NearestNeighbors
from sklearn import datasets

In [2]:
def plot_embedding(X, title=None, labels=None):
    """Draw a 2-D embedding as a scatter of coloured label strings.

    Each axis of ``X`` is rescaled to the range [0, 1] and every point is
    rendered as the text of its label in a new matplotlib figure.

    Parameters
    ----------
    X : array-like, at least two columns
        Embedded coordinates; only columns 0 and 1 are plotted.
    title : str, optional
        Figure title; omitted when ``None``.
    labels : sequence, optional
        One label per row of ``X``. When ``None`` the notebook-level ``y``
        is used, which is what this function always did before the
        parameter existed.
    """
    if labels is None:
        labels = y  # backward-compatible fallback to the notebook global

    x_min, x_max = np.min(X, 0), np.max(X, 0)
    span = x_max - x_min
    # A constant axis has zero span; divide by 1 instead so the points land
    # at 0 on that axis rather than becoming NaN.
    span = np.where(span == 0, 1.0, span)
    X = (X - x_min) / span

    plt.figure()
    for i in range(X.shape[0]):
        plt.text(X[i, 0], X[i, 1], str(labels[i]),
                 color=plt.cm.Set1((labels[i] + 1) / 10.),
                 fontdict={'weight': 'bold', 'size': 9})

    # The former thumbnail pass only accumulated point distances for image
    # boxes whose drawing code was commented out, so it produced no output
    # and has been removed.
    plt.xticks([])
    plt.yticks([])
    if title is not None:
        plt.title(title)

In [4]:
def transform_digits(images):
    """Flatten every image in ``images`` and stack the results into one array.

    Each element is flattened with its own ``.flatten()`` method; the
    flattened rows are then combined with ``np.asarray``.
    """
    return np.asarray([picture.flatten() for picture in images])


def calculate_fit(label_group):
    """Score how many labels in a group agree with the group's first label.

    The first element is the reference label; every later element equal to
    it contributes 0.25 to the returned float.
    """
    reference = label_group[0]
    agreeing = sum(1 for other in label_group[1:] if other == reference)
    return 0.25 * agreeing


def calculate_nn_metric(X, y, neighs) -> float:
    """Sum ``calculate_fit`` over the nearest-neighbour label groups of ``X``.

    A ``NearestNeighbors`` model with ``neighs`` neighbours is fitted on
    ``X`` and queried with ``X`` itself; for every row of returned indices
    the matching labels ``y[row]`` are scored by ``calculate_fit`` and the
    scores are added up.
    """
    model = NearestNeighbors(n_neighbors=neighs)
    model.fit(X)
    # kneighbors returns (distances, indices); only the indices are needed.
    # NOTE(review): presumably the first index in each row is the query
    # point itself, so its label is the reference in calculate_fit — confirm.
    neighbour_rows = model.kneighbors(X)[1]
    return sum(calculate_fit(y[row]) for row in neighbour_rows)


def perform_embedding(X, y, perplexity, early_exaggeration, learning_rate, n_iter, init, random_state, angle) -> np.ndarray:
    """Embed ``X`` into two dimensions with t-SNE for one parameter sample.

    Parameters
    ----------
    X : array-like
        Data passed to ``TSNE.fit_transform``.
    y : any
        Unused; kept so callers can pass labels positionally.
    perplexity, early_exaggeration, learning_rate, angle : float
        Forwarded unchanged to ``manifold.TSNE``.
    n_iter : float
        Truncated with ``int()`` before being forwarded.
    init : float
        Continuous switch: values below 0.5 select ``'random'``
        initialisation, everything else selects ``'pca'``.
    random_state : float
        Truncated with ``int()`` and used as the t-SNE seed.

    Returns
    -------
    np.ndarray
        The result of ``TSNE.fit_transform(X)`` (the embedded points, not a
        scalar as the previous ``-> float`` annotation claimed).
    """
    tsne = manifold.TSNE(
        n_components=2,
        perplexity=perplexity,
        early_exaggeration=early_exaggeration,
        learning_rate=learning_rate,
        n_iter=int(n_iter),
        init=('random' if init < 0.5 else 'pca'),
        random_state=int(random_state),
        angle=angle,
    )
    return tsne.fit_transform(X)

In [5]:
# Sensitivity-analysis problem definition: one entry in `names` and one
# [lower, upper] pair in `bounds` per argument that perform_embedding takes
# after X and y, in the same order.
problem = dict(
    num_vars=7,
    names=[
        'perplexity',
        'early_exaggeration',
        'learning_rate',
        'n_iter',
        'init',
        'random_state',
        'angle',
    ],
    bounds=[
        [5.0, 50.0],     # perplexity
        [1.0, 100.0],    # early_exaggeration
        [10.0, 1000.0],  # learning_rate
        [250, 2500],     # n_iter (cast to int inside perform_embedding)
        [0.0, 1.0],      # init (< 0.5 -> 'random', otherwise 'pca')
        [0.0, 1000.0],   # random_state (cast to int inside perform_embedding)
        [0.0, 1.0],      # angle
    ],
)

In [6]:
# Load the digits dataset, flatten each image into one feature row (X) and
# keep the dataset's targets as the labels (y).
digits = datasets.load_digits()
X, y = transform_digits(digits.images), digits.target

# Row and column counts of the flattened feature matrix.
n_samples, n_features = X.shape

In [7]:
# Base sample count handed to the Saltelli sampler.
# NOTE(review): the log output saved in this notebook reports 160 samples,
# which presumably came from a run with a larger N — confirm before
# comparing results against that log.
N = 1
# Parameter sets to evaluate; each row is later unpacked into
# perform_embedding in the order given by problem['names'].
samples = saltelli.sample(problem, N)

In [8]:
import multiprocessing
from multiprocessing import Pool, Value
import time

In [None]:
def embedding(sample):
    """Compute the t-SNE embedding for one parameter sample and log progress.

    Used as the worker function for ``Pool.map``. Reads the module-level
    ``counter`` (a shared ``multiprocessing.Value``), ``samples``, ``X`` and
    ``y``.

    Parameters
    ----------
    sample : sequence of float
        One row of ``samples``; unpacked positionally into
        ``perform_embedding`` after ``X`` and ``y``.

    Returns
    -------
    Whatever ``perform_embedding`` returns for this sample.
    """
    # Increment under the lock so every task gets a unique step number.
    with counter.get_lock():
        counter.value += 1
        step = counter.value
    total = len(samples)
    formatted = ["{0:0.2f}".format(value) for value in sample]
    print('Starting to calculate {} of {} (sample {})'.format(step, total, formatted))
    # perf_counter is a wall-clock timer. process_time (used previously)
    # reports CPU time of the process, which is not the elapsed duration
    # that the message below claims to show.
    started = time.perf_counter()
    embed = perform_embedding(X, y, *sample)
    elapsed_time = time.perf_counter() - started
    print('Calculation complete for {} of {} in {} s.'.format(step, total, elapsed_time))
    return embed

Starting to calculate 1 of 160
Starting to calculate 2 of 160
Starting to calculate 3 of 160
Starting to calculate 4 of 160
Starting to calculate 5 of 160
Starting to calculate 6 of 160
Starting to calculate 7 of 160
Starting to calculate 8 of 160
Calculation complete for 3 of 160
Starting to calculate 9 of 160
Calculation complete for 4 of 160
Starting to calculate 10 of 160
Calculation complete for 10 of 160
Starting to calculate 11 of 160
Calculation complete for 6 of 160
Starting to calculate 12 of 160
Calculation complete for 7 of 160
Starting to calculate 13 of 160
Calculation complete for 5 of 160
Starting to calculate 14 of 160
Calculation complete for 11 of 160
Starting to calculate 15 of 160
Calculation complete for 1 of 160
Starting to calculate 16 of 160
Calculation complete for 2 of 160
Starting to calculate 17 of 160
Calculation complete for 8 of 160
Starting to calculate 18 of 160
Calculation complete for 14 of 160
Starting to calculate 19 of 160
Calculation complete for

In [None]:
# Shared integer counter (typecode 'i', starting at 0) that `embedding`
# increments to number its progress messages. It is created before the pool
# so that it exists when the workers start.
# NOTE(review): workers reach `counter` as a global, which presumably relies
# on them inheriting it from this process (fork start method) — confirm on
# platforms that spawn workers instead.
counter = Value('i', 0)

# Run `embedding` on every row of `samples`, with one worker process per CPU
# reported by cpu_count(); results come back in the order of `samples`.
with Pool(processes=multiprocessing.cpu_count()) as pool:
    embeddings = pool.map(embedding, samples)

In [None]:
def get_metric(args):
    """Compute the nearest-neighbour metric for one embedding and log progress.

    Used as the worker function for ``Pool.map``. Reads the module-level
    ``counter`` (a shared ``multiprocessing.Value``), ``embeddings`` and
    ``y``.

    Parameters
    ----------
    args : tuple
        ``(embedding, neighs)``: the embedded points and the number of
        neighbours handed to ``calculate_nn_metric``.

    Returns
    -------
    The value returned by ``calculate_nn_metric`` for this embedding.
    """
    embedded_points, neighs = args
    # Increment under the lock so every task gets a unique step number.
    with counter.get_lock():
        counter.value += 1
        step = counter.value
    total = len(embeddings)
    print('Starting to calculate {} of {}'.format(step, total))
    # perf_counter is a wall-clock timer. process_time (used previously)
    # reports CPU time of the process, which is not the elapsed duration
    # that the message below claims to show.
    started = time.perf_counter()
    metric = calculate_nn_metric(embedded_points, y, neighs)
    elapsed_time = time.perf_counter() - started
    print('Calculation complete for {} of {} in {} s.'.format(step, total, elapsed_time))
    return metric

In [None]:

# Neighbourhood sizes for which the metric is evaluated.
neighs_nums = [5, 10, 20]
# One list of metric values per entry of neighs_nums, in the same order;
# each inner list follows the order of `embeddings`.
Ys = []
for neighs in neighs_nums:
    # Fresh counter per pass so progress numbering restarts at 1; it is
    # replaced before the pool is created so the new workers use it.
    counter = Value('i', 0)
    print("############# Calculation of metrics for {} neighs ###################".format(neighs))
    # A new pool is started for every neighbourhood size.
    with Pool(processes=multiprocessing.cpu_count()) as pool:
        Ys.append(pool.map(get_metric, [(embedding, neighs) for embedding in embeddings]))

In [None]:
# Inspect the metric values computed for the first entry of neighs_nums.
Ys[0]

In [None]:
# Model outputs for the sensitivity analysis. `Y` was never assigned anywhere
# in the notebook, so this cell (and the cells that use Y afterwards) failed
# on a fresh kernel.
# NOTE(review): using Ys[0] (the first entry of neighs_nums) is an assumption
# based on the preceding cell inspecting Ys[0] — pick another index if a
# different neighbourhood size is intended.
Y = np.asarray(Ys[0])
Y

In [None]:
# Sobol sensitivity analysis of the outputs Y with respect to the parameters
# declared in `problem`.
sensitivity = sobol.analyze(problem, Y)

In [None]:
# Presumably the first-order sensitivity indices, one per parameter —
# confirm against SALib's documented result keys.
sensitivity["S1"]    

In [None]:
# Presumably the total-order sensitivity indices, one per parameter —
# confirm against SALib's documented result keys.
sensitivity['ST']


In [None]:
# Print the parameter values (two decimals each) of the sample whose metric
# is the smallest. The loop previously indexed the undefined name `sample`;
# the sampled parameter matrix is called `samples`.
# NOTE(review): calculate_fit rewards neighbours that share a label, so
# argmin selects the lowest-scoring sample — confirm argmax is not intended.
index = np.argmin(Y)
for s in samples[index]:
    print('%.2f' % s)