In [2]:
import pandas as pd
import numpy as np
import numpy.random_intel as rng
import plotly.offline as plt
from tqdm.notebook import tqdm
from sklearn.utils import shuffle
from sklearn.preprocessing import MinMaxScaler

In [3]:

# Relative precision used as the noise scale for each floating-point format:
# 2**-k, where k is the number of explicitly stored fraction bits
# (float32: 23, float16: 10, bfloat16: 7).
# NOTE(review): the `_u` suffix suggests "unit roundoff"; under
# round-to-nearest that is usually half of these values — confirm which
# convention is intended.
f32_u = 2.0 ** -23
f16_u = 2.0 ** -10
bf16_u = 2.0 ** -7

In [4]:
# Read the HIGGS sample (no header row), discard column 28 (presumably the
# target column — confirm against the dataset layout), shuffle the rows and
# rescale the remaining columns with a default MinMaxScaler.
# NOTE(review): `dataset_root` is not defined in the visible cells; it has to
# be set before this cell runs.
higgs = MinMaxScaler().fit_transform(
    shuffle(
        pd.read_csv(dataset_root + 'classifiers/higgs/higgs_train1m.csv',
                    header=None)
        .drop(columns=[28])
    )
)

In [5]:
# Read the HEPMASS sample (no header row), discard columns 27 and 28
# (presumably non-feature columns — confirm against the dataset layout),
# shuffle the rows and rescale the remaining columns with a default
# MinMaxScaler.
# NOTE(review): `dataset_root` is not defined in the visible cells; it has to
# be set before this cell runs.
hepmass = MinMaxScaler().fit_transform(
    shuffle(
        pd.read_csv(dataset_root + 'workloads/hepmass/dataset/hepmass_100t_train.csv',
                    header=None)
        .drop(columns=[27, 28])
    )
)

In [6]:
# Read the SUSY sample (no header row), discard column 17 (presumably the
# target column — confirm against the dataset layout), shuffle the rows and
# rescale the remaining columns with a default MinMaxScaler.
# NOTE(review): `dataset_root` is not defined in the visible cells; it has to
# be set before this cell runs.
susy = MinMaxScaler().fit_transform(
    shuffle(
        pd.read_csv(dataset_root + 'workloads/susy/dataset/susy_train_small.csv',
                    header=None)
        .drop(columns=17)
    )
)

In [7]:
# Read the road-network sample (no header row), discard column 3 (presumably
# a non-feature column — confirm against the dataset layout), shuffle the
# rows and rescale the remaining columns with a default MinMaxScaler.
# NOTE(review): `dataset_root` is not defined in the visible cells; it has to
# be set before this cell runs.
road_net = MinMaxScaler().fit_transform(
    shuffle(
        pd.read_csv(dataset_root + 'workloads/road_network/dataset/road_network_20t_cluster.csv',
                    header=None)
        .drop(columns=3)
    )
)

In [8]:
# Read the MNIST sample (no header row), discard column 784 (presumably the
# label column — confirm against the dataset layout), shuffle the rows and
# rescale the remaining columns with a default MinMaxScaler.
# NOTE(review): `dataset_root` is not defined in the visible cells; it has to
# be set before this cell runs.
mnist = MinMaxScaler().fit_transform(
    shuffle(
        pd.read_csv(dataset_root + 'workloads/mnist/dataset/mnist_train.csv',
                    header=None)
        .drop(columns=784)
    )
)

In [79]:
def evaluate_dbscan(X, distances_count, u, max_eps, eps_count=2000):
    """Estimate how often rounding noise flips a DBSCAN neighbourhood test.

    Random pairs of rows are sampled from ``X`` and their squared Euclidean
    distances are computed.  For every threshold on an evenly spaced grid the
    distances are perturbed with zero-mean Gaussian noise of standard
    deviation ``n_features * u``, and the fraction of pairs whose
    ``distance <= eps`` verdict changes is recorded.

    The threshold is compared with the *squared* distance, so ``eps`` is
    expressed in squared-distance units.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data matrix; converted with ``np.asarray`` so rows are indexed
        positionally even when a DataFrame is passed.
    distances_count : int
        Number of random pairs to draw.  Pairs with zero distance are
        discarded, so fewer pairs may contribute to the estimate.
    u : float
        Per-feature noise scale (the noise standard deviation is
        ``n_features * u``).
    max_eps : float
        Upper end of the threshold grid; the grid starts at 0.
    eps_count : int, default 2000
        Number of thresholds on the grid.

    Returns
    -------
    eps_space : numpy.ndarray of shape (eps_count,)
        The evaluated thresholds.
    errors : numpy.ndarray of shape (eps_count,)
        Fraction of sampled pairs whose verdict flipped at each threshold;
        all NaN when no pair with a non-zero distance was sampled.
    """
    X = np.asarray(X)
    feature_count = X.shape[1]
    eps_space = np.linspace(0, max_eps, eps_count)

    I_1 = rng.choice(X.shape[0], size=distances_count)
    I_2 = rng.choice(X.shape[0], size=distances_count)
    distances = np.sum((X[I_1] - X[I_2]) ** 2, axis=1)
    # Identical pairs carry no information about the threshold test.
    distances = distances[distances > 0.0]

    if distances.size == 0:
        # Nothing to average: report "undefined" without emitting one
        # mean-of-empty-slice warning per threshold.
        return eps_space, np.full(eps_space.shape, np.nan)

    noise_scale = feature_count * u
    errors = np.empty(eps_space.shape)
    for i, eps in enumerate(tqdm(eps_space)):
        # Fresh noise for every threshold keeps the estimates independent.
        delta = rng.normal(scale=noise_scale, size=distances.size)
        # An error is a pair whose in/out-of-neighbourhood verdict changes
        # once the noise is added.  The noise is symmetric, so this has the
        # same distribution as `delta / (distances - eps) > 1`, but it never
        # divides by zero when a distance coincides with the threshold.
        flipped = (distances <= eps) != (distances + delta <= eps)
        errors[i] = np.mean(flipped)
    return eps_space, errors

def plot_dbscan_errors(name, X, distance_count, max_eps=5):
    """Plot the simulated DBSCAN error rate against eps for three precisions.

    ``evaluate_dbscan`` is run once per noise scale (``f32_u``, ``f16_u``,
    ``bf16_u``, in that order) and each resulting curve is drawn as one
    trace, with the error rate expressed in percent.

    Parameters
    ----------
    name : str
        Dataset label; shown upper-cased as the figure title.
    X : array-like
        Data matrix forwarded to ``evaluate_dbscan``.
    distance_count : int
        Number of random pairs forwarded to ``evaluate_dbscan``.
    max_eps : float, default 5
        Largest threshold forwarded to ``evaluate_dbscan``.

    Returns
    -------
    Whatever ``plt.iplot`` returns (``plt`` is ``plotly.offline`` here).
    """
    precisions = (
        ('float32', f32_u),
        ('float16', f16_u),
        ('bfloat16', bf16_u),
    )

    traces = []
    for label, unit in precisions:
        eps_values, error_rate = evaluate_dbscan(X, distance_count, unit, max_eps)
        traces.append({'x': eps_values, 'y': error_rate * 100, 'name': label})

    layout = {
        'title': name.upper(),
        'legend': {'x': 0.8, 'y': 0.95, 'font': {'size': 14}},
        'xaxis': {'title': 'Epsilon'},
        'yaxis': {'title': 'Error rate, %'},
        'width': 600,
        'height': 300,
        'margin': {'l': 35, 't': 25, 'b': 35, 'r': 10},
    }
    return plt.iplot({'data': traces, 'layout': layout})

In [94]:
def evaluate_knn(X, distances_count, u):
    """Estimate how often noise exceeds the gap between two pair distances.

    Two independent sets of random row pairs are sampled from ``X`` and their
    squared Euclidean distances are subtracted element-wise.  Zero gaps are
    discarded.  Each remaining gap is compared with zero-mean Gaussian noise
    of standard deviation ``n_features * u``; a sample counts as an error
    when ``noise / gap > 1``.

    Parameters
    ----------
    X : numpy.ndarray of shape (n_samples, n_features)
        Data matrix whose rows are indexed positionally.
    distances_count : int
        Number of pairs drawn for each of the two distance sets.
    u : float
        Per-feature noise scale.

    Returns
    -------
    float
        Fraction of non-zero gaps counted as errors (NaN when every sampled
        gap is zero).
    """
    row_count, feature_count = X.shape[0], X.shape[1]

    def sample_squared_distances():
        # Two index draws per call, so the overall RNG call order stays
        # left, right, left, right.
        left = rng.choice(row_count, size=distances_count)
        right = rng.choice(row_count, size=distances_count)
        return np.sum((X[left] - X[right]) ** 2, axis=1)

    first = sample_squared_distances()
    second = sample_squared_distances()

    gap = first - second
    gap = gap[gap != 0]  # a zero gap has no ordering to disturb

    noise = rng.normal(scale=feature_count * u, size=len(gap))
    return np.mean(noise / gap > 1)

def plot_knn_errors(name, X, distance_count):
    """Print the simulated k-NN error rate for three precisions.

    Despite its name this function draws nothing: it runs ``evaluate_knn``
    with the ``f32_u``, ``f16_u`` and ``bf16_u`` noise scales (in that order)
    and prints one line per precision, with the rate in percent to two
    decimals.

    Parameters
    ----------
    name : str
        Dataset label; printed upper-cased at the start of each line.
    X : array-like
        Data matrix forwarded to ``evaluate_knn``.
    distance_count : int
        Number of random pairs forwarded to ``evaluate_knn``.
    """
    # All three estimates are computed before anything is printed.
    f32_rate, f16_rate, bf16_rate = (
        evaluate_knn(X, distance_count, unit) for unit in (f32_u, f16_u, bf16_u)
    )
    dataset = name.upper()
    print(f'{dataset} \t f32 = {f32_rate * 100:.2f}%')
    print(f'{dataset} \t f16 = {f16_rate * 100:.2f}%')
    print(f'{dataset} \t bf16 = {bf16_rate * 100:.2f}%')


In [95]:
# Report the simulated k-NN error rates for every dataset, using 100,000
# sampled pairs each.
plot_knn_errors('higgs', higgs, 100_000)
plot_knn_errors('hepmass', hepmass, 100_000)
plot_knn_errors('susy', susy, 100_000)
plot_knn_errors('road_net', road_net, 100_000)
plot_knn_errors('mnist', mnist, 100_000)

HIGGS 	 f32 = 0.00%
HIGGS 	 f16 = 0.54%
HIGGS 	 bf16 = 4.27%
HEPMASS 	 f32 = 0.00%
HEPMASS 	 f16 = 0.48%
HEPMASS 	 bf16 = 3.76%
SUSY 	 f32 = 0.00%
SUSY 	 f16 = 0.82%
SUSY 	 bf16 = 6.87%
ROAD_NET 	 f32 = 0.00%
ROAD_NET 	 f16 = 0.40%
ROAD_NET 	 bf16 = 3.19%
MNIST 	 f32 = 0.00%
MNIST 	 f16 = 0.63%
MNIST 	 bf16 = 4.68%


In [74]:
# DBSCAN error-rate curves for HIGGS: 100,000 sampled pairs, thresholds up to 7.
plot_dbscan_errors('higgs', higgs, 100_000, max_eps=7)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [75]:
# DBSCAN error-rate curves for HEPMASS: 100,000 sampled pairs, thresholds up to 8.
plot_dbscan_errors('HEPMASS', hepmass, 100_000, max_eps=8)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [76]:
# DBSCAN error-rate curves for SUSY: 100,000 sampled pairs, thresholds up to 3.
plot_dbscan_errors('SUSY', susy, 100_000, max_eps=3)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [78]:
# DBSCAN error-rate curves for the road network: 100,000 sampled pairs,
# thresholds up to 0.6.
plot_dbscan_errors('ROAD_NETWORK', road_net, 100_000, max_eps=0.6)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




In [62]:
# DBSCAN error-rate curves for MNIST: 100,000 sampled pairs, thresholds up to 200.
# The dataset label is the first positional argument of plot_dbscan_errors;
# without it `mnist` was bound to `name` and the call failed with a TypeError
# (missing `distance_count`) against the current signature.
plot_dbscan_errors('MNIST', mnist, 100000, max_eps=200)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=2000.0), HTML(value='')))


