In [1]:
import whynot as wn
import numpy as np
from sklearn.svm import SVC
from whynot import repeated_classification
import matplotlib.pyplot as plt
%matplotlib inline

# Two Gaussians

Visualize the generated distribution. It is intended to look like the figure on [page 3](https://arxiv.org/pdf/1806.08010.pdf) of Hashimoto et al., "Fairness Without Demographics in Repeated Loss Minimization".

In [None]:
def graph_two_gaussians(features, labels, params=None):
    """Scatter-plot labelled points and, optionally, a linear decision boundary.

    Args:
        features: array of points; the rows selected by each label mask are
            transposed and unpacked into the x and y arguments of
            ``plt.scatter`` (assumed shape ``(n, 2)`` -- TODO confirm against
            the simulator).
        labels: array expected to hold 0/1 values. Rows with a non-zero label
            are drawn as black ``+`` markers; rows whose label is not 1 are
            drawn as red ``_`` markers.
        params: optional sequence ``(a, b, c)`` describing the line
            ``a * x + b * y + c = 0``, drawn in blue. ``None`` (the default)
            draws no line.
    """
    plt.figure(figsize=(9, 6))

    positive = labels.astype(bool)
    negative = (1 - labels).astype(bool)
    plt.scatter(*features[positive].T, marker='+', color='k')
    plt.scatter(*features[negative].T, marker='_', color='r')

    xlims = np.array([-3, 3])
    if params is not None:
        a, b, c = params[0], params[1], params[2]
        if b != 0:
            # Solve a*x + b*y + c = 0 for y at both ends of the x-range.
            plt.plot(xlims, -a / b * xlims - c / b, color='b')
        elif a != 0:
            # b == 0 makes the boundary vertical (x = -c / a); the slope form
            # above would divide by zero, so draw it as a vertical line.
            plt.axvline(-c / a, color='b')
        # a == b == 0 does not describe a line, so nothing is drawn.

    plt.xlim(xlims)
    plt.ylim((-2, 2))

    plt.show()


# Seed the generator so the sampled initial state, and therefore the printed
# values and the figure, are identical on every Restart & Run All.
init_rng = np.random.RandomState(0)
init = repeated_classification.experiments.sample_initial_states_gaussians(init_rng)
print('init:\n' + str(init))

graph_two_gaussians(init.features, init.labels)

Run the experiment to see its progress at each step.

In [None]:
# Run the two-Gaussians repeated-classification experiment.
# NOTE(review): the argument 1 is presumably the number of rollouts/samples to
# generate -- confirm against the whynot experiment `run` signature.
repeated_classification.TwoGaussiansExperiment.run(1)

Fitting with `scipy.optimize.minimize` generally gives a slightly better fit than a linear SVM.

In [2]:
config = repeated_classification.experiments.construct_config_gaussians()
# Fixed seed: the sampled data, and every count printed by this cell, is then
# the same on each Restart & Run All. Change or remove the seed to resample.
rng = np.random.RandomState(0)

POPULATIONS = [3000, 3000]  # modify

# Draw POPULATIONS[k] samples from the k-th group distribution. zip() pairs
# the two sequences in order, so the rng is consumed group by group.
group_samples = [
    dist(pop, rng)
    for pop, dist in zip(POPULATIONS, config.group_distributions)
]

# Stack the per-group draws into one feature array and one label array.
features = np.concatenate([features_k for features_k, _ in group_samples], axis=0)
labels = np.concatenate([labels_k for _, labels_k in group_samples], axis=0)

# Linear SVM baseline: one model per regularisation strength C.
c_values = [0.01, 0.1, 1, 10]

models = []
for c_value in c_values:
    svm = SVC(C=c_value, kernel='linear')
    models.append(svm.fit(features, labels))

# Pack each fitted model into one flat vector: weights first, intercept last.
params = [
    np.concatenate([svm.coef_.ravel(), svm.intercept_.ravel()])
    for svm in models
]

# Number of training points each parameter vector classifies correctly.
# NOTE(review): the third argument (None) is passed through unchanged; its
# meaning is not visible here -- confirm against linear_classifier_2d.
accuracies = []
for param_vector in params:
    predictions = repeated_classification.experiments.linear_classifier_2d(
        features, param_vector, None
    )
    accuracies.append(np.count_nonzero(labels == predictions))

for index, c_value in enumerate(c_values):
    print('linear SVM with C = {: >6.2f} :  {:>6d} / {} correct'.format(
        c_value, accuracies[index], len(labels)))

# Empirical risk minimisation (ERM) with scipy.optimize.minimize.
# NOTE(review): [0, 1, 0] looks like the initial parameter guess and
# method='Powell' is presumably forwarded to the optimiser -- confirm against
# empirical_risk_minimization.
rc_experiments = repeated_classification.experiments
erm_params = rc_experiments.empirical_risk_minimization(
    rc_experiments.linear_classifier_2d,
    rc_experiments.zero_one_loss,
    features,
    labels,
    [0, 1, 0],
    rng,
    method='Powell',
)

# Count the training points the fitted parameters classify correctly.
erm_predictions = rc_experiments.linear_classifier_2d(features, erm_params, None)
erm_acc = np.count_nonzero(labels == erm_predictions)
print('scipy.optimize.minimize    :  {: >6d} / {} correct'.format(erm_acc, len(labels)))

linear SVM with C =   0.01 :    4865 / 6000 correct
linear SVM with C =   0.10 :    4866 / 6000 correct
linear SVM with C =   1.00 :    4865 / 6000 correct
linear SVM with C =  10.00 :    4863 / 6000 correct
scipy.optimize.minimize    :    4865 / 6000 correct


# Median Estimation

In [None]:
def graph_median(pop1, pop2, rng=None):
    """Plot a 50-bin histogram of samples drawn from two normal distributions.

    Args:
        pop1: number of samples drawn from a normal distribution with mean -1
            and standard deviation 0.2.
        pop2: number of samples drawn from a normal distribution with mean 1
            and standard deviation 0.2.
        rng: optional random generator exposing ``normal(loc, scale, size)``
            (e.g. ``np.random.RandomState``). When ``None`` the global
            ``np.random`` state is used, which is the previous behaviour.
    """
    sampler = np.random if rng is None else rng
    data = np.concatenate(
        [sampler.normal(-1, 0.2, pop1), sampler.normal(1, 0.2, pop2)],
        axis=0,
    )
    plt.figure(figsize=(9, 6))
    plt.hist(data, bins=50)
    plt.show()


# Seeded generator so the histogram is identical on every Restart & Run All.
graph_median(1000, 500, rng=np.random.RandomState(0))

In [None]:
# Run the median-estimation repeated-classification experiment.
# NOTE(review): the argument 1 is presumably the number of rollouts/samples to
# generate -- confirm against the whynot experiment `run` signature.
repeated_classification.MedianEstimationExperiment.run(1)