In [3]:
# Code source: Sebastian Curi, Andreas Krause and Fanny Yang, based on Jaques Grobler (sklearn demos).
# License: BSD 3 clause

# We start importing some modules and running some magic commands
%matplotlib inline
%reload_ext autoreload
%load_ext autoreload
%autoreload 2

# General math and plotting modules.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from scipy.special import erfinv

# Project files.
from utilities.util import gradient_descent
from utilities.classifiers import Logistic
from utilities.regressors import TStudent
from utilities.regularizers import L2Regularizer
from utilities.load_data import polynomial_data, linear_separable_data
from utilities import plot_helpers

# Widget and formatting modules
import IPython
import ipywidgets
from ipywidgets import interact, interactive, interact_manual, fixed
from matplotlib import rcParams
# If the figures are not displayed nicely in your browser, change the following line.
rcParams['figure.figsize'] = (10, 5)
rcParams['font.size'] = 16

# Machine Learning library. 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn import datasets
from sklearn.linear_model import SGDRegressor, Ridge, LogisticRegression
from sklearn.model_selection import cross_val_score


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
def get_regression_dataset(dataset, X=None, n_samples=200, noise=0, w=None):
    """Generate a synthetic one-dimensional regression problem.

    Parameters
    ----------
    dataset : str
        Which target function to use:

        * ``'cos'``: ``cos(1.5 * pi * x)``
        * ``'sinc'``: ``x * sin(1.5 * pi * x)``
        * ``'linear'``: degree-1 polynomial features of ``x`` times ``w[:2]``
        * ``'linear-features'``: polynomial features of degree ``len(w) - 1``
          times ``w``
    X : array-like of shape (n,), optional
        Input locations. When omitted, ``n_samples`` standard-normal points
        are drawn.
    n_samples : int, default 200
        Number of inputs to draw. Only used when ``X`` is None.
    noise : float, default 0
        Scale of the standard-normal noise added to the targets.
    w : array-like, optional
        Coefficients; required by the two linear variants.

    Returns
    -------
    X, Y : tuple
        The inputs and the noisy targets. For the two linear variants ``X``
        is returned as ``np.atleast_2d(X).T``; otherwise it is returned
        unchanged.

    Raises
    ------
    ValueError
        If ``dataset`` is not a known name, or a linear variant is requested
        without ``w``.
    """
    known = ('cos', 'sinc', 'linear', 'linear-features')
    # Validate before touching the random generator so a bad call neither
    # consumes random numbers nor ends in an UnboundLocalError at `return`.
    if dataset not in known:
        raise ValueError(
            "Unknown regression dataset {!r}; expected one of {}".format(dataset, known))
    if dataset in ('linear', 'linear-features') and w is None:
        raise ValueError("Dataset {!r} requires the coefficient vector `w`".format(dataset))

    if X is None:
        X = np.random.randn(n_samples)
    else:
        # No copy for ndarray input; lets callers pass plain sequences too.
        X = np.asarray(X)

    if dataset == 'cos':
        Y = np.cos(1.5 * np.pi * X) + noise * np.random.randn(X.shape[0])

    elif dataset == 'sinc':
        Y = X * np.sin(1.5 * np.pi * X) + noise * np.random.randn(X.shape[0])

    elif dataset == 'linear':
        X = np.atleast_2d(X).T
        Phi = PolynomialFeatures(degree=1, include_bias=True).fit_transform(X)
        Y = Phi @ w[:2] + noise * np.random.randn(X.shape[0])

    else:  # 'linear-features'
        X = np.atleast_2d(X).T
        Phi = PolynomialFeatures(degree=len(w) - 1, include_bias=True).fit_transform(X)
        Y = Phi @ w + noise * np.random.randn(X.shape[0])

    return X, Y
    

def get_classification_dataset(dataset, n_samples=200, noise=0.5):
    """Return a two-feature toy classification problem.

    Parameters
    ----------
    dataset : str
        One of ``'linear'``, ``'imbalanced'``, ``'2-blobs'``, ``'3-blobs'``,
        ``'4-blobs'``, ``'circles'``, ``'moons'`` or ``'iris'``.
    n_samples : int, default 200
        Number of samples requested. Not used by ``'iris'``.
    noise : float, default 0.5
        Forwarded to ``linear_separable_data``; only the ``'linear'`` and
        ``'imbalanced'`` datasets use it.

    Returns
    -------
    X, Y : tuple
        Features and labels. For ``'linear'`` and ``'imbalanced'`` the labels
        are transformed with ``(Y + 1) // 2`` (presumably mapping -1/+1 to
        0/1 -- confirm against ``linear_separable_data``). For ``'iris'``
        only the first two feature columns are kept.

    Raises
    ------
    ValueError
        If ``dataset`` is not one of the names listed above.
    """
    # Number of classes for each `make_classification` variant.
    blob_classes = {'2-blobs': 2, '3-blobs': 3, '4-blobs': 4}

    if dataset == 'linear':
        X, Y = linear_separable_data(n_samples, noise=noise, dim=2)
        Y = (Y + 1) // 2
    elif dataset == 'imbalanced':
        X, Y = linear_separable_data(n_samples, noise=noise, dim=2,
                                     num_negative=int(n_samples * 0.2))
        Y = (Y + 1) // 2
    elif dataset in blob_classes:
        X, Y = datasets.make_classification(n_classes=blob_classes[dataset], n_features=2,
                                            n_informative=2, n_redundant=0,
                                            n_clusters_per_class=1, n_samples=n_samples,
                                            random_state=8)
    elif dataset == 'circles':
        X, Y = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05)
    elif dataset == 'moons':
        X, Y = datasets.make_moons(n_samples=n_samples, noise=.05)
    elif dataset == 'iris':
        X, Y = datasets.load_iris(return_X_y=True)
        X = X[:, :2]
    else:
        # Previously an unknown name reached `return` with X and Y unbound.
        raise ValueError(
            "Unknown classification dataset {!r}; expected one of {}".format(
                dataset,
                ['linear', 'imbalanced', *blob_classes, 'circles', 'moons', 'iris']))

    return X, Y

# Probabilistic Classification (Logistic Regression)

In [14]:
# Figure size and font size used by the logistic-regression SGD demo.
rcParams.update({'figure.figsize': (20, 6), 'font.size': 22})

# Input widgets of the demo; they are bound to the arguments of
# `logistic_SGD` by the `interact_manual` call that follows the function.
num_points_w = ipywidgets.IntSlider(
    description='Number of samples:',
    value=300, min=30, max=1500, step=1,
    continuous_update=False,
    style={'description_width': 'initial'},
)
noise_w = ipywidgets.FloatSlider(
    description='Noise level:',
    value=0.1, min=0, max=1, step=0.01,
    readout_format='.2f',
    continuous_update=False,
    style={'description_width': 'initial'},
)
reg_w = ipywidgets.BoundedFloatText(
    description='Regularization:',
    value=0, min=0, max=1000, step=0.0001,
    continuous_update=False,
    style={'description_width': 'initial'},
)
batch_size_w = ipywidgets.IntSlider(
    description='Batch Size:',
    value=16, min=1, max=64, step=1,
    continuous_update=False,
    style={'description_width': 'initial'},
)
lr_w = ipywidgets.FloatLogSlider(
    description='Learning Rate:',
    value=0.3, min=-4, max=1, step=0.1,
    readout_format='.4f',
    continuous_update=False,
    style={'description_width': 'initial'},
)
num_iter_w = ipywidgets.IntSlider(
    description='Num Iter:',
    value=50, min=10, max=200, step=1,
    continuous_update=False,
    style={'description_width': 'initial'},
)
def logistic_SGD(dataset, num_points, noise, reg, batch_size, lr, num_iter):
    """Run SGD on a logistic classifier for a toy dataset and plot the result.

    A training set of ``num_points`` samples and a test set of
    ``int(0.1 * num_points)`` samples are drawn with
    ``get_classification_dataset``. Labels are rescaled with ``2 * Y - 1``
    and a column of ones is appended to inputs that have exactly two columns.
    The training set is shuffled, the optimiser is started from a seeded
    random weight vector of length 3, and the data plus the optimisation
    progress are drawn in two side-by-side subplots.

    Parameters
    ----------
    dataset : str
        Dataset name forwarded to ``get_classification_dataset``.
    num_points : int
        Number of training samples requested.
    noise : float
        Noise argument forwarded to ``get_classification_dataset``.
    reg : float
        Argument of ``L2Regularizer``.
    batch_size : int
        Mini-batch size; capped at the number of training samples.
    lr : float
        Passed to ``gradient_descent`` as ``'eta0'``.
    num_iter : int
        Passed to ``gradient_descent`` as ``'n_iter'``.
    """
    def _load(n):
        # Draw n samples, rescale the labels and append the column of ones.
        features, labels = get_classification_dataset(dataset, n, noise)
        labels = 2 * labels - 1
        if features.shape[1] == 2:
            bias = np.ones((features.shape[0], 1))
            features = np.concatenate((features, bias), axis=-1)
        return features, labels

    # DATASET (no seed is set before the data are drawn)
    X, Y = _load(num_points)
    Xtest, Ytest = _load(int(0.1 * num_points))

    order = np.arange(0, X.shape[0], 1)
    np.random.shuffle(order)
    X, Y = X[order], Y[order]

    # REGRESSION
    classifier = Logistic(X, Y)
    classifier.load_test_data(Xtest, Ytest)
    regularizer = L2Regularizer(reg)
    np.random.seed(42)  # seeds the initial weights drawn on the next line
    w0 = np.random.randn(3, )

    n_train = X.shape[0]
    sgd_opts = {'eta0': lr,
                'n_iter': num_iter,
                'batch_size': min(batch_size, n_train),
                'n_samples': n_train,
                'algorithm': 'SGD',
                }

    try:
        trajectory, sample_indexes = gradient_descent(w0, classifier, regularizer, sgd_opts)

        # PLOTS
        contour_plot = plt.subplot(121)
        error_plot = plt.subplot(122)

        # (features, labels, label value, marker, fill style, legend entry)
        groups = (
            (X, Y, 1, 'ro', 'full', '+ Train'),
            (X, Y, -1, 'bs', 'full', '- Train'),
            (Xtest, Ytest, 1, 'ro', 'none', '+ Test'),
            (Xtest, Ytest, -1, 'bs', 'none', '- Test'),
        )
        for features, labels, target, marker, fillstyle, legend in groups:
            rows = np.where(labels == target)[0]
            point_opts = {'marker': marker, 'fillstyle': fillstyle, 'label': legend, 'size': 8}
            plot_helpers.plot_data(features[rows, 0], features[rows, 1],
                                   fig=contour_plot, options=point_opts)

        progression_opts = {
            'contour_opts': {'n_points': 100, 'x_label': '$x$', 'y_label': '$y$',
                             'sgd_point': True, 'n_classes': 4},
            'error_opts': {'epoch': 5, 'x_label': '$t$', 'y_label': 'error'},
        }
        plot_helpers.classification_progression(X, Y, trajectory, sample_indexes, classifier,
                                                contour_plot=contour_plot,
                                                error_plot=error_plot,
                                                options=progression_opts)

    except KeyboardInterrupt:
        # Interrupting the kernel during the run is swallowed silently.
        pass
# Bind the widgets to `logistic_SGD`; the trailing `;` hides the call's repr.
interact_manual(
    logistic_SGD,
    dataset=['linear', 'moons', 'circles', 'imbalanced'],
    num_points=num_points_w,
    noise=noise_w,
    reg=reg_w,
    batch_size=batch_size_w,
    lr=lr_w,
    num_iter=num_iter_w,
);


interactive(children=(Dropdown(description='dataset', options=('linear', 'moons', 'circles', 'imbalanced'), va…

# Cost-Sensitive Linear Regression

In [61]:
# Figure size and font size for the cost-sensitive regression demo.
rcParams.update({'figure.figsize': (10, 6), 'font.size': 16})

def cost_sensitive_linear_regression(dataset, tau, degree, alpha, n_samples, noise):
    """Fit a ridge polynomial regression and plot its ``tau``-quantile shift.

    Training inputs are ``n_samples`` sorted uniform draws on [0, 1] with
    targets from ``get_regression_dataset``. A ridge regression on polynomial
    features gives the equal-cost prediction. The cost-sensitive prediction
    adds ``noise * sqrt(2) * erfinv(2 * tau - 1)``, i.e. the ``tau``-quantile
    of a zero-mean Gaussian with standard deviation ``noise``.

    Parameters
    ----------
    dataset : str
        Dataset name forwarded to ``get_regression_dataset``.
    tau : float
        Quantile level in [0, 1]. The endpoints are clipped by one machine
        epsilon so the offset stays finite.
    degree : int
        Degree of the polynomial features used for the fit.
    alpha : float
        Ridge regularisation coefficient.
    n_samples : int
        Number of training points.
    noise : float
        Scale of the noise added to the training targets.
    """
    np.random.seed(42)

    # DATASET
    w_star = np.array([1, 0.2, -0.3, 4])
    X = np.sort(np.random.rand(n_samples))
    # The original code also made a noise-free call here and never used the
    # result. The call still draws random numbers, so it is kept to leave the
    # seeded training noise below exactly as before.
    get_regression_dataset(dataset, X=X, noise=0, w=w_star)
    _, y = get_regression_dataset(dataset, X=X, noise=noise, w=w_star)

    # REGRESSION
    Phi = PolynomialFeatures(degree=degree, include_bias=True).fit_transform(np.atleast_2d(X).T)
    w_hat = Ridge(alpha=alpha, fit_intercept=False).fit(Phi, y).coef_

    # PREDICT
    X_test = np.linspace(0, 1, 100)
    _, f_test = get_regression_dataset(dataset, X=X_test, noise=0, w=w_star)
    Phi_test = PolynomialFeatures(degree=degree, include_bias=True).fit_transform(np.atleast_2d(X_test).T)
    y_equal = Phi_test @ w_hat

    # COST SENSITIVITY
    # erfinv(-1) and erfinv(1) are infinite, so tau = 0 or 1 (both reachable
    # on the slider) gave an infinite offset, or NaN when noise == 0.
    eps = np.finfo(float).eps
    tau_safe = np.clip(tau, eps, 1 - eps)
    y_sensitive = y_equal + noise * np.sqrt(2) * erfinv(2 * tau_safe - 1)

    # PLOT
    plt.plot(X, y, '*', label='Train samples')
    plt.plot(X_test, y_sensitive, label='Quantile Regression')
    plt.plot(X_test, y_equal, label='Linear Regression')
    plt.plot(X_test, f_test, label='True Function')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.ylim(-2, 2);
    
# Bind the widgets to `cost_sensitive_linear_regression`. The trailing `;`
# stops the cell from echoing the function repr that `interact` returns,
# matching the other demo cells.
interact(
    cost_sensitive_linear_regression,
    dataset=['cos', 'sinc', 'linear', 'linear-features'],
    tau=ipywidgets.FloatSlider(value=0.5, min=0, max=1, step=0.001,
                               readout_format='.4f',
                               description='Quantile:', continuous_update=False),
    n_samples=ipywidgets.IntSlider(value=30, min=30, max=1500, step=1,
                                   description='N Samples:', continuous_update=False),
    degree=ipywidgets.IntSlider(value=1, min=1, max=9, step=1,
                                description='Poly Degree:', continuous_update=False),
    alpha=ipywidgets.BoundedFloatText(value=0, min=0, max=1000, step=0.0001,
                                      description='Reg Coef.:', continuous_update=False),
    noise=ipywidgets.FloatSlider(value=0.3, min=0, max=1, step=0.01, readout_format='.2f',
                                 description='Noise level:', continuous_update=False),
);


interactive(children=(Dropdown(description='dataset', options=('cos', 'sinc', 'linear', 'linear-features'), va…

<function __main__.cost_sensitive_linear_regression(dataset, tau, degree, alpha, n_samples, noise)>

# Cost-Sensitive Classification (Logistic Regression)


In [18]:
# Figure size and font size for the cost-sensitive classification demo.
rcParams.update({'figure.figsize': (20, 8), 'font.size': 16})

def cost_sensitive_logistic_regression(dataset, cost_ratio):
    """Plot a logistic-regression decision region under asymmetric costs.

    A ``LogisticRegression`` model is fitted on the first two features of the
    chosen dataset and evaluated on a dense grid. A grid point is labelled 1
    when column 1 of ``predict_proba`` reaches the cost-dependent threshold,
    otherwise 0, so a binary problem is assumed. The left panel shows the
    resulting regions and the right panel the largest class probability.

    Parameters
    ----------
    dataset : str
        Dataset name forwarded to ``get_classification_dataset``.
    cost_ratio : float
        ``cost_false_positive / cost_false_negative``. Larger values make
        false positives more expensive and shrink the positive region.
    """
    # cost_ratio = cost_false_positive / cost_false_negative
    np.random.seed(0)

    # With P(y=1|x) = p, predicting 1 costs (1 - p) * c_fp in expectation and
    # predicting 0 costs p * c_fn. Predicting 1 is cheaper iff
    # p >= c_fp / (c_fp + c_fn) = cost_ratio / (1 + cost_ratio).
    # The previous threshold 1 / (1 + cost_ratio) was the inverse of this.
    min_positive_prob = cost_ratio / (1 + cost_ratio)

    # DATASET
    X, y = get_classification_dataset(dataset, 200)
    X = X[:, :2]

    # REGRESSION
    model = LogisticRegression().fit(X, y)

    # PREDICT
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = .02  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    xy = np.c_[xx.ravel(), yy.ravel()]
    P = model.predict_proba(xy)

    # Cost Sensitive Step: threshold the probability in column 1.
    C = (P[:, 1] >= min_positive_prob).astype(int)

    P = P.max(axis=1)

    C = C.reshape(xx.shape)
    P = P.reshape(xx.shape)

    # Lower end of the colour scale: 1 / (number of distinct labels).
    chance_level = 1. / len(np.unique(y))

    # PLOTS
    fig, axes = plt.subplots(1, 2)
    axes[0].set_title('Classification Boundary')
    axes[0].contourf(xx, yy, C, cmap=plt.cm.jet, alpha=0.5, vmin=0, vmax=1)

    axes[1].set_title('Prediction Probabilities')
    axes[1].contourf(xx, yy, P, cmap=plt.cm.cividis_r, alpha=0.5, vmin=chance_level, vmax=1)
    m = plt.cm.ScalarMappable(cmap=plt.cm.cividis_r)
    m.set_array(P)
    m.set_clim(chance_level, 1.)
    plt.colorbar(m, ax=axes[1])

    for ax in axes:
        # Overlay the training points on both panels.
        ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.jet, vmin=0, vmax=1)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
    plt.show()

# Bind a dataset dropdown and a log-scale cost-ratio slider to the demo.
interact(
    cost_sensitive_logistic_regression,
    dataset=['linear', 'imbalanced', '2-blobs', 'moons'],
    cost_ratio=ipywidgets.FloatLogSlider(
        value=1, min=-3, max=4, step=0.1,
        continuous_update=False,
    ),
);


interactive(children=(Dropdown(description='dataset', options=('linear', 'imbalanced', '2-blobs', 'moons'), va…

# Logistic Regression with abstention


In [17]:
# Figure size and font size for the abstention demo.
rcParams.update({'figure.figsize': (20, 6), 'font.size': 16})

def doubtful_logistic_regression(dataset, min_prob):
    """Plot logistic-regression predictions with an abstention region.

    A ``LogisticRegression`` model is fitted on the first two features of the
    chosen dataset and evaluated on a dense grid. Grid points whose largest
    class probability is below ``min_prob`` are marked as "abstain". The left
    panel shows the predicted regions and the right panel the largest class
    probability.

    Parameters
    ----------
    dataset : str
        Dataset name forwarded to ``get_classification_dataset``.
    min_prob : float
        Minimum largest-class probability needed to commit to a prediction.
    """
    np.random.seed(42)

    # DATASET
    X, y = get_classification_dataset(dataset, 200)
    X = X[:, :2]

    # REGRESSION
    model = LogisticRegression().fit(X, y)

    # PREDICT
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    h = .02  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    xy = np.c_[xx.ravel(), yy.ravel()]
    # Predicted labels are doubled; for integer labels that keeps the value 1
    # free to mark the abstention region below.
    C = 2 * model.predict(xy)
    P = model.predict_proba(xy).max(axis=1)

    # DOUBTFUL STEP: abstain where the model is not confident enough.
    C[P < min_prob] = 1

    C = C.reshape(xx.shape)
    P = P.reshape(xx.shape)

    # Lower end of the colour scale: 1 / (number of distinct labels).
    chance_level = 1. / len(np.unique(y))

    # PLOTS
    fig, axes = plt.subplots(1, 2)
    axes[0].set_title('Classification Boundary')
    axes[0].contourf(xx, yy, C, cmap=plt.cm.jet, alpha=0.5)

    axes[1].set_title('Probability')
    # Use the same limits as the colorbar below. The contour was previously
    # auto-normalised to the data range, so its colours could disagree with
    # the colorbar.
    axes[1].contourf(xx, yy, P, cmap=plt.cm.cividis_r, alpha=0.5, vmin=chance_level, vmax=1)
    m = plt.cm.ScalarMappable(cmap=plt.cm.cividis_r)
    m.set_array(P)
    m.set_clim(chance_level, 1.)
    plt.colorbar(m, ax=axes[1])

    # Plot also the training points
    for ax in axes:
        ax.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.jet)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        ax.set_xticks(())
        ax.set_yticks(())
    plt.show()
  
    
# Bind a dataset dropdown and a confidence-threshold slider to the demo.
interact(
    doubtful_logistic_regression,
    dataset=['linear', 'imbalanced', '2-blobs', '3-blobs', '4-blobs', 'circles', 'moons', 'iris'],
    min_prob=ipywidgets.FloatSlider(
        value=0.75, min=0.25, max=1, step=0.01,
        continuous_update=False,
    ),
);

interactive(children=(Dropdown(description='dataset', options=('linear', 'imbalanced', '2-blobs', '3-blobs', '…