In [1]:
import pandas as pd
import numpy as np
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.manifold import TSNE
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
import gower

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# TSNE

Read in supervised classifications and centered and scaled unlabeled data, and precompute a Gower distance matrix:

In [None]:
# Predicted labels from the three supervised models; the tuple order matches the
# order of the names being assigned.
y_LinearSVC, y_LogReg, y_RidgeReg = map(pd.read_pickle, (
    './data/y_pred_LinearSVC.pkl',
    './data/y_pred_LogReg.pkl',
    './data/y_pred_RidgeReg.pkl',
))

# Unlabeled behavior data (described in the notebook text as centered and scaled).
X = pd.read_pickle('./data/unlabeled_behavior.pkl')

# Pairwise Gower distances over X. The boolean mask marks the columns at positions
# 4 and 8 as categorical; the other seven positions are treated as numeric.
X_Gower = gower.gower_matrix(
    X,
    cat_features=[False] * 4 + [True] + [False] * 3 + [True],
)

Define a plotting function:

In [None]:
@mpl.rc_context({'image.cmap': 'tab10', 'figure.figsize': [12.0, 8.0]})
def plot_with_ys(x1, x2, names, y_class, y_cluster=None, ext_names=False):
    """Plot a 2d embedding, save it as a PNG and display it.

    If only ``y_class`` is given, points are colored by classification. If
    ``y_cluster`` is also given, points are colored by cluster label and the marker
    shape encodes the classification ('|' for class 0, '_' for class 1). Intended
    for use with embeddings generated by TSNE.

    Plots with extended names go to ``./figures/scratch/`` so that selections of
    TSNE parameters can be compared; plots without extended names go to
    ``./figures/``. The output folder is created if it does not exist yet.

    Parameters
    ----------
    x1 : array-like
        Position on the x-axis of each point in the 2d embedding.
    x2 : array-like
        Position on the y-axis of each point in the 2d embedding.
    names : sequence of str
        ``names[0]`` is the classification algorithm and ``names[1]`` the clustering
        algorithm (only read when ``y_cluster`` is given). If ``ext_names`` is True,
        ``names[2]`` is the perplexity and ``names[3]`` the distance metric. Used to
        build the plot title and the file name.
    y_class : array-like
        Classifications from a supervised model. The marker/legend logic treats
        0 as "normal" and 1 as "outlier".
    y_cluster : array-like, optional
        Cluster labelings (default: None).
    ext_names : bool, optional
        Whether to include perplexity and distance metric in the title and file
        name (default: False).
    """
    import os  # local import: only needed to make sure the output folder exists

    # Work on flat numpy arrays so boolean masks behave the same for lists,
    # ndarrays and pandas objects.
    x1 = np.asarray(x1)
    x2 = np.asarray(x2)
    y_class = np.asarray(y_class).ravel()
    with_clusters = y_cluster is not None

    fig, ax = plt.subplots()
    palette = plt.get_cmap('tab10')

    if with_clusters:  # plotting comparison between classification and clustering
        y_cluster = np.asarray(y_cluster).ravel()

        # Map every cluster label to a position 0..n-1 and build a colormap of exactly
        # that size. Passing plain 'tab10' would spread the labels over the whole
        # palette instead of taking the colors sequentially.
        cluster_labels, cluster_idx = np.unique(y_cluster, return_inverse=True)
        n_clusters = len(cluster_labels)
        colors = mpl.colors.ListedColormap(palette(np.arange(n_clusters) % palette.N))

        # One normalization shared by both scatter calls, so a given cluster gets the
        # same color whether the point is classified as normal or as outlier.
        norm = mpl.colors.Normalize(vmin=-0.5, vmax=n_clusters - 0.5)

        # plot the two predicted classes with different markers, coloring by cluster
        for class_value, marker in ((0, '|'), (1, '_')):
            in_class = y_class == class_value
            ax.scatter(x1[in_class], x2[in_class], marker=marker,
                       c=cluster_idx[in_class], cmap=colors, norm=norm)

        # Build the color legend by hand so it lists every cluster, including ones
        # that only contain points of a single class.
        cluster_handles = [
            mlines.Line2D([], [], color=colors(i), marker='o', linestyle='None',
                          markersize=8, label=str(label))
            for i, label in enumerate(cluster_labels)
        ]
        legend1 = ax.legend(handles=cluster_handles, loc="lower left", title="Clusters")
        ax.add_artist(legend1)  # keep it alive when the second legend is created

        # create a legend from scratch for differentiating between markers
        vline = mlines.Line2D([], [], color='black', marker='|', linestyle='None',
                              markersize=10, label='0 (normal)')
        hline = mlines.Line2D([], [], color='black', marker='_', linestyle='None',
                              markersize=10, label='1 (outlier)')
        ax.legend(handles=[vline, hline], loc="lower right", title="Classes")
    else:  # plotting only a classification
        colors = mpl.colors.ListedColormap(palette(np.arange(2)))
        # Fixed limits keep class 0 / class 1 on the same two colors even when only
        # one of the classes is present in y_class.
        scatter = ax.scatter(x1, x2, marker='.', c=y_class, cmap=colors, vmin=0, vmax=1)
        ax.legend(*scatter.legend_elements(), loc="lower left", title="Classes")

    # insert given names to title and filename
    title = 'TSNE '
    stem = 'TSNE_'
    if with_clusters:
        title += names[1] + ' Clusterings & '
        stem += names[1] + '_'
    title += names[0] + ' Classifications'
    stem += names[0]
    if ext_names:  # with names for perplexity and distance metric
        title += ' p' + names[2] + ' ' + names[3]
        stem += '_p' + names[2] + '_' + names[3]
        out_dir = './figures/scratch/'
    else:  # without names for perplexity and distance metric
        out_dir = './figures/'
    filename = out_dir + stem + '.png'

    ax.set_title(title)
    os.makedirs(out_dir, exist_ok=True)
    fig.savefig(filename, format='png')

    plt.show()
    # Release the figure explicitly; this function is called many times in loops.
    plt.close(fig)

Let's run TSNE with various values of perplexity and distance functions. Why only these two? For one, it takes a little while for each model to run, so we need to cut down on the number of combinations to try. What about the other parameters?

- early_exaggeration: according to docs "the choice of this parameter is not very critical" so we won't worry about this unless we're having trouble getting reasonable outputs with the default value
- learning_rate: the 'auto' option for our dataset works out to about 650, which is reasonable considering the typical range is 10-1000; initial outputs looked good, i.e. results did not look like a uniform ball or a dense cloud with few outliers, so it seems reasonable to continue with this learning rate unless results look particularly bad
- method: the exact gradient calculation algorithm crashed my ipykernel so we'll stick with Barnes-Hut approximation
- angle: we will leave this at the default 0.5 because initial runs completed in a reasonable time while giving decent results, and Barnes-Hut "is not very sensitive to changes in this parameter in the range of 0.2 - 0.8"
- max_iter: initial runs seemed to optimize fine at the default of 1000 so we'll leave this be unless runs are failing to complete
- init: since initial runs even at high perplexity did not have better outputs with 'random' than 'pca' we will stick with PCA. PCA initialization is also usually more globally stable. However, PCA initialization is not supported for pre-computed distance matrices, which we will need when using the Gower distance, so in this one case we will use random initializations.

How did we choose the range of perplexities to try? Initial runs with high perplexity (100, 500) did not result in better separation of the data in accordance with the supervised classifications. In fact, high perplexity values resulted in a separation that cut each of the classes in half. I suspect that these embeddings were over-focusing on the differences between data from the two different sources used for the dataset, which is composed of roughly half from each source. We will therefore try embeddings with perplexity from 5 to 65 moving in steps of 10.

How did we choose which distance metrics to try? We have 7 centered and scaled continuous variables, and 2 categorical variables with 2 levels each. We will run models with euclidean distance, l2 norm, cosine distance, and correlation distance as available from sklearn and scipy. We will also run models using the Gower distance, which uses Manhattan distance for continuous variables and Dice distance for binary variables. The implementation of Gower distance comes from the [gower](https://github.com/wwwjk366/gower) package.

In [None]:
%%capture

perplexities = np.arange(5, 70, 10)
distance_metrics = ['euclidean', 'l2', 'cosine', 'correlation']

for p in perplexities:
    
    #apply TSNE with Gower distance
    model = TSNE(n_components=2, perplexity=p, learning_rate='auto', metric='precomputed',
                 init='random', method='barnes_hut', square_distances=True, n_jobs=-1)
    X_new = model.fit_transform(X_Gower)
    plot_with_ys(X_new[:,0], X_new[:,1], ['LinearSVC', None, str(p), 'Gower'],
                 y_LinearSVC, ext_names=True)
    plot_with_ys(X_new[:,0], X_new[:,1], ['LogReg', None, str(p), 'Gower'],
                 y_LogReg, ext_names=True)
    plot_with_ys(X_new[:,0], X_new[:,1], ['RidgeReg', None, str(p), 'Gower'],
                 y_RidgeReg, ext_names=True)
    
    for d in distance_metrics: 
        
        #apply TSNE with distance metrics from sklearn and scipy
        model = TSNE(n_components=2, perplexity=p, learning_rate='auto', metric=d, init='pca',
                     method='barnes_hut', square_distances=True, n_jobs=-1)
        X_new = model.fit_transform(X)
        plot_with_ys(X_new[:,0], X_new[:,1], ['LinearSVC', None, str(p), d],
                     y_LinearSVC, ext_names=True)
        plot_with_ys(X_new[:,0], X_new[:,1], ['LogReg', None, str(p), d],
                     y_LogReg, ext_names=True)
        plot_with_ys(X_new[:,0], X_new[:,1], ['RidgeReg', None, str(p), d],
                     y_RidgeReg, ext_names=True)

Initialize a model. We want a 2-dimensional representation.

Compute the embedding:

Plot the embedding, colored by classifications from LinearSVC:

Plot the embedding, colored by classifications from LogReg:

Plot the embedding, colored by classifications from RidgeReg:

example plot using y_RidgeReg as though it were a clustering output