In [6]:
!pip install gower

import os
import pickle
from collections import namedtuple

import gower
import matplotlib as mpl
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# patch_sklearn() must run before anything is imported from sklearn
from sklearnex import patch_sklearn
patch_sklearn()
from sklearn.cluster import OPTICS, cluster_optics_dbscan
from sklearn.metrics import mutual_info_score

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


# OPTICS

\[**TALK ABOUT INITIAL RUNS, CHOOSING PARAMETER RANGES, CHOOSING OPTIONS (KD-TREE, DBSCAN EXTRACTION, ETC) AND SO FORTH**\]

Define a model:
min_samples = (95, 215, 15)
max_eps = (1, 2, 0.5)
eps = (1, 2, 0.5)

Load data:

In [30]:
#read, in order: the three pickled prediction sets (LinearSVC, LogReg, RidgeReg), the unlabeled
#behavior table, and the embedding used for plotting
#NOTE: unpickling executes arbitrary code, so only load pickle files produced by this project
(y_LinearSVC, y_LogReg, y_RidgeReg, X, embedding) = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        './data/y_pred_LinearSVC.pkl',
        './data/y_pred_LogReg.pkl',
        './data/y_pred_RidgeReg.pkl',
        './data/unlabeled_behavior.pkl',
        './data/embedding.pkl',
    )
]

Generate Gower matrix and data structure to store cluster assignments and scorings from parameter search:

In [None]:
#one flag per column of X, passed to gower as cat_features (True at positions 4 and 8)
categorical_flags = [False, False, False, False, True,
                     False, False, False, True]

#pairwise gower matrix, wrapped in a DataFrame
X_gower = pd.DataFrame(gower.gower_matrix(X, cat_features=categorical_flags))

#define parameter ranges
min_samples_range = np.arange(95, 201, 15)
max_eps_range = np.arange(1.0, 2.1, 0.5)
eps_range = np.arange(1.0, 2.1, 0.5)

#create a data structure to store cluster assignments and scorings:
#one entry per (min_samples, max_eps, eps) combination, every slot starting out as None
Key = namedtuple('Key', ['s', 'me', 'e'])


def _empty_result():
    #build a fresh, unshared record for a single parameter combination
    return {
        'labels_g': None,
        'labels_e': None,
        'relabels_g': None,
        'relabels_e': None,
        'score_e': {'lsvm': None, 'lr': None, 'rr': None},
        'score_g': {'lsvm': None, 'lr': None, 'rr': None},
    }


Labels_Scores = {
    Key(s, me, e): _empty_result()
    for s in min_samples_range
    for me in max_eps_range
    for e in eps_range
}

Define plotting function (this is modified from the version for TSNE):

In [3]:
@mpl.rc_context({'image.cmap': 'tab10', 'figure.figsize': [12.0, 8.0]})
def plot_with_ys(x1, x2, names, y_class, y_cluster, ext_names=False, relabeled=False, score=None):
    """Plot a 2d embedding, colored by cluster label and marked by predicted class.

    Points whose class is 0 are drawn with a '|' marker and points whose class is 1 with
    a '_' marker; each point's color is determined by its cluster label. The figure is
    saved as a PNG, shown, and then closed. Plots with extended names go to the scratch
    folder so selections of OPTICS parameters can be compared; plots without extended
    names go to the main figures folder.

    Parameters
    ----------
    x1 : array-like
        Position on the x-axis of each point in the 2d embedding.
    x2 : array-like
        Position on the y-axis of each point in the 2d embedding.
    names : sequence of str
        names[0] is the classification algorithm and names[1] the clustering algorithm.
        If ext_names=True it must also hold min_samples (index 2), max_eps (index 3),
        eps (index 4), and the distance metric (index 5). Used for the title and filename.
    y_class : array-like
        Classifications from a supervised model (0 = normal, 1 = outlier).
    y_cluster : array-like
        Cluster label of each point.
    ext_names : bool, default False
        Whether to use the extended names in the title and filename.
    relabeled : bool, default False
        Whether clusters have been relabeled to enable computation of mutual
        information scores.
    score : optional, default None
        Mutual information score of the clustering against the classification. It is
        written onto the plot only when relabeled=True and a score is given.
    """
    #flatten everything to 1d numpy arrays so boolean masks line up whatever container
    #(Series, single-column frame, list, ndarray) the caller passed in
    x1 = np.asarray(x1).ravel()
    x2 = np.asarray(x2).ravel()
    y_class = np.asarray(y_class).ravel()
    y_cluster = np.asarray(y_cluster).ravel()

    fig, ax = plt.subplots()

    #take colors sequentially from tab10, one per distinct cluster label, and assign them
    #per point up front; letting each scatter call normalize its own labels would give the
    #same cluster different colors in the two classes whenever they contain different clusters
    cluster_ids = np.unique(y_cluster)
    palette = plt.get_cmap('tab10')(np.arange(len(cluster_ids)))
    point_colors = palette[np.searchsorted(cluster_ids, y_cluster)]

    #plot the two predicted classes with different markers, coloring by cluster assignment
    is_normal = y_class == 0
    is_outlier = y_class == 1
    ax.scatter(x1[is_normal], x2[is_normal], marker='|', c=point_colors[is_normal])
    ax.scatter(x1[is_outlier], x2[is_outlier], marker='_', c=point_colors[is_outlier])

    #create a legend for differentiating between colors; built from proxy handles so that
    #every cluster is listed, not only the clusters that occur among the class-0 points
    cluster_handles = [mlines.Line2D([], [], color=palette[i], marker='o', linestyle='None',
                                     markersize=8, label=str(cluster_id))
                       for i, cluster_id in enumerate(cluster_ids)]
    legend1 = ax.legend(handles=cluster_handles, loc="lower left", title="Clusters")
    ax.add_artist(legend1)

    #create a legend from scratch for differentiating between markers
    vline = mlines.Line2D([], [], color='black', marker='|', linestyle='None',
                          markersize=10, label='0 (normal)')
    hline = mlines.Line2D([], [], color='black', marker='_', linestyle='None',
                          markersize=10, label='1 (outlier)')
    ax.legend(handles=[vline, hline], loc="lower right", title="Classes")

    if relabeled and score is not None: #add text reporting the mutual information score
        try:
            score_text = format(float(score), '.4f')
        except (TypeError, ValueError):
            score_text = str(score)
        #upper-left corner in axes coordinates, clear of the two legends along the bottom
        ax.text(0.01, 0.99, 'Mutual information score: ' + score_text,
                transform=ax.transAxes, ha='left', va='top')

    #insert given names to title and filename
    title = ('TSNE Relabeled ' if relabeled else 'TSNE ')
    title += names[1] + ' Clusterings & ' + names[0] + ' Classifications'
    file_stem = ('TSNE_relabeled_' if relabeled else 'TSNE_') + names[1] + '_' + names[0]

    if ext_names: #with names for min_samples, max_eps, eps, and distance metric
        title += ' s' + names[2] + ' me' + names[3] + ' e' + names[4] + ' ' + names[5]
        filename = ('./figures/scratch/' + file_stem + '_s' + names[2] + '_me' + names[3]
                    + '_e' + names[4] + '_' + names[5] + '.png')
    else: #without names for min_samples, max_eps, eps, and distance metric
        filename = './figures/' + file_stem + '.png'

    ax.set_title(title)

    #savefig does not create missing folders, so make sure the target folder exists
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    fig.savefig(filename, format='png')

    plt.show()

    #this function is called many times in a loop; release the figure so they don't pile up
    plt.close(fig)

Let's search over the range of parameters identified by our initial runs. We will generate and plot clusterings for each parameter combination, relabel clusterings so that the largest component is 0 (normal) and all others are 1 (outlier), compute mutual information scores against classifications, and replot with relabeled clusters and scores.

In [None]:
%%capture

#first let's just compute the cluster assignments and plot them

#predictions to plot each clustering against, keyed by the name shown in titles/filenames
plot_classifiers = {'LinearSVC': y_LinearSVC, 'LogReg': y_LogReg, 'RidgeReg': y_RidgeReg}

for s in min_samples_range:

    for me in max_eps_range:

        #fit model with gower distance; X_gower is already a distance matrix, and the
        #tree-based neighbor searches reject metric='precomputed', so use brute force here
        model_g = OPTICS(min_samples=s, max_eps=me, metric='precomputed',
                         algorithm='brute', n_jobs=-1)
        model_g.fit(X_gower)

        #fit model with euclidean distance
        model_e = OPTICS(min_samples=s, max_eps=me, metric='euclidean',
                         algorithm='kd_tree', n_jobs=-1)
        model_e.fit(X)

        for e in eps_range:
            #NOTE(review): eps_range also contains values above the current max_eps, while the
            #scikit-learn docs ask for eps below max_eps - confirm these combinations are wanted

            #extract clusters
            labels_g = cluster_optics_dbscan(reachability=model_g.reachability_,
                                             core_distances=model_g.core_distances_,
                                             ordering=model_g.ordering_, eps=e)
            labels_e = cluster_optics_dbscan(reachability=model_e.reachability_,
                                             core_distances=model_e.core_distances_,
                                             ordering=model_e.ordering_, eps=e)

            #add clusters to Labels_Scores dictionary
            Labels_Scores[Key(s, me, e)]['labels_g'] = labels_g
            Labels_Scores[Key(s, me, e)]['labels_e'] = labels_e

            #plot original clusterings, first for gower and then for euclidean, each against
            #every classifier's predictions
            for metric_name, labels in (('gower', labels_g), ('euclidean', labels_e)):
                for clf_name, y_pred in plot_classifiers.items():
                    plot_with_ys(embedding[0], embedding[1],
                                 [clf_name, 'OPTICS', str(s), str(me), str(e), metric_name],
                                 y_pred, labels, ext_names=True)

In [None]:
%%capture

#now that we've seen the initial plots, let's relabel the clusters, compute mutual information scores,
#and generate plots with relabelings and scores

def relabel_largest_as_normal(labels):
    """Collapse cluster labels to 0 (largest cluster) and 1 (everything else).

    Negative labels are never chosen as the largest cluster, so they are always mapped
    to 1. If there is no non-negative label at all, every point is mapped to 1.
    NOTE(review): treating negative labels as noise that can never be "normal" is an
    assumption - confirm this is the intended relabeling.

    Parameters
    ----------
    labels : array-like of int
        Cluster label of each point.

    Returns
    -------
    numpy.ndarray of int
        Array of the same shape as labels holding only 0s and 1s.
    """
    labels = np.asarray(labels)
    relabels = np.ones(labels.shape, dtype=int)
    cluster_ids, counts = np.unique(labels[labels >= 0], return_counts=True)
    if cluster_ids.size:
        relabels[labels == cluster_ids[np.argmax(counts)]] = 0
    return relabels


#score key in Labels_Scores -> (name shown in titles/filenames, predictions)
scored_classifiers = {'lsvm': ('LinearSVC', y_LinearSVC),
                      'lr': ('LogReg', y_LogReg),
                      'rr': ('RidgeReg', y_RidgeReg)}

for s, me, e in ((s, me, e) for s in min_samples_range
                            for me in max_eps_range
                            for e in eps_range):

    entry = Labels_Scores[Key(s, me, e)]

    #handle gower ('g') first and then euclidean ('e')
    for suffix, metric_name in (('g', 'gower'), ('e', 'euclidean')):

        labels = entry['labels_' + suffix]
        if labels is None:
            raise RuntimeError('no ' + metric_name + ' labels stored for ' + str(Key(s, me, e))
                               + '; run the parameter search cell first')

        #create relabelings and add them to Labels_Scores dictionary
        relabels = relabel_largest_as_normal(labels)
        entry['relabels_' + suffix] = relabels

        for score_key, (clf_name, y_pred) in scored_classifiers.items():

            #compute mutual information score of the relabeling against the classification
            #and add it to Labels_Scores dictionary
            score = mutual_info_score(np.asarray(y_pred).ravel(), relabels)
            entry['score_' + suffix][score_key] = score

            #plot relabeled clusterings with scores
            plot_with_ys(embedding[0], embedding[1],
                         [clf_name, 'OPTICS', str(s), str(me), str(e), metric_name],
                         y_pred, relabels, ext_names=True, relabeled=True, score=score)