In [64]:
import pandas as pd
import numpy as np
import pickle
import gower
from collections import namedtuple

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.lines as mlines

from sklearn.metrics import normalized_mutual_info_score as nmi

In [2]:
#load the three sets of classifier predictions in one pass (same files, same order)
y_LinearSVC, y_LogReg, y_RidgeReg = [
    pd.read_pickle(pkl_path)
    for pkl_path in ('./data/y_pred_LinearSVC.pkl',
                     './data/y_pred_LogReg.pkl',
                     './data/y_pred_RidgeReg.pkl')
]

#load the unlabeled behavior data and its embedding
X = pd.read_pickle('./data/unlabeled_behavior.pkl')
embedding = pd.read_pickle('./data/embedding.pkl')

#define parameter ranges
#the search originally planned for min_samples was np.arange(95, 201, 15), but generating all of the
#models at min_samples=95 alone took 15 hours of uninterrupted compute time. The s=95 results are
#therefore read back from disk below, and only 200 remains in the range defined here.
min_samples_range = [200]
max_eps_range = np.arange(start=1.0, stop=2.1, step=0.5)
eps_range = np.arange(start=1.0, stop=2.1, step=0.5)

#create a data structure to store cluster assignments and scorings
Key = namedtuple(typename='Key', field_names=['s', 'me', 'e'])

#For reference, this is how Labels_Scores was initialized; execution had to be interrupted and the
#structure saved after the s=95 runs completed:
#
#Labels_Scores = {Key(s, me, e):{'labels_g':None, 'labels_e':None, 'relabels_g':None, 'relabels_e':None,
#                                'score_e':{'lsvm':None, 'lr':None, 'rr':None},
#                                'score_g':{'lsvm':None, 'lr':None, 'rr':None}}
#                 for s, me, e in [[s, me, e] for s in min_samples_range
#                                             for me in max_eps_range
#                                             for e in eps_range]}

#read in the partially filled Labels_Scores structure
#NOTE: unpickling can execute arbitrary code, so only load files from a trusted source
with open('./data/labels_scores.pkl', 'rb') as pkl_file:
    Labels_Scores = pickle.load(pkl_file)

In [53]:
@mpl.rc_context({'figure.figsize': [12.0, 8.0]})
def plot_with_ys(x1, x2, names, y_class, y_cluster, ext_names=False, relabeled=False, score=None,
                 save=False):
    """Plot a 2d embedding with points colored by cluster label and shaped by predicted class.

    Intended for use with embeddings generated by TSNE. Class 0 points are drawn with a
    vertical-bar marker and class 1 points with a horizontal-bar marker; points whose class
    is neither 0 nor 1 are not drawn. Every cluster id gets the same color in both class
    layers, and the cluster legend lists every cluster id present in y_cluster.

    Plots with extended names target a scratch folder so selections of OPTICS parameters can
    be compared; plots without extended names target the main figures folder.

    Parameters
    ----------
    x1 : array-like
        Position on the x-axis of each point in the 2d embedding.
    x2 : array-like
        Position on the y-axis of each point in the 2d embedding.
    names : sequence
        Name of the classification algorithm (index 0) and of the clustering algorithm
        (index 1). If ext_names=True it must also hold min_samples (index 2), max_eps
        (index 3), eps (index 4) and the distance metric (index 5). Entries are converted
        with str(), so numbers are accepted.
    y_class : array-like
        Classifications from a supervised model (0 = normal, 1 = outlier).
    y_cluster : array-like
        Cluster label of each point. Required.
    ext_names : bool, default False
        Whether to put the extended names into the title and filename.
    relabeled : bool, default False
        Whether the clusters have been relabeled to enable computation of mutual
        information scores; changes the title and filename.
    score : float, optional
        Mutual information score of the clustering against the classification. It is
        written onto the plot only when relabeled=True and a score is given.
    save : bool, default False
        If True, write the figure as a PNG to the generated filename (creating the target
        folder if needed) before showing it.

    Raises
    ------
    ValueError
        If the inputs are empty or do not all have the same length.
    """
    import os  #local import: only needed by the optional save step

    #flat numpy arrays so that boolean masks line up positionally with every input
    x1 = np.asarray(x1).ravel()
    x2 = np.asarray(x2).ravel()
    y_class = np.asarray(y_class).ravel()
    y_cluster = np.asarray(y_cluster).ravel()
    if not (len(x1) == len(x2) == len(y_class) == len(y_cluster)):
        raise ValueError('x1, x2, y_class and y_cluster must all have the same length')
    if len(x1) == 0:
        raise ValueError('plot_with_ys needs at least one data point')

    fig, ax = plt.subplots()

    #translate cluster ids into dense codes 0..k-1 so each id owns exactly one color, even
    #when the ids are not consecutive integers
    cluster_ids, cluster_codes = np.unique(y_cluster, return_inverse=True)
    n_clusters = len(cluster_ids)

    #create a colormap of the correct size, doing this bc just giving 'tab10' to the cmap
    #parameter gives colors from each end of the palette instead of sequentially
    palette = plt.get_cmap('tab10')
    colors = mpl.colors.ListedColormap(palette(np.arange(n_clusters) % palette.N))
    #one normalization shared by both scatter layers; without it each layer autoscales to its
    #own label range and the same cluster can end up with two different colors
    norm = mpl.colors.Normalize(vmin=-0.5, vmax=n_clusters - 0.5)

    #plot the two predicted classes with different markers, coloring by cluster assignment
    for class_value, marker in ((0, '|'), (1, '_')):
        in_class = y_class == class_value
        if in_class.any():
            ax.scatter(x1[in_class], x2[in_class], c=cluster_codes[in_class],
                       cmap=colors, norm=norm, marker=marker)

    #create a legend for differentiating between colors; built from scratch so that clusters
    #which only occur in one of the classes are still listed
    cluster_handles = [mlines.Line2D([], [], color=colors(code), marker='o', linestyle='None',
                                     markersize=8, label=str(cluster_id))
                       for code, cluster_id in enumerate(cluster_ids)]
    legend1 = ax.legend(handles=cluster_handles, loc="lower left", title="Clusters")
    ax.add_artist(legend1)

    #create a legend from scratch for differentiating between markers
    vline = mlines.Line2D([], [], color='black', marker='|', linestyle='None',
                          markersize=10, label='0 (normal)')
    hline = mlines.Line2D([], [], color='black', marker='_', linestyle='None',
                          markersize=10, label='1 (outlier)')
    ax.legend(handles=[vline, hline], loc="lower right", title="Classes")

    if relabeled and score is not None: #add text reporting the mutual information score
        ax.text(0.02, 0.98, 'NMI score: {:.4f}'.format(score),
                transform=ax.transAxes, ha='left', va='top')

    #insert given names to title and filename
    class_name, cluster_name = str(names[0]), str(names[1])
    title = ('TSNE Relabeled ' if relabeled else 'TSNE ') \
            + cluster_name + ' Clusterings & ' + class_name + ' Classifications'
    stem = ('TSNE_relabeled_' if relabeled else 'TSNE_') + cluster_name + '_' + class_name

    if ext_names: #with names for min_samples, max_eps, eps, and distance metric
        s, me, e, metric = (str(part) for part in names[2:6])
        title += ' s' + s + ' me' + me + ' e' + e + ' ' + metric
        filename = './figures/scratch/' + stem + '_s' + s + '_me' + me + '_e' + e + '_' + metric + '.png'
    else: #without names for min_samples, max_eps, eps, and distance metric
        filename = './figures/' + stem + '.png'

    ax.set_title(title)

    if save:
        os.makedirs(os.path.dirname(filename), exist_ok=True)
        fig.savefig(filename, format='png')

    plt.show()
    #release the figure so that calling this in a loop does not accumulate open figures
    plt.close(fig)

In [81]:
#now that we've seen the initial plots, let's relabel the clusters and compute mutual information
#scores of the relabelings against each classifier's predictions

def relabel(arr):
    """Collapse cluster labels into two classes: cluster 0 -> 0, every other cluster -> 1.

    From manual inspection, everything outside cluster 0 can be named class 1.
    Returns a numpy integer array with one entry per input label.
    Raises TypeError if arr is not a sequence of labels (e.g. an unfilled None entry).
    """
    labels = np.asarray(arr)
    if labels.ndim == 0:
        raise TypeError('relabel expects an array of cluster labels, got %r' % (arr,))
    return np.where(labels == 0, 0, 1)

#classifier predictions, keyed by the names used inside the 'score_g' / 'score_e' dictionaries
y_pred_by_model = {'lsvm': y_LinearSVC, 'lr': y_LogReg, 'rr': y_RidgeReg}

#only the s=95 models are scored here
for s in [95]:
    for me in max_eps_range:
        for e in eps_range:
            key = Key(s, me, e)
            entry = Labels_Scores[key]

            #'g' and 'e' are the two sets of cluster labels stored per parameter combination
            for suffix in ('g', 'e'):
                #create relabelings
                entry['relabels_' + suffix] = relabel(entry['labels_' + suffix])

                #compute mutual information scores
                for model_name, y_pred in y_pred_by_model.items():
                    entry['score_' + suffix][model_name] = nmi(entry['relabels_' + suffix], y_pred)
    

In [82]:
#identify the best model and score
#scan every s=95 parameter combination; for each one the euclidean scores are visited before the
#gower scores, and a candidate only replaces the current best when its score is strictly higher
best_model, best_score = None, 0
for s in [95]:
    for me in max_eps_range:
        for e in eps_range:
            key = Key(s, me, e)
            for score_field, metric_name in (('score_e', 'euclidean'), ('score_g', 'gower')):
                for model_name, model_score in Labels_Scores[key][score_field].items():
                    if model_score > best_score:
                        best_model = (s, me, e, model_name, metric_name)
                        best_score = model_score

In [83]:
best_model

(95, 1.0, 1.0, 'rr', 'euclidean')

In [84]:
best_score

0.004819616139413582