In [1]:
import os
import pickle
from collections import namedtuple

import gower
import matplotlib as mpl
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import normalized_mutual_info_score as nmi

In [2]:
#load the pickled inputs from ./data
#NOTE(review): judging by the file names, the y_* objects are the predictions of the LinearSVC,
#logistic regression and ridge models - confirm against the notebook that wrote them
y_LinearSVC = pd.read_pickle('./data/y_pred_LinearSVC.pkl')
y_LogReg = pd.read_pickle('./data/y_pred_LogReg.pkl')
y_RidgeReg = pd.read_pickle('./data/y_pred_RidgeReg.pkl')
#X and embedding are loaded here but not referenced by the cells visible in this section
X = pd.read_pickle('./data/unlabeled_behavior.pkl')
embedding = pd.read_pickle('./data/embedding.pkl')

#define parameter ranges
#we originally wanted to search over the range commented below, but it took 15 hours of uninterrupted
#compute time to generate all of the models at min_samples=95, so we'll just run 95 and 200
min_samples_range = [95, 200] #np.arange(95, 201, 15)
#both ranges evaluate to the three values 1.0, 1.5 and 2.0 (the 2.1 stop keeps 2.0 in the range)
max_eps_range = np.arange(1.0, 2.1, 0.5)
eps_range = np.arange(1.0, 2.1, 0.5)

#create a data structure to store cluster assignments and scorings
#Key(s, me, e) identifies one parameter combination: s is drawn from min_samples_range,
#me from max_eps_range and e from eps_range
Key = namedtuple('Key', ['s', 'me', 'e'])
#the bare string below is kept only as a record of the initialization code; it is never executed
'''
#This is how Labels_Scores was initialized, we had to interrupt execution and save it after the s=95
#runs completed.
Labels_Scores = {Key(s, me, e):{'labels_g':None, 'labels_e':None, 'relabels_g':None, 'relabels_e':None,
                                'score_e':{'lsvm':None, 'lr':None, 'rr':None},
                                'score_g':{'lsvm':None, 'lr':None, 'rr':None}}
                 for s, me, e in [[s, me, e] for s in min_samples_range
                                             for me in max_eps_range
                                             for e in eps_range]}
'''
#read in the partially filled Labels_Scores structure
#entries that were never generated may still hold None
#NOTE(review): pickle.load can execute arbitrary code, so only load files produced by this project
with open('./data/labels_scores.pkl', 'rb') as handle:
    Labels_Scores = pickle.load(handle)

In [3]:
#now that we've seen the initial plots, relabel the clusters and compute normalized mutual
#information scores between each relabeling and each classifier's predictions

def relabel(arr):
    '''
    Collapse cluster labels into two classes.

    From manual inspection, everything outside cluster 0 is named class 1, so label 0
    maps to 0 and every other label maps to 1.

    arr: sequence or array of cluster labels
    returns: numpy array of 0/1 values with one entry per input label
    '''
    return np.where(np.asarray(arr) == 0, 0, 1)

#classifier predictions to compare against, keyed the same way as the score dicts
predictions = {'lsvm': y_LinearSVC, 'lr': y_LogReg, 'rr': y_RidgeReg}

#label sets that are still None because their models were never generated
unlabeled = []

for s in min_samples_range:
    for me in max_eps_range:
        for e in eps_range:
            key = Key(s, me, e)
            entry = Labels_Scores[key]

            #'g' and 'e' select the gower and euclidean variants of each field
            for suffix in ('g', 'e'):
                labels = entry['labels_' + suffix]
                if labels is None:
                    #Labels_Scores is only partially filled; leave this entry untouched
                    #instead of aborting the whole cell
                    unlabeled.append((key, suffix))
                    continue

                relabeled = relabel(labels)
                entry['relabels_' + suffix] = relabeled

                #compute mutual information scores
                for name, y_pred in predictions.items():
                    entry['score_' + suffix][name] = nmi(relabeled, y_pred)

if unlabeled:
    print('skipped %d label sets that have not been generated yet' % len(unlabeled))
    

In [4]:
#identify the best model and score
#only the min_samples=95 entries are searched; the winner is recorded as
#(min_samples, max_eps, eps, classifier key, distance metric)
best_model = None
best_score = 0
for s in [95]:
    for me in max_eps_range:
        for e in eps_range:
            key = Key(s, me, e)

            #euclidean scores are checked before gower ones and the comparison is strict,
            #so on a tie the earlier entry is kept
            for field, metric in (('score_e', 'euclidean'), ('score_g', 'gower')):
                for k, v in Labels_Scores[key][field].items():
                    if v > best_score:
                        best_model = (s, me, e, k, metric)
                        best_score = v

In [5]:
#display the winning configuration: (min_samples, max_eps, eps, classifier key, distance metric)
best_model

(95, 1.0, 1.0, 'rr', 'euclidean')

In [6]:
#display the highest normalized mutual information found; per the scikit-learn documentation
#NMI ranges from 0 (no mutual information) to 1 (perfect correlation)
best_score

0.004819616139413582

In [7]:
#save the updated Labels_Scores back to the file it was loaded from
#write to a temporary file in the same directory first and then swap it into place, so an
#interrupted dump cannot truncate the only copy of results that took many hours to compute
labels_scores_path = './data/labels_scores.pkl'
labels_scores_tmp = labels_scores_path + '.tmp'
with open(labels_scores_tmp, 'wb') as handle:
    pickle.dump(Labels_Scores, handle, protocol=pickle.HIGHEST_PROTOCOL)
os.replace(labels_scores_tmp, labels_scores_path)