In [1]:
import numpy as np
from scipy.stats import zscore


def similarity(X, Y, method):
    '''
    SIMILARITY Computes similarity matrices

    Usage:
        sim = similarity(X, Y, method)

    Input:
    X   N1 x M matrix (a 1-D sequence is treated as a single 1 x M row)
    Y   N2 x M matrix (a 1-D sequence is treated as a single 1 x M row)
    method   string defining one of the following similarity measure
             (only the first three characters are used, case-insensitive)
           'SMC', 'smc'             : Simple Matching Coefficient
           'Jaccard', 'jac'         : Jaccard coefficient 
           'ExtendedJaccard', 'ext' : The Extended Jaccard coefficient
           'Cosine', 'cos'          : Cosine Similarity
           'Correlation', 'cor'     : Correlation coefficient

    Output:
    sim N1 x N2 similarity matrix (numpy.matrix) between the rows of X
        and the rows of Y.
        SMC and Jaccard assume X and Y are already binary (0/1); this
        function does NOT binarize its input.
        Degenerate rows (e.g. all-zero rows for Jaccard/Cosine, constant
        rows for Correlation) divide by zero and yield nan/inf entries.

    Raises:
    ValueError  if X and Y have a different number of columns, or if
                method is not one of the measures listed above.

    Copyright, Morten Morup and Mikkel N. Schmidt
    Technical University of Denmark '''

    # Compute on plain float arrays: np.mat was removed in NumPy 2.0, and a
    # float dtype lets boolean inputs go through the `1 - X` terms below.
    X = np.atleast_2d(np.asarray(X, dtype=float))
    Y = np.atleast_2d(np.asarray(Y, dtype=float))
    M = X.shape[1]
    if Y.shape[1] != M:
        raise ValueError(
            "X and Y must have the same number of columns, got %d and %d"
            % (M, Y.shape[1]))

    method = method[:3].lower()
    if method == 'smc':  # SMC: (both-one matches + both-zero matches) / M
        sim = (X @ Y.T + (1 - X) @ (1 - Y).T) / M
    elif method == 'jac':  # Jaccard: both-one matches / (M - both-zero matches)
        sim = (X @ Y.T) / (M - (1 - X) @ (1 - Y).T)
    elif method == 'ext':  # Extended Jaccard: x.y / (|x|^2 + |y|^2 - x.y)
        XYt = X @ Y.T
        sq_x = np.sum(X ** 2, axis=1)
        sq_y = np.sum(Y ** 2, axis=1)
        # Add the squared norms directly; going through log(exp(a) * exp(b))
        # overflows to inf as soon as a squared norm exceeds ~709.
        sim = XYt / (sq_x[:, None] + sq_y[None, :] - XYt)
    elif method == 'cos':  # Cosine: x.y / (|x| * |y|)
        norm_x = np.sqrt(np.sum(X ** 2, axis=1))
        norm_y = np.sqrt(np.sum(Y ** 2, axis=1))
        sim = (X @ Y.T) / (norm_x[:, None] * norm_y[None, :])
    elif method == 'cor':  # Correlation: row-wise z-scores, then x.y / (M - 1)
        X_ = zscore(X, axis=1, ddof=1)
        Y_ = zscore(Y, axis=1, ddof=1)
        sim = (X_ @ Y_.T) / (M - 1)
    else:
        raise ValueError(
            "Unknown similarity method %r; expected one of "
            "'smc', 'jac', 'ext', 'cos', 'cor'" % (method,))

    # Keep returning a numpy.matrix so existing callers see the same type.
    return np.asmatrix(sim)
        
def binarize(X,Y=None):
    ''' Force binary representation of the matrix, according to X>median(X)

    Each column (attribute) is thresholded against its own median: entries
    strictly greater than the median become 1, all others become 0.

    Input:
    X   N1 x M array-like
    Y   optional N2 x M array-like. When given, the column medians are
        computed over the rows of X and Y stacked together, and both
        inputs are thresholded against those joint medians.

    Output:
    The binarized X as a numpy.matrix when Y is None, otherwise the list
    [X, Y] of binarized matrices. Results keep the dtype of the
    corresponding input; the inputs themselves are not modified.
    '''
    def _threshold(data, medians):
        # Strictly-greater comparison, cast back to the input dtype so the
        # result holds 0/1 values of the same type as the data passed in.
        return np.asmatrix((data > medians).astype(data.dtype))

    X = np.atleast_2d(np.asarray(X))
    # Identity test: `Y == None` is elementwise for arrays and cannot be
    # used as an `if` condition.
    if Y is None:
        return _threshold(X, np.median(X, axis=0))

    Y = np.atleast_2d(np.asarray(Y))
    joint_medians = np.median(np.vstack((X, Y)), axis=0)
    return [_threshold(X, joint_medians), _threshold(Y, joint_medians)]

In [2]:
import sklearn as sk
from sklearn import metrics
import numpy as np


# Example observation vectors. The numeric tags after some of them are the
# author's own markers; their meaning is not evident from this notebook.
o1 = [1, 0, 1, 0, 0, 1]  #12
o2 = [1, 0, 1, 0, 1, 0]
# NOTE(review): o3 has 5 entries while o1/o2/o4-o7 have 6 -- confirm intended.
o3 = [1, 0, 0, 0, 1]  #13
o4 = [1, 1, 1, 0, 0, 1]
o5 = [1, 0, 1, 0, 0, 1]
o6 = [0, 0, 1, 1, 0, 1]
o7 = [1, 1, 1, 1, 1, 1]
o10 = [0, 1, 0, 1, 0, 1, 1, 0]

# The pair of observations being compared in this cell.
vector1, vector2 = o1, o2

jaccard = similarity(vector1, vector2, method="Jaccard")
cosine = similarity(vector1, vector2, method="Cosine")
extendedJacc = similarity(vector1, vector2, method="ExtendedJaccard")
correlation = np.corrcoef(vector1, vector2)
smc = similarity(vector1, vector2, method="SMC")

# Only three of the measures are displayed; extendedJacc and correlation
# are computed above but not printed.
print("jac = ", jaccard)
print("smc = ", smc)
print("cos = ", cosine)
'''
           'SMC', 'smc'             : Simple Matching Coefficient
           'Jaccard', 'jac'         : Jaccard coefficient 
           'ExtendedJaccard', 'ext' : The Extended Jaccard coefficient
           'Cosine', 'cos'          : Cosine Similarity
           'Correlation', 'cor'     : Correlation coefficient
'''
from scipy.special import comb

# True class of each data point; positions line up with `clustered` below.
labels = [0, 1, 0, 0, 0, 0, 1, 1]
# Cluster id assigned to each data point. Only the grouping matters, not the
# absolute id values.
clustered = [0, 0, 0, 0, 0, 1, 1, 1]


def rand_index_score(clusters, classes):
    """Rand index between a cluster assignment and a class labelling.

    Counts pairs of data points: a pair is a true positive when both points
    share a cluster and a class, a true negative when they share neither.

    @param clusters sequence of non-negative integer cluster ids
    @param classes sequence of non-negative integer class ids, same order
    @return (tp + tn) / (tp + fp + fn + tn)
    """
    # Pairs that share a cluster (tp + fp) and pairs that share a class (tp + fn).
    same_cluster_pairs = comb(np.bincount(clusters), 2).sum()
    same_class_pairs = comb(np.bincount(classes), 2).sum()

    # Column 0 holds the cluster id, column 1 the class id of each point.
    paired = np.c_[(clusters, classes)]

    # Within every cluster, count the pairs that also agree on class.
    true_pos = 0
    for cluster_id in set(clusters):
        classes_in_cluster = paired[paired[:, 0] == cluster_id, 1]
        true_pos += comb(np.bincount(classes_in_cluster), 2).sum()

    false_pos = same_cluster_pairs - true_pos
    false_neg = same_class_pairs - true_pos
    true_neg = comb(len(paired), 2) - true_pos - false_pos - false_neg

    agreeing = true_pos + true_neg
    all_pairs = true_pos + false_pos + false_neg + true_neg
    return agreeing / all_pairs

# Rand index of the example cluster assignment against the true labels.
rand = rand_index_score(clustered, labels)
print("Rand: ", rand)


#jaccard
import itertools 

def jaccardIndex(labels1, labels2):
    """
    Computes the Jaccard similarity between two sets of clustering labels.

    The index is computed over all pairs of items: n11 / (n11 + n10 + n01),
    where n11 counts pairs placed together by both labellings and n10 / n01
    count pairs placed together by only the first / only the second.

    The value returned is between 0 and 1, inclusively. A value of 1 indicates
    perfect agreement between two clustering algorithms, whereas a value of 0
    indicates no agreement. For details on the Jaccard index, see:
    http://en.wikipedia.org/wiki/Jaccard_index

    If no pair of items is placed together by either labelling (including
    inputs with fewer than two items), both labellings consist of singletons
    only and therefore agree, so 1.0 is returned.

    Example:
    labels1 = [1, 2, 2, 3]
    labels2 = [3, 4, 4, 4]
    print(jaccardIndex(labels1, labels2))  # 0.3333...

    @param labels1 iterable of cluster labels
    @param labels2 iterable of cluster labels
    @return the Jaccard similarity value
    @raise ValueError if the two labellings differ in length
    """
    # Materialize so generators and other one-shot iterables can be indexed.
    labels1 = list(labels1)
    labels2 = list(labels2)
    if len(labels1) != len(labels2):
        raise ValueError(
            "labels1 and labels2 must have the same length, got %d and %d"
            % (len(labels1), len(labels2)))

    n11 = n10 = n01 = 0
    for i, j in itertools.combinations(range(len(labels1)), 2):
        comembership1 = labels1[i] == labels1[j]
        comembership2 = labels2[i] == labels2[j]
        if comembership1 and comembership2:
            n11 += 1
        elif comembership1:
            n10 += 1
        elif comembership2:
            n01 += 1

    co_clustered = n11 + n10 + n01
    if co_clustered == 0:
        # No pair is co-clustered in either labelling: avoid 0/0.
        return 1.0
    return float(n11) / co_clustered



# Pair-counting Jaccard index between the true labels and the cluster assignment.
jaccIndex = jaccardIndex(labels, clustered)
print("Jaccard Index (for clusters): ", jaccIndex)

jac =  [[0.5]]
smc =  [[0.66666667]]
cos =  [[0.66666667]]
Rand:  0.5714285714285714
Jaccard Index (for clusters):  0.3684210526315789


In [3]:
# Second pair of example observations (this rebinds o1/o2 from the earlier
# cell). The "#11" tag is the author's own marker.
o1 = [0, 0, 0, 1, 0, 0, 0, 1]  #11
o2 = [0, 0, 1, 0, 0, 1, 0, 1]

# The pair of observations being compared in this cell.
vector1, vector2 = o1, o2

jaccard = similarity(vector1, vector2, method="Jaccard")
cosine = similarity(vector1, vector2, method="Cosine")
extendedJacc = similarity(vector1, vector2, method="ExtendedJaccard")
correlation = np.corrcoef(vector1, vector2)
smc = similarity(vector1, vector2, method="SMC")

# Only three of the measures are displayed; extendedJacc and correlation
# are computed above but not printed.
print("jac = ", jaccard)
print("smc = ", smc)
print("cos = ", cosine)

jac =  [[0.25]]
smc =  [[0.625]]
cos =  [[0.40824829]]
