In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation, metrics
import sys
sys.path.append('../tools')
import tools
sys.path.append('../Unsupervised Learning/')
from FuzzyKmeans import FuzzyKmeans
%matplotlib inline



# Spectral Clustering

In [2]:
iris = datasets.load_iris()

In [3]:
def split_set(X,portion,y=None):
    X = np.array(X)
    y = np.array(y)
    size = int(X.shape[0]*portion)
    indexlist = np.arange(X.shape[0])
    testinds = np.random.choice(indexlist, size, replace=False)
    traininds = np.array([x for x in range(X.shape[0]) if x not in testinds])  
    if np.all(y == None):
        return X[traininds],X[testinds]
    else:
        return X[traininds],X[testinds],y[traininds],y[testinds]

In [4]:
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = split_set(X,0.1,y)

In [6]:
def l2distance(X,y):
    '''
	   gets euclidian distance between vector X and matrix(or vector) y
    '''
    ones = np.ones(y.shape[0]).reshape(y.shape[0],1)
    X = ones.dot(X)
    dist = (y - X)**2
    dist = np.sqrt(np.sum(dist,axis=1))

    return dist

In [7]:
l2distance(X_train[1].reshape(1,4),X_train)

array([0.53851648, 0.        , 0.3       , 0.33166248, 0.60827625,
       1.09087121, 0.50990195, 0.42426407, 0.17320508, 0.8660254 ,
       0.45825757, 0.14142136, 0.678233  , 1.36014705, 1.62788206,
       1.05356538, 1.17473401, 0.83666003, 0.70710678, 0.76157731,
       0.78102497, 0.55677644, 0.64807407, 0.2236068 , 0.5       ,
       0.59160798, 0.34641016, 0.24494897, 0.678233  , 1.14891253,
       1.34164079, 0.17320508, 0.3       , 0.78740079, 0.17320508,
       0.50990195, 0.45825757, 0.52915026, 0.81853528, 0.54772256,
       0.678233  , 0.98488578, 0.14142136, 0.84852814, 0.36055513,
       0.81240384, 4.09633983, 3.68646172, 4.23674403, 2.96984848,
       3.86005181, 2.14709106, 3.78813938, 2.80535203, 3.24499615,
       3.04138127, 3.71214224, 3.7       , 3.43365694, 2.97153159,
       3.69188299, 2.79284801, 3.89358447, 3.07408523, 4.01870626,
       3.65650106, 3.44673759, 3.65102725, 4.08044115, 4.29534632,
       3.5383612 , 2.41867732, 2.7       , 2.57875939, 2.85482

In [13]:
f = FuzzyKmeans(X)

In [15]:
f.predict(2,argmax=True)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [23]:
def generate_similarity_matrix(X,sim_type='l2'):
    
    n = X.shape[0]
    p = X.shape[1]
    
    newmatrix = np.zeros((n,n))
    if sim_type == 'l2':
        for i in range(n):
            newmatrix[i] = (l2distance(X[i].reshape(1,p),X)).reshape(1,n)
        return newmatrix
    else:
        print("Please provide a distance measure")    

In [10]:
generate_similarity_matrix(X_train,'l2')

array([[0.        , 0.53851648, 0.50990195, ..., 4.45982062, 4.65080638,
        4.14004831],
       [0.53851648, 0.        , 0.3       , ..., 4.49888875, 4.71805044,
        4.15331193],
       [0.50990195, 0.3       , 0.        , ..., 4.66154481, 4.84871117,
        4.29883705],
       ...,
       [4.45982062, 4.49888875, 4.66154481, ..., 0.        , 0.6164414 ,
        0.64031242],
       [4.65080638, 4.71805044, 4.84871117, ..., 0.6164414 , 0.        ,
        0.76811457],
       [4.14004831, 4.15331193, 4.29883705, ..., 0.64031242, 0.76811457,
        0.        ]])

In [84]:
class SpectralClustering():
    
    def __init__(self,X):
        
        self.X = X
        
    def generate_normalized_laplacian(self,distance_measurement='l2'):
        
        X = self.X
        similarity = generate_similarity_matrix(X)
        distances_inv = np.diag(1/np.sum(similarity,axis=1))
        laplacian = np.identity(X.shape[0]) - distances_inv.dot(similarity)
        return laplacian
    
    def predict(self,k,distance_measure='l2',clustering_algo=FuzzyKmeans):
        
        laplacian = self.generate_normalized_laplacian(distance_measure)
        eigenvalues, eigenvectors = np.linalg.eig(laplacian)
        indices = eigenvalues.argsort()[:k]
        eigenvectors = eigenvectors[:,indices]
        clustering = clustering_algo(eigenvectors)
        return clustering.predict(k,argmax=True)
    

In [82]:
sp = SpectralClustering(X)

In [83]:
sp.predict(3)

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2])