In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation, metrics
import sys
%matplotlib inline

# Spectral Clustering

In [18]:
# Load the iris dataset through sklearn's `datasets` module; `iris.data` and
# `iris.target` are read further down to build the train/test split.
iris = datasets.load_iris()

In [19]:
def split_set(X,portion,y=None):
    """
    Randomly split samples (and optionally labels) into train and test parts.

    Parameters
    ----------
    X : array-like
        Samples, indexed along the first axis; converted with ``np.array``.
    portion : float
        Fraction of the samples placed in the test part. The test size is
        ``int(n_samples * portion)``, i.e. truncated towards zero.
    y : array-like, optional
        Labels, split with the same indices as ``X``.

    Returns
    -------
    tuple
        ``(X_train, X_test)`` when ``y`` is None, otherwise
        ``(X_train, X_test, y_train, y_test)``. Training rows keep their
        original order; test rows are in the order they were drawn.

    Notes
    -----
    Sampling uses NumPy's global random state, so call ``np.random.seed``
    beforehand when a reproducible split is needed.
    """
    X = np.array(X)
    n_samples = X.shape[0]
    size = int(n_samples*portion)
    testinds = np.random.choice(np.arange(n_samples), size, replace=False)

    # A boolean mask replaces the former `x not in testinds` list scan: it is
    # linear instead of quadratic, and it still indexes correctly when no
    # training rows remain (an empty list became a float array, which NumPy
    # refuses to use as an index).
    is_train = np.ones(n_samples, dtype=bool)
    is_train[testinds] = False

    if y is None:
        return X[is_train],X[testinds]

    y = np.array(y)
    return X[is_train],X[testinds],y[is_train],y[testinds]

In [20]:
# Fixed seed so the random train/test split is the same on every
# Restart & Run All (split_set draws from NumPy's global random state).
SPLIT_SEED = 0

X = iris.data
y = iris.target

np.random.seed(SPLIT_SEED)
# Hold out 10% of the samples as the test part.
X_train, X_test, y_train, y_test = split_set(X,0.1,y)

In [21]:
class FuzzyKmeans():
    """
    Fuzzy c-means clustering based on Euclidean distance.

    Each sample receives a degree of membership in every cluster instead of a
    single label. Hard labels are obtained by assigning each sample to the
    cluster with the largest membership (``argmax=True`` in ``predict``).
    """
    def __init__(self,X):
        # Stored as a float ndarray so the broadcasting arithmetic below works
        # for any array-like input.
        self.X = np.asarray(X, dtype=float)

    def calculate_centers(self,U,m):
        """
        Compute cluster centres as membership-weighted means of the samples.

        U is the (k, n_samples) membership matrix and m the exponent applied
        to it. Returns a (k, n_features) array whose row j is
        sum_i U[j, i]**m * X[i] / sum_i U[j, i]**m.
        """
        weights = U**m
        return weights.dot(self.X) / weights.sum(axis=1, keepdims=True)

    def calculate_fuzzy(self,centers,m):
        """
        Compute the (k, n_samples) membership matrix for the given centres.

        With d[j, i] the Euclidean distance from centre j to sample i, the
        membership is 1 / sum_l (d[j, i] / d[l, i]) ** (2 / (m - 1)), so each
        column sums to one. A sample lying exactly on one or more centres is
        shared equally among those centres (the plain formula gives NaN).
        """
        exponent = 2/(m-1)
        X = self.X
        # dists[j, i] = distance from centre j to sample i.
        dists = np.linalg.norm(centers[:, np.newaxis, :] - X[np.newaxis, :, :], axis=2)

        # Algebraically the same as the ratio formula in the docstring, but
        # needs only a (k, n) intermediate: d**-e / sum_l d_l**-e.
        with np.errstate(divide="ignore", invalid="ignore"):
            inverse = dists ** (-exponent)
            U = inverse / inverse.sum(axis=0, keepdims=True)

        # Zero distances produce inf/inf = NaN above; resolve them explicitly.
        on_center = dists == 0
        touched = on_center.any(axis=0)
        if touched.any():
            U[:, touched] = on_center[:, touched] / on_center[:, touched].sum(axis=0, keepdims=True)
        return U

    def predict(self,m,k,exit,seed,argmax=False):
        """
        Main clustering function.

        m is the degree of uncertainty (fuzziness of the clusters); must be > 1
        k is the number of clusters
        exit is the exit criterion: iteration stops once the centres move by
            less than this amount (spectral norm of the change) in one update.
            The name shadows the builtin but is kept for existing callers.
        seed seeds NumPy's global random state for the initial memberships
        set argmax = True to get one hard cluster label per sample

        Returns an (n_samples, k) membership array, or an (n_samples,) array
        of cluster indices when argmax is True.

        Raises ValueError if m <= 1, for which the membership exponent
        2 / (m - 1) is undefined or negative.
        """
        if m <= 1:
            raise ValueError("m must be greater than 1, got %r" % (m,))

        np.random.seed(seed)
        # Use the data this instance was built with, not a notebook-level `X`.
        n_samples = self.X.shape[0]
        U = np.random.uniform(0,1,size=(k,n_samples)) #initialize cluster probabilities
        centers = self.calculate_centers(U,m)

        while True:
            U = self.calculate_fuzzy(centers,m)
            updated = self.calculate_centers(U,m)
            shift = np.linalg.norm(updated - centers,2)
            centers = updated
            # Written as `not (>=)` so that a NaN shift also ends the loop.
            if not (shift >= exit):
                break

        if argmax:
            return np.argmax(U,axis=0)
        return U.T

In [24]:
def l2distance(X,y):
    '''
    Euclidean distance between a single vector X and every row of y.

    X : one vector, given either as shape (p,) or as shape (1, p).
    y : matrix of shape (n, p), or a single vector of shape (p,), which is
        treated as a one-row matrix.

    Returns a 1-D array with one distance per row of y.
    Raises ValueError when X does not have as many entries as y has columns.
    '''
    X = np.asarray(X, dtype=float).reshape(1, -1)
    y = np.atleast_2d(np.asarray(y, dtype=float))
    if X.shape[1] != y.shape[1]:
        raise ValueError(
            "X has %d entries but y has %d columns" % (X.shape[1], y.shape[1])
        )

    # Broadcasting subtracts X from every row of y without materialising
    # n copies of it.
    return np.sqrt(np.sum((y - X)**2, axis=1))

In [29]:
# Sanity check of l2distance: distances from training sample 1 to every
# training sample. Entry 1 is the sample's distance to itself, hence 0.
l2distance(X_train[1].reshape(1,4),X_train)

array([0.53851648, 0.        , 0.3       , 0.33166248, 0.60827625,
       1.09087121, 0.50990195, 0.17320508, 0.8660254 , 0.45825757,
       0.14142136, 0.678233  , 1.36014705, 1.62788206, 1.05356538,
       0.54772256, 1.17473401, 0.83666003, 0.70710678, 0.76157731,
       0.78102497, 0.55677644, 0.64807407, 0.2236068 , 0.5       ,
       0.59160798, 0.5       , 0.34641016, 0.24494897, 0.678233  ,
       1.14891253, 1.34164079, 0.17320508, 0.17320508, 0.50990195,
       0.45825757, 0.52915026, 0.81853528, 0.54772256, 0.98488578,
       0.84852814, 0.36055513, 0.81240384, 4.09633983, 4.23674403,
       2.96984848, 3.81182371, 3.39116499, 3.86005181, 2.14709106,
       3.78813938, 2.80535203, 2.46170673, 3.24499615, 3.04138127,
       3.71214224, 2.55929678, 3.7       , 3.43365694, 2.97153159,
       3.69188299, 2.79284801, 3.89358447, 4.01870626, 3.65650106,
       3.65102725, 4.08044115, 4.29534632, 3.5383612 , 2.41867732,
       2.7       , 2.57875939, 2.85482048, 4.11703777, 3.39852

In [31]:
# NOTE(review): this overwrites the first training sample in place with a
# vector of four ones, so every later cell sees modified training data.
# Presumably a probe for the distance-matrix cell below -- confirm, and prefer
# working on a copy if the original row is still needed.
X_train[0] = np.ones(4)

In [48]:
def generate_similarity_matrix(X,sim_type='l2'):
    """
    Build the (n, n) matrix of pairwise measures between the rows of X.

    Parameters
    ----------
    X : array-like, shape (n, p)
        One sample per row.
    sim_type : str, default 'l2'
        Measure to use. Only 'l2' is supported: entry (i, j) is the Euclidean
        distance between rows i and j, so larger values mean *less* similar
        rows and the diagonal is zero.

    Returns
    -------
    numpy.ndarray of shape (n, n)

    Raises
    ------
    ValueError
        If ``sim_type`` is not a supported measure. Previously this case
        printed a message and returned None, which only failed later in the
        caller.

    Notes
    -----
    Uses an (n, n, p) temporary array for the pairwise differences.
    """
    if sim_type != 'l2':
        raise ValueError(
            f"Please provide a distance measure (supported: 'l2'), got {sim_type!r}"
        )

    X = np.asarray(X, dtype=float)
    # diffs[i, j] = X[i] - X[j], formed for all pairs at once by broadcasting.
    diffs = X[:, np.newaxis, :] - X[np.newaxis, :, :]
    return np.sqrt(np.sum(diffs**2, axis=2))

In [51]:
# Pairwise L2 matrix over all training samples; the displayed result is
# symmetric with a zero diagonal.
generate_similarity_matrix(X_train,'l2')

array([[0.        , 4.47325385, 4.38862165, ..., 7.27255113, 7.33825592,
        6.74240313],
       [4.47325385, 0.        , 0.3       , ..., 4.49888875, 4.71805044,
        4.15331193],
       [4.38862165, 0.3       , 0.        , ..., 4.66154481, 4.84871117,
        4.29883705],
       ...,
       [7.27255113, 4.49888875, 4.66154481, ..., 0.        , 0.6164414 ,
        0.64031242],
       [7.33825592, 4.71805044, 4.84871117, ..., 0.6164414 , 0.        ,
        0.76811457],
       [6.74240313, 4.15331193, 4.29883705, ..., 0.64031242, 0.76811457,
        0.        ]])

In [None]:
class SpectralClustering():
    """
    Work-in-progress spectral clustering on the rows of X.

    Only the Laplacian construction exists so far; ``predict`` is a
    placeholder that raises NotImplementedError.
    """

    def __init__(self,X):
        # One sample per row.
        self.X = X

    def generate_normalized_laplacian(self,distance_measurement):
        """
        Build the row-normalized Laplacian  L = I - D^{-1} W.

        W is the (n, n) matrix returned by
        ``generate_similarity_matrix(X, distance_measurement)`` and D is the
        diagonal matrix of its row sums.

        distance_measurement : str
            Passed through as the measure name (e.g. 'l2').

        Returns the (n, n) Laplacian. Rows of W that sum to zero produce
        inf/NaN entries.

        Raises ValueError if no matrix could be built for the requested
        measure.
        """
        X = self.X
        similarity = generate_similarity_matrix(X, distance_measurement)
        if similarity is None:
            raise ValueError(
                "unsupported distance measure: %r" % (distance_measurement,)
            )

        # NOTE(review): with 'l2' W holds distances (larger = further apart),
        # whereas a graph Laplacian is normally built from affinities
        # (larger = more similar) -- confirm whether a kernel should be
        # applied first.
        n = similarity.shape[0]
        row_sums = np.sum(similarity, axis=1)
        # Dividing each row by its sum is D^{-1} W without forming D.
        laplacian = np.identity(n) - similarity / row_sums[:, np.newaxis]
        return laplacian

    def predict(self):
        """Cluster the samples. TODO: not implemented yet."""
        raise NotImplementedError("SpectralClustering.predict is not implemented yet")
    
    

In [55]:
# Scratch check: np.identity(3) is the 3x3 identity matrix, the same call
# SpectralClustering uses for the identity term of the Laplacian.
np.identity(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])