In [2]:
import pandas as pd
import numpy as np
import copy as copy
import time as time
import random
from tqdm import tqdm_notebook as tqdm
import matplotlib.pyplot as pt
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity

# Loading Data

In [5]:
def getData(basepath='clustering-data/', fileNames=('animals', 'countries', 'fruits', 'veggies')):
    """Load the category files into one labelled DataFrame.

    Each file is read as a space-separated table without a header row.
    Every row of the i-th file gets the integer class label ``i`` in a
    new ``'label'`` column, and the first column of each file is dropped
    from the result.

    Parameters
    ----------
    basepath : str
        Directory that contains the data files.
    fileNames : sequence of str
        File names to load; the position in this sequence is the label.

    Returns
    -------
    pandas.DataFrame
        All rows of all files, with the first column removed and
        ``'label'`` as the last column. The per-file row index is kept
        as-is (it is not reset here). Empty if ``fileNames`` is empty.
    """
    import os  # local import so this cell does not depend on the import cell

    frames = []
    for label, name in enumerate(fileNames):
        dataTemp = pd.read_csv(os.path.join(basepath, name), sep=" ", header=None)
        dataTemp['label'] = label
        frames.append(dataTemp)

    if not frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame per file.
    xData = pd.concat(frames)
    return xData.iloc[:, 1:]

In [6]:
# Load every category file into a single frame; getData() appends the
# integer class label as the last column.
xData  = getData()
# Show (rows, columns) as a quick sanity check of the load.
xData.shape

(329, 301)

# Data Preprocessing

In [7]:
# Shuffle the rows so the classes are interleaved, then rebuild a clean
# 0..n-1 index. A fixed random_state keeps the shuffle reproducible across
# kernel restarts.
xData = xData.sample(frac=1, random_state=42).reset_index(drop=True)

In [8]:
# Separate the class label (the last column, added in getData) from the features.
# NOTE: the second statement rebinds xData, so re-running this cell strips
# one more column each time; run it exactly once after loading.
yLabel = xData.iloc[:,-1]
xData = xData.iloc[:,:-1]

In [9]:
def normalizeMinMax(xData):
    """Standardise ``xData`` to zero mean and unit standard deviation.

    Despite the name, this is z-score standardisation
    (``(x - mean) / std``), not min-max scaling. For a DataFrame the
    statistics are computed per column.

    Columns whose standard deviation is exactly 0 (constant columns) are
    divided by 1 instead, so they become all zeros rather than NaN.

    A summary of the scaled data is printed via ``describe()``.

    Parameters
    ----------
    xData : pandas.DataFrame or pandas.Series

    Returns
    -------
    Same type as ``xData``, standardised.
    """
    std = xData.std()
    if np.isscalar(std):
        # Series input: std() is a single number.
        std = std if std != 0 else 1.0
    else:
        # DataFrame input: replace zero spreads only; NaN spreads are kept.
        std = std.where(std != 0, 1.0)
    xData = (xData - xData.mean()) / std
    print(xData.describe())
    return xData

In [7]:
def normalizeUnitNorm(xData):
    """Scale ``xData`` with scikit-learn's ``normalize`` using the L2 norm.

    The result of ``normalize`` is returned unchanged.
    """
    unitScaled = normalize(xData, norm='l2')
    return unitScaled

In [166]:
# Rescale the features with the L2 norm (see normalizeUnitNorm).
# NOTE(review): sklearn's normalize() is documented to return an array, not a
# DataFrame, so DataFrame-only helpers such as .describe() presumably no
# longer work on xData after this cell - confirm before relying on them.
xData = normalizeUnitNorm(xData)

# kMeans Methods

In [8]:
# kmeans = KMeans(n_clusters=4,n_init  = 20, random_state=0,n_jobs = 4) #init = 'k-means++'
# kmeans.fit(xData)
# print(kmeans.labels_)
# print(kmeans.n_iter_)
# print(kmeans.inertia_)

In [9]:
# xData = np.array([2, 4, 10, 12, 3, 20, 30, 11, 25])
# xData = xData.reshape(9,1)
# xData

In [10]:
# Make sure xData is a plain NumPy array for the index arithmetic below.
# np.asarray accepts both a DataFrame and an ndarray, whereas .to_numpy()
# raises AttributeError when xData is already an ndarray (which it is after
# the normalisation cell has run).
xData = np.asarray(xData)

In [11]:
def selectCentroid(dataPoint,k):
    """Pick ``k`` rows of ``dataPoint`` at random as initial centroids.

    Rows are drawn without replacement, so the initial centroids are
    distinct rows whenever ``k`` does not exceed the number of rows.
    Duplicate centroids would start the run with clusters that are
    guaranteed to be empty. If ``k`` is larger than the number of rows,
    sampling falls back to drawing with replacement.

    Parameters
    ----------
    dataPoint : numpy.ndarray, 2-D
        One data point per row.
    k : int
        Number of centroids to draw.

    Returns
    -------
    numpy.ndarray
        The ``k`` selected rows.
    """
    nPoints = dataPoint.shape[0]
    randPos = np.random.choice(nPoints, size=k, replace=(k > nPoints))
    return dataPoint[randPos, :]

In [12]:
def EquiledianDistance(centroid,dataPoint):
    """Euclidean distance from every centroid to every data point.

    ``centroid`` is indexed as a 2-D array (one centroid per row). With a
    2-D ``dataPoint`` (one point per row) the result has one row per
    centroid and one column per data point.
    """
    # Broadcast to a (centroid, point, feature) array of differences.
    offsets = centroid[:, None, :] - dataPoint
    squaredLength = (offsets * offsets).sum(axis=2)
    return np.sqrt(squaredLength)

In [13]:
def ManhattenDistance(centroid,dataPoint):
    """Manhattan (L1) distance from every centroid to every data point.

    ``centroid`` is indexed as a 2-D array (one centroid per row). With a
    2-D ``dataPoint`` (one point per row) the result has one row per
    centroid and one column per data point.
    """
    # Broadcast to a (centroid, point, feature) array of differences.
    offsets = centroid[:, None, :] - dataPoint
    return np.abs(offsets).sum(axis=2)

In [14]:
def CosineSimilarity(centroid,dataPoint):
    """Cosine similarity between every centroid and every data point.

    Thin wrapper around scikit-learn's ``cosine_similarity``.

    Note that this is a *similarity*: larger values mean "closer". Code
    that selects the nearest centroid with ``argmin`` must first turn it
    into a distance (for example ``1 - similarity``).

    Parameters
    ----------
    centroid : array-like, one centroid per row
    dataPoint : array-like, one data point per row

    Returns
    -------
    The matrix returned by ``cosine_similarity(centroid, dataPoint)``.
    """
    # The former debug prints dumped the argument types and the whole
    # matrix on every k-means iteration; they have been removed.
    return cosine_similarity(centroid, dataPoint)

In [15]:
def assignLabel(distanceMatrix):
    """Return, for each data point, the closest centroid and its distance.

    ``distanceMatrix`` holds one row per centroid and one column per data
    point. The first returned array is the row index of the smallest
    entry in each column, the second is that smallest entry.
    """
    perPoint = distanceMatrix.T  # one row per data point
    nearestCentroid = np.argmin(perPoint, axis=1)
    nearestDistance = np.min(perPoint, axis=1)
    return nearestCentroid, nearestDistance

In [16]:
def newCentroid(xData,clusterLabel,k,SSEList,distanceType):
    """Recompute the ``k`` centroids from the current assignment.

    A populated cluster's centroid is the per-feature median of its
    members when ``distanceType == 2`` and the per-feature mean
    otherwise.

    A cluster with no members is re-seeded with a randomly chosen member
    of the cluster that has the largest entry in ``SSEList``. If that
    donor cluster has no members either, any data point may be chosen.
    Emptiness is checked directly from ``clusterLabel`` rather than by
    looking for NaN in the mean of an empty slice. This avoids NumPy's
    "Mean of empty slice" warning and does not re-seed a populated
    cluster just because the data contains NaN.

    Parameters
    ----------
    xData : numpy.ndarray, 2-D
        One data point per row.
    clusterLabel : numpy.ndarray of int
        Cluster index assigned to each row of ``xData``.
    k : int
        Number of clusters.
    SSEList : sequence of float
        Per-cluster sum of squared errors, indexed by cluster.
    distanceType : int
        ``2`` selects the median update; any other value the mean.

    Returns
    -------
    numpy.ndarray
        One centroid per row, ``k`` rows.
    """
    centroid = []
    emptyClusters = []
    for i in range(k):
        pos = np.where(clusterLabel == i)[0]
        if len(pos) == 0:
            # Placeholder; filled in once every populated cluster is done.
            centroid.append(None)
            emptyClusters.append(i)
            continue

        filteredData = xData[pos, :]
        if distanceType == 2:
            centroid.append(np.median(filteredData, axis=0))
        else:
            centroid.append(np.mean(filteredData, axis=0))

    for i in emptyClusters:
        donor = np.argmax(SSEList) if len(SSEList) else -1
        pos = np.where(clusterLabel == donor)[0]
        if len(pos) == 0:
            # The highest-SSE cluster is empty too: draw from all points.
            pos = np.arange(xData.shape[0])
        # .tolist() hands random.choice a plain Python sequence.
        centroid[i] = xData[random.choice(pos.tolist()), :]

    return np.array(centroid)

In [17]:
def computeSSE(MinDistance,clusterLabel):
    """Total and per-cluster sum of squared distances.

    Returns a pair: the sum of ``MinDistance ** 2`` over all points, and
    an array with the same sum broken down by the values in
    ``clusterLabel`` (as produced by ``np.bincount``).
    """
    squared = MinDistance ** 2
    perCluster = np.bincount(clusterLabel, weights=squared)
    return np.sum(squared), perCluster

In [18]:
# Not used anywhere
def giveCorrectLabel(k,distanceMatrix,clusterLabel,yLabel):
    """Give every cluster the true label of the point closest to its centroid.

    For each cluster ``c`` in ``0..k-1`` the column index of the smallest
    entry in row ``c`` of ``distanceMatrix`` is looked up (and printed),
    and ``yLabel`` at that index is written to every position whose
    ``clusterLabel`` equals ``c``. Returns the resulting float array,
    which has the same length as ``yLabel``.
    """
    actualLabel = np.zeros(len(yLabel), order='F')
    for cluster in range(k):
        closest = np.argmin(distanceMatrix[cluster, :], axis=0)
        print(closest)
        members = np.where(clusterLabel == cluster)[0]
        actualLabel[members] = yLabel[closest]
    return actualLabel

In [19]:
def nC2(n):
    """Number of unordered pairs among ``n`` items: ``n * (n - 1) / 2``."""
    orderedPairs = n * (n - 1)
    return orderedPairs / 2

In [20]:
def calculateMeasure(clusterLabelCount,k,LabelC):
    """Pair-counting totals for a cluster-by-label contingency table.

    ``clusterLabelCount[i, j]`` is the number of points in cluster ``i``
    that carry label ``j``.

    Returns ``(TPFP, TP, FN)``:

    * ``TPFP`` - pairs of points that share a cluster (summed over rows),
    * ``TP``   - pairs that share both cluster and label (first ``k`` rows),
    * ``FN``   - pairs that share a label but sit in different clusters
      (first ``LabelC`` columns).
    """
    clusterSizes = np.sum(clusterLabelCount, axis=1)
    TPFP = np.sum([nC2(size) for size in clusterSizes])

    TP = 0
    for row in range(k):
        TP += np.sum([nC2(count) for count in clusterLabelCount[row, :]])

    perLabelFN = np.ones(LabelC)
    for label in range(LabelC):
        column = clusterLabelCount[:, label]
        crossPairs = 0
        # Pair each cluster's share of this label with every later cluster's.
        for j, count in enumerate(column):
            crossPairs += count * np.sum(column[j + 1:])
        perLabelFN[label] = crossPairs

    FN = np.sum(perLabelFN)
    return TPFP, TP, FN

In [21]:
def getClusterLabelCount(labels,k,yLabel,clusterLabel):
    """Build the cluster-by-label contingency table.

    Returns a ``(k, labels)`` float array whose entry ``[i, j]`` is the
    number of points with ``clusterLabel == i`` and true label ``j``.
    ``yLabel`` is indexed with the positions of each cluster's members.
    """
    clustercount = np.zeros((k, labels))
    for cluster in range(k):
        members = np.where(clusterLabel == cluster)[0]
        memberLabels = yLabel[members]
        clustercount[cluster, :] = [
            len(np.where(memberLabels == j)[0]) for j in range(labels)
        ]
    return clustercount

In [22]:
def plotGraph(precision,recall,F1Score,k):
    """Plot precision, recall and F1 score against the number of clusters.

    Parameters
    ----------
    precision, recall, F1Score : sequence of float
        One value per cluster count; entry ``i`` belongs to ``i + 1``
        clusters. Each must have length ``k``.
    k : int
        Largest number of clusters that was evaluated.
    """
    # The scores are for 1..k clusters, so the x axis starts at 1, not 0.
    xaxis = list(range(1, k + 1))
    pt.figure(figsize=(9,7))
    pt.grid()
    pt.title("Prescision,Recall and F1Score graph")
    pt.plot(xaxis,precision,marker ='o',label= 'Precision')
    pt.plot(xaxis,recall,marker ='o', label = 'Recall')
    pt.plot(xaxis,F1Score,marker ='o',label = 'F1Score')
    pt.xlabel("k")
    pt.ylabel("values")
    pt.legend()
    pt.show()

In [23]:
def plotSSEScore(SSE):
    """Draw the values in ``SSE`` as a marked line on a new 9x7 figure."""
    pt.figure(figsize=(9, 7))
    pt.plot(SSE, marker='o', label='SSE')
    pt.title("SSE vs iteration Graph")
    pt.xlabel("iteration")
    pt.ylabel("SSE")
    pt.grid()
    pt.legend()
    pt.show()

In [24]:
def kMeans(maxk,maxiteration,distanceType):
    """Run k-means for k = 1..maxk and plot quality metrics per k.

    Works on the notebook-level ``xData`` (2-D NumPy array, one point per
    row) and ``yLabel`` (true integer class of each row, starting at 0).

    For each k the centroids are initialised with ``selectCentroid`` and
    refined for at most ``maxiteration`` rounds, stopping early once the
    centroids no longer change. After the last k, the final SSE per k and
    the pair-counting precision / recall / F1 per k are plotted.

    Parameters
    ----------
    maxk : int
        Largest number of clusters to try (must be >= 1).
    maxiteration : int
        Maximum number of refinement rounds per k (must be >= 1).
    distanceType : int
        ``1`` Euclidean, ``2`` Manhattan (median centroid update), any
        other value cosine distance (``1 - cosine similarity``).

    Raises
    ------
    ValueError
        If ``maxk`` or ``maxiteration`` is smaller than 1.
    """
    if maxk < 1 or maxiteration < 1:
        raise ValueError("maxk and maxiteration must both be at least 1")

    # Labels are consecutive integers from 0, so the count is max + 1.
    numLabels = int(np.max(yLabel)) + 1
    pointIndex = np.arange(xData.shape[0])

    precision = []
    recall = []
    F1Score = []
    SSEAcrossK = []
    for k in range(1, maxk + 1):
        centroid = copy.deepcopy(selectCentroid(xData, k))
        SSE = []
        for i in range(maxiteration):
            # SSE is always reported in Euclidean terms, whatever metric
            # drives the assignment.
            distanceSSE = EquiledianDistance(centroid, xData)
            if distanceType == 1:
                distance = distanceSSE  # reuse instead of recomputing
            elif distanceType == 2:
                distance = ManhattenDistance(centroid, xData)
            else:
                # Similarity grows with closeness; argmin needs a distance.
                distance = 1.0 - CosineSimilarity(centroid, xData)

            clusterLabel = np.argmin(distance, axis=0)
            # Error to the centroid each point was actually assigned to, so
            # the per-cluster SSE matches clusterLabel for every metric.
            MinDistance = distanceSSE[clusterLabel, pointIndex]
            SSEFromMethod, SSEList = computeSSE(MinDistance, clusterLabel)
            SSE.append(SSEFromMethod)

            updated = newCentroid(xData, clusterLabel, k, SSEList, distanceType)
            converged = np.array_equal(updated, centroid)
            centroid = updated
            if converged:
                # Same centroids give the same assignment next round.
                break

        SSEAcrossK.append(SSE[-1])
        ClusterLabelCount = getClusterLabelCount(numLabels, k, yLabel, clusterLabel)
        TPFP, TP, FN = calculateMeasure(ClusterLabelCount, k, numLabels)
        # Guard the ratios: with only singleton clusters there are no pairs.
        precisi = TP / TPFP if TPFP else 0.0
        reca = TP / (TP + FN) if (TP + FN) else 0.0
        precision.append(precisi)
        recall.append(reca)
        if precisi + reca:
            F1Score.append((2 * precisi * reca) / (precisi + reca))
        else:
            F1Score.append(0.0)

    plotSSEScore(SSEAcrossK)
    plotGraph(precision, recall, F1Score, maxk)

In [25]:
def main():
    """Ask for k, the iteration count and the distance type, then run kMeans."""

    def askInt(prompt):
        # Show the prompt on its own line, then read one integer from stdin.
        print(prompt)
        return int(input())

    maxK = askInt('Enter value of k')
    rounds = askInt('Enter value of iteration')
    metric = askInt('Enter 1 for Euclidian distance , 2 for Manhatten distance,3 for cosine distance')
    print("values for K meanse algorith are", maxK, rounds, metric)
    kMeans(maxK, rounds, metric)

In [1]:
# Start the interactive run. The cell that defines main() must have been
# executed in this kernel first, otherwise this raises NameError.
main()

NameError: name 'main' is not defined