<a href="https://colab.research.google.com/github/shashankdubey78/CAP5610_ML/blob/main/K_Means.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import time
import random
from multiprocessing import Pool
import numpy as np
import pandas as pd
import math
from math import sqrt

import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
def calculate_euclidean_dist(x, y):
    """Return the Euclidean (L2) distance between two vectors.

    Both inputs are converted to float64 before subtracting and squaring.
    Without the conversion, two narrow integer arrays (e.g. int16) are
    squared in their own dtype, which wraps around silently and can even
    produce a negative sum (and therefore nan after the square root).

    Parameters
    ----------
    x, y : array-like of numbers with broadcast-compatible shapes.

    Returns
    -------
    numpy.float64
        sqrt(sum((x - y) ** 2)).
    """
    diff = np.subtract(np.asarray(x, dtype=np.float64),
                       np.asarray(y, dtype=np.float64))
    return np.sqrt(np.sum(np.square(diff)))

def calculate_cosine_sim(x, y):
    """Return the cosine *distance* (1 - cosine similarity) of two vectors.

    Despite its name, the value returned is a dissimilarity: 0 for vectors
    pointing in the same direction, 1 for orthogonal vectors.

    Inputs are converted to float64 first. ``np.dot`` on two narrow integer
    arrays (e.g. int16) accumulates in that dtype and wraps around, while
    ``np.linalg.norm`` works in floating point, so the ratio was wrong for
    integer inputs with large components.

    Parameters
    ----------
    x, y : 1-D array-like of numbers with the same length.

    Returns
    -------
    numpy.float64
        1 - dot(x, y) / (||x|| * ||y||). If either vector has zero norm the
        division is 0/0 and the result is nan (NumPy emits a RuntimeWarning).
    """
    x = np.asarray(x, dtype=np.float64)
    y = np.asarray(y, dtype=np.float64)
    return 1 - (np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y)))

def calculate_jaccard_dist(x, y):
    """Return the generalized Jaccard distance between two vectors.

    Computed as 1 - sum(min(x, y)) / sum(max(x, y)), where min and max are
    taken element-wise.
    """
    shared = np.sum(np.minimum(x, y))
    combined = np.sum(np.maximum(x, y))
    return 1 - np.divide(shared, combined)

def Calculate_SSE(distance, occurences, centroids):
    """Return the sum of squared errors of a clustering.

    Each instance contributes the squared distance to its *nearest*
    centroid only. The previous implementation added the squared distance
    from every instance to every centroid, which is not the within-cluster
    error and does not decrease as the clustering improves.

    Parameters
    ----------
    distance : callable(instance, centroid) -> number
    occurences : iterable of instances
    centroids : sequence of centroids

    Returns
    -------
    number
        Sum over instances of min_c distance(instance, c) ** 2.
        Returns 0 when there are no centroids.
    """
    if len(centroids) == 0:
        return 0

    sseval = 0
    for instance in occurences:
        # Strict "<" mirrors the rule used when assigning instances to
        # clusters, so a nan distance is never picked as the nearest one.
        nearest = float('inf')
        for centroid in centroids:
            dis = distance(instance, centroid)
            if dis < nearest:
                nearest = dis
        sseval += nearest ** 2
    return sseval

In [7]:
def assign(occurences, centroids, distance):
    """Return the index of the centroid closest to a single instance.

    ``occurences`` is one instance here (despite the plural name). Ties go
    to the lowest index, and 0 is returned when no centroid yields a
    distance below infinity (including when ``centroids`` is empty).
    """
    best_index = 0
    best_distance = float('Inf')
    for index in range(len(centroids)):
        candidate = distance(occurences, centroids[index])
        if candidate < best_distance:
            best_index, best_distance = index, candidate
    return best_index


def createEmptyListLists(numLists):
    """Return a list holding ``numLists`` distinct empty lists."""
    return [[] for _ in range(numLists)]


def assignAllValues(occurences, centroids, dist):
    """Assign every instance to its nearest centroid.

    Parameters
    ----------
    occurences : sequence of instances
    centroids : sequence of centroids
    dist : callable(instance, centroid) -> number

    Returns
    -------
    (clusters, assignments)
        ``clusters`` is a list with one list of instances per centroid.
        ``assignments`` is an integer array where ``assignments[i]`` is the
        centroid index chosen for ``occurences[i]``.
    """
    clusters = createEmptyListLists(len(centroids))
    # np.intp instead of np.int8: an int8 buffer cannot represent cluster
    # indices above 127, so larger k would overflow.
    assignments = np.empty(len(occurences), dtype=np.intp)
    for i, occurence in enumerate(occurences):
        clusterIndex = assign(occurence, centroids, dist)
        clusters[clusterIndex].append(occurence)
        assignments[i] = clusterIndex
    return (clusters, assignments)


def calculateCentroids(clusters):
    """Return the mean of each cluster as a plain Python list.

    The result has one entry per cluster, in the same order, computed with
    ``np.mean(..., axis=0)`` and converted via ``tolist()``.
    """
    return [np.mean(members, axis=0).tolist() for members in clusters]


def calculate_kmeans(instances, k=10, distance=calculate_cosine_sim, stopCondition='Max_Preset', iteration_limit=500, initCentroids=None, seed=None):
    """Cluster ``instances`` with k-means using a pluggable distance function.

    Parameters
    ----------
    instances : array-like
        Data to cluster; initial centroids are drawn from it with
        ``Generator.choice`` when ``initCentroids`` is not usable.
    k : int
        Number of clusters to create when centroids are drawn at random.
    distance : callable(instance, centroid) -> number
        Dissimilarity used for assignment and for the SSE.
    stopCondition : str
        Case-insensitive; one of 'Centroids_Stable', 'SSE_Increased',
        'Max_Preset' or 'Any'. 'Any' stops as soon as either the centroids
        stop changing or the SSE fails to decrease.
    iteration_limit : int
        Hard cap on iterations; it applies to every stop condition.
    initCentroids : sequence or None
        Starting centroids. Ignored (random ones are drawn instead) when it
        is None or has fewer than ``k`` entries.
    seed : int or None
        Seed for the random centroid draw; None keeps it non-deterministic.

    Returns
    -------
    dict with keys '#iterations', 'timeTaken', 'SSE_', 'assignments' and
    'stoppingReason' ('Centroids_Stable', 'SSE_Increased' or 'Max_Preset').

    Raises
    ------
    ValueError
        If ``stopCondition`` is not one of the accepted names.
    """
    VALID_STOP_CONDITIONS = ['CENTROIDS_STABLE', 'SSE_INCREASED', 'MAX_PRESET', 'ANY']
    stopCondition = stopCondition.upper()
    if stopCondition not in VALID_STOP_CONDITIONS:
        raise ValueError("Invalid stopCondition: %s" % stopCondition)

    # The condition was upper-cased above, so it must be compared against
    # upper-case names; mixed-case comparisons could never match.
    check_centroids = stopCondition in ('CENTROIDS_STABLE', 'ANY')
    check_sse = stopCondition in ('SSE_INCREASED', 'ANY')

    start_time = time.time()

    if (initCentroids is None or len(initCentroids) < k):
        rng = np.random.default_rng(seed)
        centroids = rng.choice(
            instances, k, replace=False, shuffle=False).tolist()
    else:
        centroids = initCentroids

    iterations = 0
    prevSSE = currentSSE = float('inf')
    stopReason = None
    while stopReason is None:
        iterations += 1
        (clusters, assignments) = assignAllValues(instances, centroids, distance)
        prevCentroids = centroids
        centroids = calculateCentroids(clusters)
        # A cluster that received no instances has no mean; keep its previous
        # centroid so a nan centroid cannot block convergence forever.
        centroids = [
            prevCentroids[idx] if len(clusters[idx]) == 0 else centroids[idx]
            for idx in range(len(centroids))
        ]

        # array_equal also works when the starting centroids are an ndarray,
        # where a plain "!=" would yield an array instead of a bool.
        if check_centroids and np.array_equal(
                np.asarray(centroids, dtype=float),
                np.asarray(prevCentroids, dtype=float)):
            stopReason = 'Centroids_Stable'

        if check_sse and stopReason is None:
            prevSSE = currentSSE
            currentSSE = Calculate_SSE(distance, instances, centroids)
            # Stop when the SSE did not strictly decrease (nan also stops).
            if not (currentSSE < prevSSE):
                stopReason = 'SSE_Increased'

        if stopReason is None and iterations >= iteration_limit:
            stopReason = 'Max_Preset'

    endtime = time.time()

    result = {
        '#iterations': iterations,
        'timeTaken': endtime - start_time,
        'SSE_': Calculate_SSE(distance, instances, centroids),
        'assignments': assignments,
        'stoppingReason': stopReason,
    }
    return result


In [8]:
# Load the feature matrix and ground-truth labels, drop uninformative
# columns, and prepare a small reproducible sample for quick experiments.
DATA_PATH = './data.csv'
LABEL_PATH = './label.csv'
SAMPLE_SIZE = 500
SEED = 42  # fixed so the sampled subset is the same on every run

DATA = pd.read_csv(DATA_PATH, header=None)
LABEL = pd.read_csv(LABEL_PATH, names=['truth'])

# Remove columns whose maximum is 0 (built as a new frame, not in place).
DATA = DATA.loc[:, DATA.max() != 0]

DATA_SMALL = DATA.sample(SAMPLE_SIZE, random_state=SEED)
# .loc because DATA_SMALL.index holds index labels, not positions.
LABEL_SMALL = LABEL.loc[DATA_SMALL.index]

DATA = DATA.to_numpy(dtype=np.int16)
DATA_SMALL = DATA_SMALL.to_numpy(dtype=np.int16)
# Count classes on the full label set: the benchmarks cluster the full DATA,
# and a random sample is not guaranteed to contain every class.
CATEGORIES = LABEL['truth'].nunique()

np.shape(DATA), np.shape(LABEL), np.shape(DATA_SMALL), np.shape(LABEL_SMALL)

((1159, 604), (10000, 1), (500, 604), (500, 1))

In [11]:
def runAllTests(argsObj, groundTruth):
    """Run one k-means benchmark per entry of ``argsObj`` in parallel.

    Parameters
    ----------
    argsObj : dict
        Maps a run name to the positional arguments for ``calculate_kmeans``.
    groundTruth : pandas.DataFrame
        True labels, passed through to ``calculate_score``.

    Returns
    -------
    pandas.DataFrame
        One row per run name with the fields returned by
        ``calculate_kmeans`` (minus 'assignments') plus an 'accuracy' column.
    """
    # The context manager releases the worker processes even if a run fails;
    # a bare Pool() was never closed.
    with Pool() as pool:
        results = pool.starmap(calculate_kmeans, argsObj.values())

    # starmap preserves input order, so results line up with the dict keys.
    resultsOb = dict(zip(argsObj, results))
    benchmarkValue = pd.DataFrame(resultsOb).transpose()

    scores = pd.Series(
        {key: calculate_score(benchmarkValue.assignments[key], groundTruth)
         for key in argsObj},
        dtype=float,
        name="accuracy",
    )

    benchmarkValue = pd.concat([benchmarkValue, scores], axis=1).drop('assignments', axis=1)
    return benchmarkValue

def calculate_score(assignments, groundTruth):
    """Return the accuracy of a clustering against ground-truth labels.

    Clusters are visited in ascending id order. Each one greedily takes the
    still-unused label that is most frequent among its members, and a label
    is used by at most one cluster. Accuracy is the fraction of instances
    whose cluster label equals their true label.

    The cluster ids are read from ``assignments`` itself rather than from a
    module-level constant, so the function works for any number of clusters.
    Clusters left over once every label has been used receive no label and
    all their instances count as wrong (this used to raise).

    Parameters
    ----------
    assignments : array-like of int
        Cluster id per instance.
    groundTruth : pandas.DataFrame
        Must contain a 'truth' column with one label per instance.

    Returns
    -------
    float
        Accuracy in [0, 1]; nan when there are no instances.

    Raises
    ------
    ValueError
        If ``assignments`` and the labels differ in length.
    """
    clusterIds = np.asarray(assignments)
    labels = groundTruth['truth'].to_numpy()
    if len(clusterIds) != len(labels):
        raise ValueError(
            "Length of assignments (%d) does not match ground truth (%d)"
            % (len(clusterIds), len(labels)))
    if len(labels) == 0:
        return float('nan')

    # Contingency table built once: rows are true labels, columns are cluster
    # ids, values are member counts (instead of concatenating in a loop).
    counts = pd.crosstab(labels, clusterIds)

    clusteringLabels = {}
    for cluster in counts.columns:
        if counts.empty:
            # Every label is already taken; remaining clusters stay unlabeled.
            break
        assignedLabel = counts[cluster].idxmax()
        clusteringLabels[cluster] = assignedLabel
        counts = counts.drop(index=assignedLabel)

    # Unlabeled clusters map to NaN, which never equals a true label.
    predicted = pd.Series(clusterIds).map(clusteringLabels).to_numpy()
    return float(np.mean(predicted == labels))

In [None]:
# Benchmark 1: compare the three distance measures, stopping each run once
# the centroids no longer change.
args1 = {
    label: [DATA, CATEGORIES, metric, 'Centroids_Stable']
    for label, metric in (
        ('Euclidean Dist.', calculate_euclidean_dist),
        ('Cosine Dist.', calculate_cosine_sim),
        ('Jaccard Dist.', calculate_jaccard_dist),
    )
}

benchmarks1 = runAllTests(args1, LABEL)

In [None]:
# Render the benchmark table with a per-column heat-map gradient.
benchmarks1.style.background_gradient(axis=0, cmap ='gist_heat_r')

In [None]:
# Benchmark 3: same three distance measures with the 'any' stop condition
# and an iteration cap of 100.
args3 = {
    label: [DATA, CATEGORIES, metric, 'any', 100]
    for label, metric in (
        ('Euclidean Dist.', calculate_euclidean_dist),
        ('Cosine Dist.', calculate_cosine_sim),
        ('Jaccard Dist.', calculate_jaccard_dist),
    )
}

benchmarks3 = runAllTests(args3, LABEL)

In [None]:
# Render the benchmark table with a per-column heat-map gradient.
benchmarks3.style.background_gradient(axis=0, cmap ='gist_heat_r')

In [None]:
# Benchmark 4: every distance measure under each individual stop condition.
# The stop-condition strings must be names calculate_kmeans accepts
# ('Centroids_Stable', 'SSE_Increased', 'Max_Preset', 'Any'); the earlier
# shorthand 'centroids' / 'SSE' / 'limit' is rejected with a ValueError.
LIMIT = 100
args4 = {
    'Euclidean_Centroids': [DATA, CATEGORIES, calculate_euclidean_dist, 'Centroids_Stable'],
    'Euclidean_SSE_': [DATA, CATEGORIES, calculate_euclidean_dist, 'SSE_Increased'],
    'Euclidean_Limit': [DATA, CATEGORIES, calculate_euclidean_dist, 'Max_Preset', LIMIT],
    'Cosine_Centroids': [DATA, CATEGORIES, calculate_cosine_sim, 'Centroids_Stable'],
    'Cosine_SSE': [DATA, CATEGORIES, calculate_cosine_sim, 'SSE_Increased'],
    'Cosine_Limit': [DATA, CATEGORIES, calculate_cosine_sim, 'Max_Preset', LIMIT],
    'Jaccard_Centroids': [DATA, CATEGORIES, calculate_jaccard_dist, 'Centroids_Stable'],
    'Jaccard_SSE': [DATA, CATEGORIES, calculate_jaccard_dist, 'SSE_Increased'],
    'Jaccard_Limit': [DATA, CATEGORIES, calculate_jaccard_dist, 'Max_Preset', LIMIT],
}

benchmarks4 = runAllTests(args4, LABEL)

In [None]:
# Render the benchmark table with a per-column heat-map gradient.
benchmarks4.style.background_gradient(axis=0, cmap ='gist_heat_r')