# K-Means Clustering

K-Means clustring of dataset and displaying the classified resutls. Developed by Pratham Shah, MANAS AI Taskphase 6

### Imports

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA

### Establishing the dataset

In [2]:
orignal = pd.read_csv("Clustering_Data.csv")
data = orignal.copy()

In [3]:
titles = list(data)
data.head()

Unnamed: 0,Alcohol,Malic_Acid,Ash,Ash_Alcanity,Magnesium,Total_Phenols,Flavanoids,Nonflavanoid_Phenols,Proanthocyanins,Color_Intensity,Hue,OD280,Proline,Customer_Segment
0,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065,1
1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050,1
2,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185,1
3,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480,1
4,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735,1


### Defining the functions for clustering

In [4]:
def z(X):
    #Z score normalisation algorithm. Returns z score normalised dataset, means and std. deviations of the features.
    # find the mean of each n columns
    means     = np.mean(X, axis=0)                 # mean will have length n
    # find the standard deviation of each n columns
    stdevs  = np.std(X, axis=0)+0.00001                 # sigma will have length n. +0.00001 to prevent divide by 0 error.
    X_norm = (X - means) / stdevs    

    return X_norm, means, stdevs

data, means, stddevs = z(data)

def return_closest_centroid (point, centroids): #returns the centroid that is closest to the given datapoint and sum of euclidean distances
    distances = []
    for index, item in enumerate(centroids):
        distances.append(np.linalg.norm(point - item)) #distances of point from various centroids
    return (centroids[distances.index(min(distances))], sum(distances)) #returns nearest centroid and sum of euclidean distances

def map_points(centroids, points): #returns {point, centroid that is closest to that point}
    mapping = []
    for index, point in enumerate(points):
        mapping.append({"point" : point, "centroid" : return_closest_centroid(point, centroids)[0]}) #{point: that point, centroid: the closest centroid}
    return mapping

def get_means_from_clusters(pointmap, corrcentroids):
    means = []
    for centroid in corrcentroids:
        means.append(np.mean(np.array([point["point"] for point in pointmap if (point["centroid"] == centroid).all()]), axis=0))
    return means #list of mean points for all items in particular clusters

def graphing (points, centroids, title):
    tindex = titles.index(title)
    mean = list(means)[tindex]
    stddev = list(stddevs)[tindex]
    for centroid in centroids:
        #values should contain (point, index of point)
        values = [(point["point"][tindex], index) for index, point in enumerate(points) if (point['centroid'] == centroid).all()]   
        y_val = [stddev * x[0] + mean for x in values]
        x_val = [x[1] for x in values]
        principal=PCA(n_components=3)
        principal.fit(Scaled_data)
        print(principal.components_)
    
def calculate_cluster_variance(data):
    # Extract points and centroids from the input data
    points = np.array([d['point'] for d in data])
    centroids = np.array([d['centroid'] for d in data])

    # Find unique centroids and their indices
    unique_centroids, indices = np.unique(centroids, axis=0, return_inverse=True)

    # Calculate variances for each cluster
    variances = []
    for i in range(len(unique_centroids)):
        cluster_points = points[indices == i]
        if len(cluster_points) > 0:
            centroid = unique_centroids[i]
            squared_distances = np.sum((cluster_points - centroid) ** 2, axis=1)
            variance = np.mean(squared_distances)
            variances.append(variance)
        else:
            variances.append(0)  # Handle empty clusters

    return variances

distances = []
def kmeans (points, init_centres, max_iterations):
    global distances
    pointmap = []
    centres = init_centres
    for i in range(0, max_iterations):
        pointmap = map_points(centres, points) # puts stuff into the pointmap
        centres = get_means_from_clusters(pointmap, centres)
        
        sumofpoints = 0
        for point in points:
            sumofpoints += return_closest_centroid(point, centres)[1]
        print(f"iter. {i} : sum of distance of all (normalised) points from their nearest centroid: {sumofpoints}")
        distances.append(sumofpoints)
        try:
            if(distances[-1] == distances[-2]):
                print("\n*** The model has reached maximum efficiency. There is no need to run more iterations now. ***\n")
                break
        except:
            continue
    pointmap = map_points(centres, points)
    print("CENTRES OF CLUSTERS:")
    print(centres)
    print(f"\n\nSum of variances with {len(init_centres)} clusters: {sum(calculate_cluster_variance(pointmap))}")
    #at this point, centres is storing the means and pointmap is storing {point, closest mean to that point}
    for title in titles:
        graphing(pointmap, centres, title)

### Running the model

Variations with K = 4, 3 and 2:\
29.061930439703755, 21.340397703895988, 19.331384313865637\
It is evident that K = 3 is the elbow point and is thus the ideal number of clusters.

In [5]:
kmeans(data.values, data.values[0:3], 20)

iter. 0 : sum of distance of all (normalised) points from their nearest centroid: 2163.6316008474796
iter. 1 : sum of distance of all (normalised) points from their nearest centroid: 2212.9454884260517
iter. 2 : sum of distance of all (normalised) points from their nearest centroid: 2277.481376912945
iter. 3 : sum of distance of all (normalised) points from their nearest centroid: 2307.3807331048392
iter. 4 : sum of distance of all (normalised) points from their nearest centroid: 2307.3807331048392

*** The model has reached maximum efficiency. There is no need to run more iterations now. ***

CENTRES OF CLUSTERS:
[array([ 0.83522177, -0.30380696,  0.36469271, -0.61018946,  0.57758639,
        0.88522318,  0.97780974, -0.56204437,  0.58027641,  0.17106274,
        0.47396286,  0.7792361 ,  1.12518525, -1.15132049]), array([-9.09317170e-01, -3.80704058e-01, -5.19728428e-01,  1.37766594e-01,
       -4.97451102e-01, -9.61988117e-02, -7.65215989e-04, -4.11437910e-03,
        1.12386456e-02

NameError: name 'Scaled_data' is not defined