# Importing required libraries

In [41]:
import numpy as np
import pandas as pd
from collections import defaultdict

# Replacing all null values with the median value of the column

In [42]:
df = pd.read_csv('ieeedata.csv')

df['coordinate1'] = df['coordinate1'].replace(0, df['coordinate1'].median())

df['coordinate2'] = df['coordinate2'].replace(0, df['coordinate2'].median())

df['coordinate3'] = df['coordinate3'].replace(0, df['coordinate3'].median())

df['coordinate4'] = df['coordinate4'].replace(0, df['coordinate4'].median())

df['coordinate5'] = df['coordinate5'].replace(0, df['coordinate5'].median())

data = df.to_numpy()

# Choosing number of clusters

In [43]:
K = 4

# Class to assign points to clusters

In [44]:
class Point:
    def __init__(self, data):
        self.data = data
        self.k = np.random.randint(0, K)


points = [Point(i) for i in data]

# Function to create a list of points assigned to each cluster 

In [45]:
def make_k_mapping(points):
    point_dict = defaultdict(list)
    for p in points:
        point_dict[p.k] = point_dict[p.k] + [p.data]
    return point_dict

# Function to calculate mean of each cluster

In [46]:
def calc_k_means(point_dict):
    means = [np.mean(point_dict[k], axis=0) for k in range(K)]
    return means

# Function to reassign points to the nearest cluster mean

In [47]:
def update_k(points, means):
    for p in points:
        dists = [np.linalg.norm(means[k] - p.data) for k in range(K)]
        p.k = np.argmin(dists)

# Final K-Means algorithm

## Avoids bad initialisations by computing the cost function for multiple random initialisations and returning the centroids corresponding to the lowest value of the cost function.

In [48]:
def kmeans(epochs=15, randinit=8):
    dist = []
    listofmeans = []
    for i in range(randinit):
        points = [Point(i) for i in data]
        for e in range(epochs):
            point_dict = make_k_mapping(points)
            means = calc_k_means(point_dict)
            update_k(points, means)

        dist.append([(sum(np.linalg.norm(p.data - means[p.k]) for p in points))])
        listofmeans.append(means)
    return listofmeans[np.argmin(dist)]

# Prints the coordinates of the means

In [49]:
print(kmeans())

[array([1.30072991e+07, 5.62670333e+08, 3.66736379e+06, 3.03316481e+07,
       1.78874391e+07]), array([1.15881681e+07, 2.13043250e+06, 5.23457820e+08, 6.72437342e+06,
       2.23811055e+07]), array([7.95225865e+07, 5.48023626e+07, 1.44102912e+07, 7.14930655e+08,
       3.54625900e+06]), array([30481505.30913209, 10342579.58868895, 10451526.73042852,
       14883977.53470437, 25741647.86409615])]


# Modified Akaiko Information Criterion function to determine the optimal number of clusters

In [50]:
means = kmeans()


def modaic(points):
    sigma = 0.00
    for p in points:
        dist = np.linalg.norm(p.data - means[p.k])
        sigma = sigma + dist

    print(sigma / (5 * K * (10 ** 8)))


modaic(points)

130.88055795065569
