# K Means Clustering 

In [1]:
# importing libraries
import pandas as pd
import numpy as np
from copy import deepcopy
import os

### Setting path locations and other initializations for <font color="red">*Aggregation*</font>

In [2]:
# Work out the data and plot folders relative to the kernel's working directory.

current_dir = os.getcwd()                      # directory the kernel is running in
parent_dir = os.path.dirname(current_dir)      # its parent directory
data_set = f"{parent_dir}/data/"               # data folder; trailing slash kept so a file name can be appended
plots = f"{parent_dir}/plots"                  # plot folder (no trailing slash)

In [3]:
# Load the clustering input table from the data folder and preview the first rows.
csv_path = data_set + 'ClusterDataset.csv'
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,State,Depth,OBC,Public or Group owned,Electric Pump,Own Savings
0,ANDAMAN & NICOBARS,0.013882,0.327586,0.370315,0.477511,0.396552
1,ANDHRA PRADESH,0.095147,0.465189,0.075197,0.40581,0.542277
2,ASSAM,0.609923,0.07907,0.4,0.055814,0.418605
3,BIHAR,0.114603,0.460156,0.174717,0.116361,0.107663
4,CHHATISGARH,0.079264,0.393502,0.026254,0.809141,0.576085


In [4]:
# Keep the State column as row labels and pull the columns at positions 1-5
# out of the frame as a plain NumPy array for clustering.
# NOTE: this rebinds `data_set`, which previously held the data folder path,
# so the CSV-loading cell cannot be re-run after this one without resetting it.
# NOTE(review): assumes positions 1-5 are the numeric feature columns - confirm
# against the CSV layout.
label = data['State']
data_set = data.iloc[:, list(range(1, 6))].values

In [5]:
# m = number of rows (samples), n = number of columns (features) in the array.
m, n = data_set.shape[0], data_set.shape[1]
# Presumably an iteration limit; nothing in the visible cells reads it - confirm.
n_iter = 100

In [6]:
# number of clusters; passed to Kmeans as its K argument
K = 6

In [7]:
def Kmeans(K, data=None, max_iter=100, seed=None):
    """Cluster the rows of a 2-D array with Lloyd's k-means algorithm.

    Parameters
    ----------
    K : int
        Number of clusters; must be at least 1.
    data : array-like of shape (samples, features), optional
        Points to cluster. When omitted, the notebook-level ``data_set``
        array is used, so existing ``Kmeans(K)`` calls keep working.
    max_iter : int, optional
        Upper bound on assignment/update rounds (default 100). Guarantees
        termination even if the centers never settle exactly.
    seed : int, optional
        Seed for the random initial centers. ``None`` draws from NumPy's
        global random state, as the function always did.

    Returns
    -------
    numpy.ndarray
        Integer array holding one cluster index in ``[0, K)`` per input row.

    Raises
    ------
    ValueError
        If ``K`` is smaller than 1, or the data is not a non-empty 2-D array.
    """
    points = np.asarray(data_set if data is None else data, dtype=float)
    if points.ndim != 2 or points.shape[0] == 0:
        raise ValueError("data must be a non-empty 2-D array of shape (samples, features)")
    if K < 1:
        raise ValueError("K must be at least 1")

    n_samples, n_features = points.shape
    # A private RandomState keeps seeded runs reproducible without touching
    # the global generator; the unseeded path keeps using the global one.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # Random initial centers drawn around the data mean and scaled by the
    # per-feature standard deviation, so they fall where the data lives.
    centers = rng.randn(K, n_features) * points.std(axis=0) + points.mean(axis=0)
    clusters = np.zeros(n_samples, dtype=int)

    for _ in range(max_iter):
        # Distance from every point to every center -> shape (samples, K).
        distances = np.linalg.norm(
            points[:, np.newaxis, :] - centers[np.newaxis, :, :], axis=2
        )
        # Assign every point to its closest center.
        clusters = np.argmin(distances, axis=1)

        # Distance of each point to the center it was just assigned to;
        # used below to choose replacement centers for empty clusters.
        own_distance = distances[np.arange(n_samples), clusters]

        updated = centers.copy()
        for i in range(K):
            members = points[clusters == i]
            if len(members) > 0:
                updated[i] = members.mean(axis=0)
            else:
                # An empty cluster has no mean: averaging it would yield NaN,
                # poison every later distance and keep the loop from ever
                # converging. Re-seed it on the point that is currently
                # worst served by its own center instead.
                farthest = np.argmax(own_distance)
                updated[i] = points[farthest]
                own_distance[farthest] = -1.0  # do not hand the same point out twice

        # Converged once an update leaves every center where it was.
        if np.array_equal(updated, centers):
            break
        centers = updated

    return clusters


In [8]:
# Run the clustering and store the per-row assignments in a one-column frame
# named 'Cluster_No.'.
c = Kmeans(K)
Cluster_Final = pd.DataFrame(c, columns=['Cluster_No.'])

In [11]:
# Change the row indexes: replace the default integer labels with the values
# of `label` (the State column of the input table).
# NOTE(review): `label` is wrapped in a one-element list, which pandas
# presumably turns into a single-level MultiIndex rather than a plain Index;
# `Cluster_Final.index = label` may be what was intended - confirm.
Cluster_Final.index = [label]
# Write the labelled assignments to 'Cluster.csv'; the path is relative, so the
# file lands in the current working directory.
Cluster_Final.to_csv('Cluster.csv')