# DSCI-633 Lab #6 (python)
# K-Means Algorithm

## Complete find_closest_centroid( )  and 
## Complete calc_centroids( ).

### Name: Stephen Cook

In [18]:
# Import all needed libraries
import numpy as np
import pandas as pd
import numpy.matlib

In [19]:
# upload iris dataset
df = pd.read_csv('iris.csv')
df.columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'species']

In [20]:
# Use all records in the dataset withoout classes ('species' column) for cluster analysis
cluster_data = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].copy(deep=True)

# Sort the dataset by attributes to select the suboptimal starting centroid
cluster_data.sort_values(by=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], inplace=True)
cluster_array = np.array(cluster_data)

In [21]:
print(cluster_array[0:10])

[[4.3 3.  1.1 0.1]
 [4.4 2.9 1.4 0.2]
 [4.4 3.  1.3 0.2]
 [4.4 3.2 1.3 0.2]
 [4.5 2.3 1.3 0.3]
 [4.6 3.1 1.5 0.2]
 [4.6 3.2 1.4 0.2]
 [4.6 3.4 1.4 0.3]
 [4.6 3.6 1.  0.2]
 [4.7 3.2 1.3 0.2]]


In [22]:
cluster_array.shape[0]

149

In [23]:
# Calculate Euclidean distance between two observations
def distance(X1, X2):
    return (sum((X1 - X2)**2))**0.5

# Assign each point to its closest centroid
def find_closest_centroid(centroids, cluster_array):
    clusters = []
    for i in range(cluster_array.shape[0]):
        distances = []
        for centroid in centroids:
            distances.append(distance(centroid, cluster_array[i]))
            cluster = [z for z, val in enumerate(distances) if  val == min(distances)]
            clusters.append(cluster[0])
#### Complete this #########
    
    return clusters

# Calculate new centroids based on each cluster's mean
def calc_centroids(clusters, cluster_array):
    new_centroids = []
    cluster_df = pd.concat([pd.DataFrame(cluster_array), pd.DataFrame(clusters, columns=['cluster'])], axis=1)
    for c in set(cluster_df['cluster']):
        current_cluster = cluster_df[cluster_df['cluster']==c][cluster_df.columns[:-1]]
        cluster_mean = current_cluster.mean(axis=0)
        new_centroids.append(cluster_mean)
#### Complete this #########

    return new_centroids

# Calculate SSE (Sum of the Squared Error) within each cluster

def calc_SSE(clusters, cluster_array):
    sum_squares = []
    cluster_df = pd.concat([pd.DataFrame(cluster_array),
                            pd.DataFrame(clusters, 
                                         columns=['cluster'])], 
                           axis=1)
    for c in set(cluster_df['cluster']):
        current_cluster = cluster_df[cluster_df['cluster']\
                                     ==c][cluster_df.columns[:-1]]
        cluster_mean = current_cluster.mean(axis=0)
        mean_repmat = np.matlib.repmat(cluster_mean, 
                                       current_cluster.shape[0],1)
        sum_squares.append(np.sum(np.sum((current_cluster - mean_repmat)**2)))
    return sum_squares

In [24]:
k = 4
cluster_vars = []

centroids = [cluster_array[i+2] for i in range(k)]
clusters = find_closest_centroid(centroids, cluster_array)
initial_clusters = clusters
print(initial_clusters)
print(0, round(np.mean(calc_SSE(clusters, cluster_array))))

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 2, 2, 0, 0, 0, 3, 0, 1, 1, 3, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 3, 0, 1, 1, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 2, 2, 0, 0, 2, 3, 0, 0, 0, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 0, 2, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 0, 2, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 0, 0, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 0, 3, 0, 1, 1, 3, 0, 1, 1, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 1, 1, 3, 0, 0, 2, 3, 0, 0, 2, 3, 0, 1, 1, 3, 0, 0, 2, 3, 0, 

In [25]:

for i in range(20):
    centroids = calc_centroids(clusters, cluster_array)
    clusters = find_closest_centroid(centroids, cluster_array)
    cluster_var = np.mean(calc_SSE(clusters, cluster_array))
    cluster_vars.append(cluster_var)
    print(i+1, round(cluster_var))
print(centroids)
print(clusters)

1 166
2 337
3 319
4 333
5 333
6 333
7 333
8 333
9 333
10 333
11 333
12 333
13 333
14 333
15 333
16 333
17 333
18 333
19 333
20 333
[0    5.836054
1    3.056463
2    3.753741
3    1.200000
dtype: float64, 0    6.75
1    2.65
2    5.30
3    1.60
dtype: float64]
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0