In [1]:
import pandas as pd
import numpy as np

# Load Dataset

In [2]:
# The dataset is open and available at url : https://archive.ics.uci.edu/ml/datasets/seeds
seed_data=pd.read_csv("..//data//seed_data.csv")

In [3]:
# we make a copy of of seed data without seed class to train our unsupervised algorithm to find pattern
data=seed_data[['area', 'perimeter', 'compactness', 'length_of_kernel',
       'width_of_kernel', 'asymmetry_coefficient', 'length_of_kernel_groove']].as_matrix().copy()

# Initialize Varriables

In [4]:
# number of cluster to be formed
k = 3 
# the maximum number of time centers of cluster will be adjusted so that each point is closest to designted cluster
iterations_count = 10 

In [5]:
# Shuffle the dataset and select k number of initial centers of cluster
np.random.shuffle(data)
cluster_centroids = {}
cluster_centroids[0] = data[0]
cluster_centroids[1] = data[1]
cluster_centroids[2] = data[2]

In [6]:
prev_cluster_centroids = cluster_centroids.copy()

In [7]:
clusters = np.zeros(len(data))

# Evaluate Cluster Centroids

In [8]:
#iterate through each observtion set to identify which cluster it belongs to with current cluster centroids
for i in range(0,len(data)):
    distances = [np.linalg.norm(data[i] - cluster_centroids[centroid]) for centroid in cluster_centroids]
    clusters[i] = np.argmin(distances)

In [9]:
# evaluate new cluster center by taking average of all data points belonging to the same cluster
for p in clusters:
    cluster_centroids[p] = np.mean([data[q] for q in range(0,len(data)) if clusters[q] == p],axis=0)

In [10]:
# calculate the average difference between previous cluster centers and changed clsuter centers
avg_change = np.mean([np.linalg.norm(prev_cluster_centroids[centroid] - cluster_centroids[centroid]) 
                      for centroid in cluster_centroids])
print("Iteration : ",i," ; Average Change in Centroid Distance :",avg_change)
prev_cluster_centroids = cluster_centroids.copy()

Iteration :  209  ; Average Change in Centroid Distance : 1.90978636479


In [11]:
#carry out rest of iteration to improve cluster centers that is iteration_count-1
for i in range(1,iterations_count):
    for m in range(0,len(data)):
        distances = [np.linalg.norm(data[m] - cluster_centroids[centroid]) for centroid in cluster_centroids]
        clusters[m] = np.argmin(distances)
        
    for p in clusters:
        cluster_centroids[p] = np.mean([data[q] for q in range(0,len(data)) if clusters[q] == p],axis=0)
        

    avg_change = np.mean([np.linalg.norm(prev_cluster_centroids[centroid] - cluster_centroids[centroid]) 
                          for centroid in cluster_centroids])
    print("Iteration : ",i," ; Average Change in Centroid Distance :",avg_change)
    prev_cluster_centroids = cluster_centroids.copy()

Iteration :  1  ; Average Change in Centroid Distance : 0.665691422955
Iteration :  2  ; Average Change in Centroid Distance : 0.304211049869
Iteration :  3  ; Average Change in Centroid Distance : 0.136301381622
Iteration :  4  ; Average Change in Centroid Distance : 0.0264206109328
Iteration :  5  ; Average Change in Centroid Distance : 0.0
Iteration :  6  ; Average Change in Centroid Distance : 0.0
Iteration :  7  ; Average Change in Centroid Distance : 0.0
Iteration :  8  ; Average Change in Centroid Distance : 0.0
Iteration :  9  ; Average Change in Centroid Distance : 0.0


# Predict Cluster label for Seed Data

In [12]:
def pred(data):
    distances = [np.linalg.norm(data - cluster_centroids[centroid]) for centroid in cluster_centroids]
    cluster_val = np.argmin(distances)
    return cluster_val

In [14]:
# Assign clusters to observation in seeds data
seed_data['cluster']=seed_data.apply(lambda x : pred(x[['area', 'perimeter', 'compactness', 'length_of_kernel'
                                                        ,'width_of_kernel', 'asymmetry_coefficient', 
                                                        'length_of_kernel_groove']]),axis=1)

seed_data['index']=seed_data.index

# Compare Cluster Labels with Seed Class

In [15]:
#Evaluate cluster label with seed classes
pd.pivot_table(seed_data,index=['seed_class','cluster'],values=['index'],aggfunc=len).unstack()

Unnamed: 0_level_0,index,index,index
cluster,0,1,2
seed_class,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
1,1.0,57.0,12.0
2,60.0,10.0,
3,,,70.0
