Iris Classification through KMeans Clustering 

In [0]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
import pandas as pd
from copy import deepcopy

iris = datasets.load_iris()

samples = iris.data
target = iris.target
print(iris.feature_names)

x = samples[:,0]
y = samples[:,1]

sepal_length_width = np.array(list(zip(x, y)))

k = 3

centroids_x = np.random.uniform(min(x), max(x), size=k)
centroids_y = np.random.uniform(min(y), max(y), size=k)

centroids = np.array(list(zip(centroids_x, centroids_y)))

def distance(a, b):
  one = (a[0] - b[0]) ** 2
  two = (a[1] - b[1]) ** 2
  distance = (one + two) ** 0.5
  return distance

centroids_old = np.zeros(centroids.shape)

# Cluster labeles (either 0, 1, or 2)
labels = np.zeros(len(samples))

distances = np.zeros(3)

# Initialize error:
error = np.zeros(3)
for i in range(3):
  error[i] =  distance( centroids[i], centroids_old[i])

# Repeat Steps until convergence:
while error.all() != 0:
  for i in range(len(samples)):
    distances[0] = distance(sepal_length_width[i], centroids[0])
    distances[1] = distance(sepal_length_width[i], centroids[1])
    distances[2] = distance(sepal_length_width[i], centroids[2])
    cluster = np.argmin(distances)
    labels[i] = cluster

  centroids_old = deepcopy(centroids) #updating centroids

  for i in range(3):
    points = [sepal_length_width[j] for j in range(len(sepal_length_width)) if labels[j] == i]
    centroids[i] = np.mean(points, axis=0)
  for i in range(3):
    error[i] =  distance( centroids[i], centroids_old[i])

colors = ['y', 'g', 'b']
  
for i in range(3):
  points = np.array([sepal_length_width[j] for j in range(len(samples)) if labels[j] == i])
  plt.scatter(points[:, 0], points[:, 1], c=colors[i], alpha=0.5)

plt.scatter(centroids[:, 0], centroids[:, 1], marker='D', s=50, color = "red")

plt.xlabel('sepal length (cm)')
plt.ylabel('sepal width (cm)')
plt.show()


  

Classification of each data in dataset

In [0]:
species = np.chararray(target.shape, itemsize=150)

for i in range(len(samples)):
  if target[i] == 0:
    species[i] = 'setosa'
  elif target[i] == 1:
    species[i] = 'versicolor'
  elif target[i] == 2: 
    species[i] = 'virginica'

df = pd.DataFrame({'labels' : labels, 'species':species})
print(df) 

Cross Tabulation 

In [18]:
ct = pd.crosstab(df['labels'], df['species'])
print(ct)

species  b'setosa'  b'versicolor'  b'virginica'
labels                                         
0.0              0              8            23
1.0              0             36            26
2.0             50              6             1


Visualising Inertia

In [0]:
num_clusters = list(range(1,9))
inertias =[]
for k in num_clusters:
  model = KMeans(n_clusters=k)
  model.fit(samples)
  inertias.append(model.inertia_)

plt.plot(num_clusters, inertias, '-o')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.show()