## **Question : Use a k-means algorithm to cluster the Iris dataset into 11 clusters. Then manually label five representatives for each cluster (ones closest to cluster center). Thereafter, use an MLP to classify the IRIS data using the manually labeled samples.**

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

In [None]:
# Fetch the raw Iris file from the UCI repository. It is read with header=None,
# so the five columns are named explicitly afterwards.
IRIS_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"
irisDataSet = pd.read_csv(IRIS_URL, header=None)
irisDataSet.columns = ['x1', 'x2', 'x3', 'x4', 'y']

# The first four columns are the features used for clustering; 'y' is left out.
irisData = irisDataSet.iloc[:, :4]

irisData

Unnamed: 0,x1,x2,x3,x4
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


K-Means is implemented below:

In [None]:
class Cluster:
  """One k-means cluster: the ids of its member nodes and its centroid."""

  def __init__(self, nodeIds, centroidNode):
    # Centroid value for this cluster (stored as given).
    self.centroidNode = centroidNode
    # Ids of the nodes currently assigned to this cluster (stored as given, not copied).
    self.nodeIds = nodeIds

In [None]:
class KMeans(object):
  """Batch k-means clustering over the rows of a pandas DataFrame.

  Each pass assigns every row to its nearest centroid and then replaces each
  centroid with the mean of the rows assigned to it.

  Attributes:
    k: number of clusters.
    dataSet: the DataFrame being clustered (one row per node).
    clusters: list of cluster objects exposing ``nodeIds`` (row positions)
      and ``centroidNode`` (centroid coordinates).
    meanSqDevs: value of ``AvgSqDev()`` recorded after every pass of ``fit``.
  """

  def __init__(self,dataSet,k,random_state=None):
    """Seed the clusters with ``k`` rows sampled from ``dataSet``.

    Args:
      dataSet: DataFrame whose rows are the nodes to cluster.
      k: number of clusters to create.
      random_state: forwarded to ``DataFrame.sample``; pass a fixed value
        for reproducible initial centroids. Defaults to None.
    """
    self.k=k
    self.dataSet=dataSet
    seeds=self.dataSet.sample(n=self.k,random_state=random_state).to_numpy()
    self.clusters=[Cluster([],node) for node in seeds]
    self.meanSqDevs=[]

  def Clustering(self):
    """Run one pass: reassign every row, then recompute the centroids."""
    nodes=self.dataSet.to_numpy()
    for cluster in self.clusters:
      cluster.nodeIds=[]
    # Assignment step: centroids stay fixed while rows are being assigned.
    for i,node in enumerate(nodes):
      minIndx,_=self.MinDistanceCluster(node)
      self.clusters[minIndx].nodeIds.append(i)
    # Update step: each centroid becomes the mean of its members. A cluster
    # that received no rows keeps its previous centroid.
    for cluster in self.clusters:
      if cluster.nodeIds:
        cluster.centroidNode=nodes[cluster.nodeIds].mean(axis=0)

  def fit(self,iterations=20):
    """Run ``iterations`` clustering passes, logging ``AvgSqDev()`` after each.

    Args:
      iterations: number of passes to run. Defaults to 20.
    """
    for _ in range(iterations):
      self.Clustering()
      self.meanSqDevs.append(self.AvgSqDev())

  def AvgSqDev(self):
    """Return the mean over clusters of each cluster's summed squared deviation.

    For every cluster the squared differences between its member rows and its
    centroid are summed; the result is the average of those per-cluster sums.
    An empty cluster contributes 0.
    """
    nodes=self.dataSet.to_numpy()
    sqDevs=[]
    for cluster in self.clusters:
      members=nodes[cluster.nodeIds]
      sqDevs.append(float(np.sum((members-cluster.centroidNode)**2)))
    return np.mean(sqDevs)

  def UpdateCentroid(self,cluster,node):
    """Return the centroid obtained by folding ``node`` into ``cluster``.

    The current centroid is weighted by ``len(cluster.nodeIds)`` and ``node``
    by 1. ``Clustering`` does not call this; it recomputes centroids from all
    members at once. The method is kept so existing callers keep working.
    """
    nodes=len(cluster.nodeIds)*np.array(cluster.centroidNode)+np.array(node)
    centroid = nodes/(len(cluster.nodeIds)+1)
    return centroid

  def MinDistanceCluster(self,node):
    """Return ``(index, distance)`` of the cluster whose centroid is nearest.

    Ties are resolved in favour of the cluster with the highest index.

    Raises:
      ValueError: if there are no clusters or no distance is comparable
        (for example because of NaN coordinates).
    """
    # Start from infinity so arbitrarily large distances are still selectable.
    minD=math.inf
    minIndx=None
    for i,cluster in enumerate(self.clusters):
      d=math.sqrt(np.sum((cluster.centroidNode-node)**2))
      if d <= minD:
        minD=d
        minIndx=i
    if minIndx is None:
      raise ValueError("no cluster could be selected for the given node")
    return int(minIndx),minD

K-Means is run to create 11 clusters. It gives us the set of individual clusters, each along with its centroid.

In [None]:
# Cluster the four Iris feature columns into 11 groups.
NUM_CLUSTERS=11
km=KMeans(irisData,NUM_CLUSTERS)

km.fit()

Euclidean distance is used to pick the representatives: the distance from the centroid to every node in its cluster is calculated, so that the five nodes closest to the centroid can be selected from each cluster.

In [None]:
def euclideanDist(node1,node2):
  """Return the Euclidean distance between two vectors.

  The operands must support elementwise subtraction and multiplication
  (e.g. 1-D NumPy arrays of equal length).
  """
  delta=node1-node2
  return np.sqrt(sum(delta*delta))

In [None]:
# For every k-means cluster, build one record per member holding its feature
# vector ('node'), its distance to the cluster centroid ('d') and its class label ('y').
clusters=[]

iris=irisData.to_numpy()
labelColumn=irisDataSet['y']

for cluster in km.clusters:
  # nodeIds are row positions (they index `iris` directly), so the label is
  # fetched positionally with .iloc rather than by index label. The loop
  # variable is named nodeId so the builtin `id` is not shadowed.
  members=[
    {'node':iris[nodeId],'d':euclideanDist(iris[nodeId],cluster.centroidNode),'y':labelColumn.iloc[nodeId]}
    for nodeId in cluster.nodeIds
  ]
  clusters.append(np.array(members))
  

Here, five representatives are selected from each cluster.

In [None]:
# Order the records of every cluster by ascending distance to the centroid.
_clusters=[np.array(sorted(members,key=lambda entry: entry['d'])) for members in clusters]

In [None]:
# Take the first five records of each distance-sorted cluster
# (all of them when the cluster has fewer than five members).
rep=[np.array(list(ordered[:5])) for ordered in _clusters]

In [None]:
# Flatten the per-cluster representatives into one training set.
# Labels map to class ids: Iris-setosa -> 0, Iris-virginica -> 2, anything else -> 1.
labelToId={"Iris-setosa":0,"Iris-virginica":2}
chosen=[sample for group in rep for sample in group]
Y=[labelToId.get(sample['y'],1) for sample in chosen]
X=np.array([sample['node'] for sample in chosen])

The MLP is implemented below.

In [None]:
class MLP(object):
  """Multilayer perceptron with one hidden layer and sigmoid units, no bias terms.

  The network is stored in ``NN`` as two layers (hidden, output). Each layer is
  a list of neuron dicts with a ``'weights'`` array; the forward pass adds
  ``'result'`` (activation) and the backward pass adds ``'delta'``.
  Training updates the weights after every individual sample.
  """

  def __init__(self,iterNos,lr,nInputs=4,nOutputs=3,nHidden=None):
    """Create the network with weights drawn from ``np.random.uniform``.

    Args:
      iterNos: number of passes over the training data made by ``fit``.
      lr: learning rate applied in ``updateWeights``.
      nInputs: number of input features.
      nOutputs: number of output units (one per class).
      nHidden: number of hidden units; defaults to ``nInputs + 1``.
    """
    self.nInputs=nInputs
    self.nOutputs=nOutputs
    self.nHidden=self.nInputs+1 if nHidden is None else nHidden

    self.NN=[]
    self.NN.append([{'weights': np.random.uniform(size=self.nInputs)} for _ in range(self.nHidden)])
    self.NN.append([{'weights': np.random.uniform(size=self.nHidden)} for _ in range(self.nOutputs)])

    self.iterNo=iterNos
    self.rate=lr

  def derivSigmoid(self,a):
    """Return the sigmoid derivative given the sigmoid's output ``a``."""
    return a*(1-a)

  def forwardSigmoid(self,a):
    """Return the logistic sigmoid of the scalar ``a``.

    The negative branch is rewritten as exp(a)/(1+exp(a)) so that large
    negative inputs do not make ``math.exp`` overflow.
    """
    if a>=0:
      return 1/(1+math.exp(-a))
    e=math.exp(a)
    return e/(1+e)

  def fit(self,x,y):
    """Train on samples ``x`` with integer class ids ``y``.

    Makes ``iterNo`` passes; for each sample it runs a forward pass, builds a
    one-hot target from the class id, backpropagates and updates the weights.
    """
    for _ in range(self.iterNo):
      for xi,yi in zip(x,y):
        # The forward pass stores each neuron's 'result', which backProp needs.
        self.forwardProp(xi)

        expected=[0.0]*self.nOutputs
        expected[yi]=1
        self.backProp(xi,expected)
        self.updateWeights(xi)

  def forwardProp(self,input):
    """Propagate ``input`` through the network and return the output activations.

    Every neuron's activation is also stored under its ``'result'`` key.
    """
    row=input
    for layer in self.NN:
      activations=[]
      for neuron in layer:
        neuron['result']=self.forwardSigmoid(np.dot(neuron['weights'],row))
        activations.append(neuron['result'])
      row=np.array(activations)
    return row

  def backProp(self,input,expected):
    """Compute and store each neuron's ``'delta'`` from the last forward pass.

    Args:
      input: the sample that was propagated (not used by the computation).
      expected: target output vector.
    """
    lastIdx=len(self.NN)-1
    for i in range(lastIdx,-1,-1):
      layer=self.NN[i]
      if i==lastIdx:
        # Output layer: error is target minus activation.
        errors=np.asarray(expected)-np.array([neuron['result'] for neuron in layer])
      else:
        # Hidden layer: error is the next layer's deltas weighted by the
        # connections leaving each hidden neuron.
        nextlayer=self.NN[i+1]
        errors=[sum(neuron['weights'][j]*neuron['delta'] for neuron in nextlayer)
                for j in range(len(layer))]

      for neuron,error in zip(layer,errors):
        neuron['delta']=error*self.derivSigmoid(neuron['result'])

  def updateWeights(self,input):
    """Apply one update: weights += rate * delta * (inputs to that layer)."""
    for i,layer in enumerate(self.NN):
      # The first layer sees the raw sample; later layers see the previous
      # layer's stored activations.
      if i==0:
        inputs=np.asarray(input)
      else:
        inputs=np.array([neuron['result'] for neuron in self.NN[i-1]])

      for neuron in layer:
        neuron['weights']+=self.rate*neuron['delta']*inputs

  def pred(self,input):
    """Return the index of the output unit with the largest activation."""
    outputs = self.forwardProp(input)
    return np.argmax(outputs)

  def accuracy(self,x,y):
    """Return the percentage of samples in ``x`` whose prediction equals ``y``.

    Raises:
      ZeroDivisionError: if ``y`` is empty.
    """
    c=sum(1 for xi,yi in zip(x,y) if self.pred(xi)==yi)
    return (c/float(len(y)))*100



The MLP is trained on the selected representative data points.

In [None]:
# Train the network on the labelled representatives:
# 400 passes over the data with a learning rate of 0.05.
TRAIN_ITERATIONS=400
LEARNING_RATE=0.05
mlp=MLP(TRAIN_ITERATIONS,LEARNING_RATE)
mlp.fit(X,Y)

In [None]:
# Build the evaluation set from the whole frame: the four feature columns, plus
# labels mapped to class ids (Iris-setosa -> 0, Iris-virginica -> 2, anything else -> 1).
featureColumns=['x1','x2','x3','x4']
classIds={"Iris-setosa":0,"Iris-virginica":2}
x=irisDataSet[featureColumns].to_numpy()
y=[classIds.get(label,1) for label in irisDataSet['y'].to_numpy()]

Now the accuracy is calculated over the whole Iris dataset.

In [106]:
# Percentage of the evaluation samples that the trained network classifies correctly.
accuracyPct=mlp.accuracy(x,y)
print("Accuracy : ",accuracyPct)

Accuracy :  97.33333333333334
