In [1]:
import sklearn
from sklearn import metrics
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

import random 
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift
from sklearn.cluster import Birch
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import MiniBatchKMeans

In [3]:
# Load the Iris dataset from a CSV file in the working directory.
iris_df = pd.read_csv('Iris.csv')
# NOTE(review): this sample is not the cell's last expression, so it is
# evaluated and discarded -- only the shape below appears in the output.
iris_df.sample(10)
# Cell output: (rows, columns) of the raw frame.
iris_df.shape

(150, 6)

In [8]:
# Shuffle every row (frac=1) and renumber the index from 0.
# A fixed seed keeps the row order identical across Restart & Run All.
SHUFFLE_SEED = 42
iris_df = iris_df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
iris_df

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,31,4.8,3.1,1.6,0.2,Iris-setosa
1,117,6.5,3.0,5.5,1.8,Iris-virginica
2,107,4.9,2.5,4.5,1.7,Iris-virginica
3,91,5.5,2.6,4.4,1.2,Iris-versicolor
4,148,6.5,3.0,5.2,2.0,Iris-virginica
...,...,...,...,...,...,...
145,24,5.1,3.3,1.7,0.5,Iris-setosa
146,75,6.4,2.9,4.3,1.3,Iris-versicolor
147,50,5.0,3.3,1.4,0.2,Iris-setosa
148,38,4.9,3.1,1.5,0.1,Iris-setosa


In [11]:
# Drop the "Id" column. errors="ignore" keeps this cell re-runnable:
# without it, a second execution raises KeyError because "Id" is already gone.
iris_df = iris_df.drop(columns="Id", errors="ignore")
iris_df.sample(10)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
56,4.8,3.4,1.6,0.2,Iris-setosa
105,4.6,3.6,1.0,0.2,Iris-setosa
54,5.1,3.4,1.5,0.2,Iris-setosa
63,6.2,2.2,4.5,1.5,Iris-versicolor
17,6.7,3.1,5.6,2.4,Iris-virginica
148,4.9,3.1,1.5,0.1,Iris-setosa
141,6.4,2.8,5.6,2.2,Iris-virginica
92,6.5,3.2,5.1,2.0,Iris-virginica
136,6.8,2.8,4.8,1.4,Iris-versicolor
49,7.7,3.8,6.7,2.2,Iris-virginica


In [13]:
from sklearn.preprocessing import LabelEncoder

# Replace the species names with integer codes; the fitted encoder is kept
# in `lab_enc` so it stays available to later cells.
lab_enc = LabelEncoder()
species_codes = lab_enc.fit_transform(iris_df['Species'].astype(str))
iris_df['Species'] = species_codes
iris_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,4.8,3.1,1.6,0.2,0
1,6.5,3.0,5.5,1.8,2
2,4.9,2.5,4.5,1.7,2
3,5.5,2.6,4.4,1.2,1
4,6.5,3.0,5.2,2.0,2


In [14]:
# Summarise column dtypes and non-null counts after label encoding.
iris_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
SepalLengthCm    150 non-null float64
SepalWidthCm     150 non-null float64
PetalLengthCm    150 non-null float64
PetalWidthCm     150 non-null float64
Species          150 non-null int32
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [16]:
# Feature matrix: every column except the encoded species label.
iris_features = iris_df.drop(columns=['Species'])
iris_features.sample(10)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
137,7.0,3.2,4.7,1.4
8,5.5,4.2,1.4,0.2
97,6.1,2.8,4.7,1.2
130,7.6,3.0,6.6,2.1
80,5.1,3.8,1.6,0.2
102,6.2,2.8,4.8,1.8
95,5.4,3.4,1.7,0.2
43,7.2,3.2,6.0,1.8
109,5.2,2.7,3.9,1.4
37,5.4,3.7,1.5,0.2


In [38]:
# Keep the encoded Species column as the reference labels that the
# clustering results are scored against.
iris_labels = iris_df['Species']
iris_labels.shape

(150,)

In [45]:
def build_model(clustering_model, data, labels, **model_kwargs):
    """Fit a clustering model and print how well it agrees with known labels.

    Parameters
    ----------
    clustering_model : callable
        Invoked as ``clustering_model(data, **model_kwargs)``. It must return
        a fitted object exposing a ``labels_`` attribute.
    data : array-like
        Feature matrix handed to ``clustering_model`` and to the silhouette
        score.
    labels : array-like
        Reference labels the cluster assignments are compared against.
    **model_kwargs
        Optional keyword arguments forwarded unchanged to
        ``clustering_model`` (for example ``n_cluster=4`` for ``k_means``).
        With none given, the call is identical to ``clustering_model(data)``.

    Returns
    -------
    None
        The scores are printed as one tab-separated row: homogeneity,
        completeness, V-measure, adjusted Rand index, adjusted mutual
        information, and silhouette.
    """
    model = clustering_model(data, **model_kwargs)
    # Read the assignments once and reuse them for every metric.
    predicted = model.labels_

    print('Homo\tComp1\tV-Measure\tARI\tAMI\tsilhouette')
    print(60 * '-')
    print('%.3f\t%.3f\t%.3f\t%.3f\t%.3f\t%.3f'
          % (metrics.homogeneity_score(labels, predicted),
             metrics.completeness_score(labels, predicted),
             metrics.v_measure_score(labels, predicted),
             metrics.adjusted_rand_score(labels, predicted),
             metrics.adjusted_mutual_info_score(labels, predicted),
             # Silhouette is the only score computed from the features
             # rather than from the reference labels.
             metrics.silhouette_score(data, predicted)))

In [46]:
def k_means(data, n_cluster=3, max_iters=1000, random_state=None):
    """Fit a KMeans model on ``data`` and return the fitted estimator.

    Parameters
    ----------
    data : array-like
        Feature matrix to cluster.
    n_cluster : int, default 3
        Number of clusters, passed to ``KMeans(n_clusters=...)``.
    max_iters : int, default 1000
        Iteration cap, passed to ``KMeans(max_iter=...)``.
    random_state : int or None, default None
        Seed forwarded to ``KMeans``. The default ``None`` leaves the
        initialisation unseeded, exactly as before; pass an integer to make
        the clustering (and the metrics derived from it) reproducible.

    Returns
    -------
    KMeans
        The estimator after ``fit(data)``.
    """
    model = KMeans(
        n_clusters=n_cluster,
        max_iter=max_iters,
        random_state=random_state,
    )
    return model.fit(data)

In [47]:
# Cluster the feature columns with k_means (its default arguments) and
# print the agreement scores against the encoded species labels.
build_model(k_means, iris_features, iris_labels)

Homo	Comp1	V-Measure	ARI	AMI	silhouette
------------------------------------------------------------
0.751	0.765	0.758	0.730	0.748	0.553


Choosing Clustering Algorithm