**Importing Required Libraries**

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import linkage, dendrogram, cut_tree
from sklearn.preprocessing import StandardScaler
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the Iris CSV from the input directory into a DataFrame
iris_csv_path = "../input/iris-flower-dataset/IRIS.csv"
iris = pd.read_csv(iris_csv_path)

In [None]:
# Preview the first rows to confirm the file loaded as expected
iris.head()

In [None]:
# (rows, columns) of the raw dataset
iris.shape

In [None]:
# Column names, dtypes and non-null counts
iris.info()

In [None]:
# Summary statistics of the dataset
iris.describe()

In [None]:
# Distinct values found in the 'species' column
iris.species.unique()

In [None]:
# Report the number of columns and the distinct labels of the target column.
# (Fixes the "coulumns" typo and the stray leading/double spaces in the printed text.)
print(f"The number of columns in the given dataset is: {iris.shape[1]}")
print("The target variable is Species:", iris.species.unique())

In [None]:
# Number of missing values in each column
iris.isnull().sum()

In [None]:
# Keep only the four measurement columns as the working feature frame
feature_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
df = iris[feature_cols]
df

In [None]:
# Draw one boxplot per feature column to look for outliers
cols = df.columns
for feature in cols:
    feature_values = df[feature]
    sns.boxplot(y=feature_values)
    plt.show()


**From the above boxplots we can see that there are outliers in the 'sepal_width' column.**

In [None]:
# Keep only rows whose 'sepal_width' lies within [q1 - 1.5*IQR, q3 + 1.5*IQR].
# NOTE: `df` is overwritten, so re-running this cell filters the already-filtered frame.
q1, q3 = df['sepal_width'].quantile([0.25, 0.75])
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
df = df[df['sepal_width'].between(lower_bound, upper_bound)]
df.shape

**After outlier treatment, the number of rows is reduced from 150 to 146.**

In [None]:
# Re-draw the 'sepal_width' boxplot on the filtered frame
sepal_width = df['sepal_width']
sns.boxplot(y=sepal_width)
plt.show()

In [None]:
# Standardise the features: learn the scaling parameters from df, then apply them to df
std_scaler = StandardScaler()
std_scaler.fit(df)
df_norm = std_scaler.transform(df)


In [None]:
# Elbow method: fit k-means on the standardised features for k = 1..19 and
# record the inertia (within-cluster sum of squares) of each fit.
# random_state pins the centroid initialisation so the curve is reproducible
# across kernel restarts; the unused per-iteration labels/centroids are dropped.
cluster_rng = range(1, 20)
cluster_errors = [
    KMeans(n_clusters=k, n_init=10, random_state=42).fit(df_norm).inertia_
    for k in cluster_rng
]

c_df = pd.DataFrame({'no_of_cluster': cluster_rng, 'cluster_errors': cluster_errors})
c_df

In [None]:
# Plot the elbow curve: clustering error against the number of clusters
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(c_df['no_of_cluster'], c_df['cluster_errors'], marker='o')
ax.set_xlabel('K')
ax.set_ylabel('Error')
ax.set_title('Elbow Plot')
plt.show()

**From the above elbow curve we can say that 3 is the optimal number of clusters.**

In [None]:
# Build the final 3-cluster k-means model and fit it.
# random_state makes the cluster IDs (and therefore the plot colours and the
# per-cluster summaries) reproducible between runs; n_init=10 matches the
# elbow-curve cell instead of relying on the library's version-dependent default.
# NOTE(review): the model is fitted on the unscaled `df`, whereas the elbow curve
# was computed on the standardised `df_norm`. Left as-is because the centroid plot
# expects centroids in original units - confirm which feature space is intended.
model = KMeans(n_clusters=3, max_iter=50, n_init=10, random_state=42)
model.fit(df)

In [None]:
# Attach each row's k-means label to the features, then compute per-cluster feature means.
# The index is reset to 0..n-1 first so the label Series aligns row-for-row with df.
df.index = pd.RangeIndex(len(df))
cluster_labels = pd.Series(model.labels_, name='ClusterID')
df_kmeans = pd.concat([df, cluster_labels], axis=1)
df_kmeans.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'ClusterID']

# One single-column frame of means per feature, indexed by ClusterID
clusters_grouped = df_kmeans.groupby(['ClusterID'])
kmeans_clusters_Sepal_length = clusters_grouped[['sepal_length']].mean()
kmeans_clusters_Sepal_width = clusters_grouped[['sepal_width']].mean()
kmeans_clusters_Petal_length = clusters_grouped[['petal_length']].mean()
kmeans_clusters_Petal_width = clusters_grouped[['petal_width']].mean()

In [None]:
# Combine the per-cluster feature means into a single summary table.
# The ClusterID column is taken from the group index rather than a hard-coded
# [0, 1, 2], so the table stays correct if the number of clusters changes.
df2 = pd.concat(
    [
        kmeans_clusters_Sepal_length,
        kmeans_clusters_Sepal_width,
        kmeans_clusters_Petal_length,
        kmeans_clusters_Petal_width,
    ],
    axis=1,
)
df2.columns = ['sepal_length_mean', 'sepal_width_mean', 'petal_length_mean',
               'petal_width_mean']
df2 = df2.rename_axis('ClusterID').reset_index()
df2.head()

In [None]:
# Bar chart of how many rows were assigned to each cluster
ax = sns.countplot(x=df_kmeans['ClusterID'])
ax.set_title('Count plot')
plt.show()

In [None]:
# Hierarchical clustering (Ward linkage, Euclidean distance) on the four features.
# The k-means 'ClusterID' column is dropped so that the k-means labels are not
# fed into the hierarchical clustering as if they were a measurement.
# (No figure is created here: this cell draws nothing, and a figure opened in
# this cell would only be rendered empty.)
mergings = linkage(df_kmeans.drop(columns='ClusterID'), method='ward', metric='euclidean')

In [None]:
# Dendrogram of the hierarchical clustering with a horizontal cut-off line.
# The figure is created in this cell so that the size actually applies to the plot.
plt.figure(figsize=(15, 7))

# height at which the tree is cut
max_d = 7.08                # max_d as in max_distance
dendrogram(mergings,
           truncate_mode='lastp',  # show only the last p merged clusters
           p=150,                  # Try changing values of p
           leaf_rotation=90.,      # rotates the x axis labels
           leaf_font_size=8.,      # font size for the x axis labels
          )

plt.axhline(y=max_d, c='k')
plt.show()

In [None]:
# Hierarchical clustering dendrogram truncated to the last 50 merged clusters
plt.figure(figsize=(15, 7))
# Cluster on the four features only; the k-means 'ClusterID' column is dropped
# so the k-means labels do not influence the hierarchical distances.
mergings = linkage(df_kmeans.drop(columns='ClusterID'), method='ward', metric='euclidean')

# height at which the tree is cut
max_d = 7.08                # max_d as in max_distance
dendrogram(mergings,
           truncate_mode='lastp',  # show only the last p merged clusters
           p=50,                   # Try changing values of p
           leaf_rotation=90.,      # rotates the x axis labels
           leaf_font_size=8.,      # font size for the x axis labels
          )

plt.axhline(y=max_d, c='k')
plt.show()

In [None]:
# Scatter plot of the k-means clusters in the sepal_length / sepal_width plane
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(x='sepal_length', y='sepal_width', data=df_kmeans, hue='ClusterID',
                palette=['green', 'blue', 'red'], ax=ax)

# Overlay the cluster centroids; columns 0 and 1 of cluster_centers_ are the
# first two features the model was fitted on (sepal_length, sepal_width).
ax.scatter(model.cluster_centers_[:, 0], model.cluster_centers_[:, 1],
           s=100, c='black', marker="*", label='Centroids')

# seaborn builds its legend before the centroids are drawn, so the legend is
# rebuilt here; otherwise the 'Centroids' label would never be shown.
ax.legend(title='ClusterID')
plt.show()