In [None]:
import numpy as np
import pandas as pd 
import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the Iris CSV from the Kaggle dataset folder, then preview its first five rows.
data = pd.read_csv(filepath_or_buffer='../input/iris-flower-dataset/IRIS.csv')
data.head(n=5)

In [None]:
# Give the four measurement columns human-readable headers:
# underscores become spaces and each word is capitalised.
data = data.rename(columns={
    snake_name: snake_name.replace('_', ' ').title()
    for snake_name in ('sepal_length', 'sepal_width', 'petal_length', 'petal_width')
})

In [None]:
# Display the DataFrame as the cell output to inspect it after the column rename.
data

In [None]:
# Structure of the data: dimensions of the DataFrame as a (rows, columns) tuple.
data.shape

In [None]:
# Detailed view: descriptive statistics (count, mean, std, min, quartiles, max).
# With default arguments describe() summarises the numeric columns only.
data.describe()

In [None]:
# Count the missing entries in each column and show the counts as a one-column table.
data.isnull().sum().to_frame(name='Null Values')

In [None]:
# List the distinct class labels that occur in the species column.
data['species'].unique().tolist()

In [None]:
# Joint distribution of the two petal measurements, coloured by species.
# x and y are named explicitly so the figure shows a deliberately chosen pair of
# features instead of whatever seaborn infers when no variables are given.
sns.jointplot(data=data, x='Petal Length', y='Petal Width', hue='species',
              s=100, alpha=0.7, height=6)
plt.show()

In [None]:
# Box plots of the DataFrame's columns, drawn on an explicitly created 10x6 inch axes.
box_fig, box_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=data, ax=box_ax)
plt.show()

In [None]:
# Correlation heatmap of the numeric features.
# The string-valued species column is excluded first: recent pandas releases raise
# an error when corr() is asked to process non-numeric columns.
numeric_corr = data.select_dtypes(include='number').corr()

plt.figure(figsize=(5, 5))
sns.heatmap(numeric_corr, cmap='YlGnBu', annot=True,
            cbar=False, annot_kws={'size': 14})
plt.show()

In [None]:
# Grid of pairwise feature plots, with points coloured by species.
sns.pairplot(data, height=3, hue='species')
plt.show()

In [None]:
from sklearn.cluster import KMeans

In [None]:
# Features for clustering: the two petal measurements, selected by column name
# so the choice does not silently depend on the column order of the CSV.
X = data[['Petal Length', 'Petal Width']].values

# Integer-encode the species labels (setosa -> 0, virginica -> 1, versicolor -> 2).
y = data['species'].map({'Iris-setosa': 0, 'Iris-virginica': 1, 'Iris-versicolor': 2}).values

In [None]:
# Report the dimensions of the feature matrix and of the label vector.
for array_name, array in (('X', X), ('y', y)):
    print(f'Shape of {array_name}: {array.shape}')

In [None]:
# Finding the number of clusters (elbow method):
# fit K-Means for k = 1..10 and record each model's within-cluster sum of squares.
kmeans_options = dict(init='k-means++', max_iter=300, n_init=10, random_state=0)

wcss = []
for i in range(1, 11):
    kmeans_c = KMeans(n_clusters=i, **kmeans_options).fit(X)
    wcss.append(kmeans_c.inertia_)
    print(f'wcss {i}: {wcss[-1]}')

In [None]:
# Plotting Elbow Curve: WCSS against the number of clusters tried (1 to 10).
elbow_fig, elbow_ax = plt.subplots(figsize=(10, 5))
elbow_ax.plot(range(1, 11), wcss, color='blue')
elbow_ax.set_title('The Elbow Curve', fontsize=14)
elbow_ax.set_xlabel('Number of Clusters', fontsize=15)
elbow_ax.set_ylabel('WCSS')
plt.show()

In [None]:
# Final model: three clusters, fitted on X; y_means holds the cluster index
# assigned to each row of X.
kmeans = KMeans(n_clusters=3, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(X)
y_means = kmeans.labels_
y_means

In [None]:
# Cluster center values: coordinates of each fitted centroid, one row per cluster,
# with columns in the same order as the feature columns of X.
kmeans.cluster_centers_

In [None]:
# Name each cluster after the species that is most common among its members.
# K-Means numbers its clusters arbitrarily, so a fixed species order cannot be
# assumed to line up with the cluster indices.
species = [data.loc[y_means == i, 'species'].mode().iat[0]
           for i in range(kmeans.n_clusters)]

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: the raw petal measurements, before any grouping.
# x and y are passed by keyword; recent seaborn releases reject them positionally.
sns.scatterplot(x='Petal Length', y='Petal Width', data=data,
                s=75, color='purple', alpha=0.8, ax=ax[0])
ax[0].set_title('Unclustered data', fontsize=16)

# Right panel: the same points, one colour per K-Means cluster, plus the centroids.
for i in range(kmeans.n_clusters):
    in_cluster = y_means == i
    sns.scatterplot(x=X[in_cluster, 0], y=X[in_cluster, 1], s=75,
                    label=f'Cluster {i} (mostly {species[i]})', ax=ax[1])
sns.scatterplot(x=kmeans.cluster_centers_[:, 0], y=kmeans.cluster_centers_[:, 1],
                s=700, color='black', label='Centroids', marker='.', ax=ax[1])
ax[1].set_title('Clustered data', fontsize=16)

# Axis labels are applied last so they are the ones shown on both panels.
for panel in ax:
    panel.set_xlabel('Petal Length', fontsize=12)
    panel.set_ylabel('Petal Width', fontsize=12)
plt.show()