In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans

from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram,linkage

# K-Means Clustering
K-means clustering is one of the simplest and most popular unsupervised machine learning algorithms. Typically, unsupervised algorithms make inferences from datasets using only input vectors without referring to known, or labelled, outcomes. A cluster refers to a collection of data points aggregated together because of certain similarities. You’ll define a target number k, which refers to the number of centroids you need in the dataset. A centroid is the imaginary or real location representing the center of the cluster. In other words, the K-means algorithm identifies k centroids and then allocates every data point to the nearest centroid, while keeping the clusters as compact as possible (i.e. minimising the sum of squared distances between each point and its centroid). The ‘means’ in K-means refers to averaging of the data; that is, finding the centroid.

To start off with we will generate 150 random data coordinates and scatter them along the X and Y axis and use the K-means algorithm to find accurate cluster points among the data.

In [None]:
# Random data coordinates: 150 points drawn uniformly from [0, 1) x [0, 1).
# A seeded generator is used so that "Restart & Run All" reproduces exactly the
# same points (and therefore the same clusters and plots) on every run.
X = np.random.default_rng(42).random((150, 2))

The K-means algorithm starts with a first group of randomly selected centroids, which are used as the beginning points for every cluster, and then performs iterative  calculations to optimize the positions of the centroids

**This iteration stops when:**
1. The centroids have stabilized — there is no change in their values because the clustering has been successful.
2. The defined number of iterations has been achieved.

**Create K-means model**

***cluster_centers_***: Is used for finding the center of the clusters and,

***labels_***: Is used for getting the labels property of the K-means clustering example dataset; that is, how the data points are categorized into the clusters (three in this example).

In [None]:
# K-means model.
# n_init and random_state are pinned so that the centroids and labels are the
# same on every run: with random_state left unset the centroid initialisation
# is random, and the default n_init differs between scikit-learn versions.
KMM = KMeans(n_clusters=3, n_init=10, random_state=0)  # Using 3 clusters
KMM.fit(X)

centroids = KMM.cluster_centers_  # Coordinates of the cluster centers
labels = KMM.labels_  # Cluster index assigned to each data coordinate

print("The 3 Centroid co-ordinates are: \n{}\n".format(centroids))
print("The Labels are: \n{}".format(labels))

**We will now visualize the data along with the cluster diagram and use different colours for different labels**

We are using 3 different colors for 3 different labels, and a green colour to show the clusters.

In [None]:
# One matplotlib format string (colour + point marker) per cluster label.
colors = ['b.','r.','y.','g.']

plt.figure(figsize=(12,6))
# Draw each cluster with a single plot call instead of one call per data point.
for cluster_id in np.unique(labels):
    members = X[labels == cluster_id]
    plt.plot(members[:, 0], members[:, 1], colors[cluster_id], markersize=10)  # Scatter the data points

# Large translucent discs centred on each centroid; the size is a fixed
# constant, so they are a visual highlight rather than a measured cluster extent.
plt.scatter(centroids[:, 0], centroids[:, 1], marker="o", s=80000, c="green", alpha=0.7)
plt.scatter(centroids[:, 0], centroids[:, 1], marker="X", s=50, c="black")  # Cluster centers
plt.title("K-Means clusters of the random data")
plt.xlabel("x")
plt.ylabel("y")
plt.grid()
plt.show()

**Elbow Method**

Now you must be wondering how I arrived at the conclusion that I need 3 clusters for this data. This is decided using the "Elbow method", which plots the error (the within-cluster sum of squares, exposed by scikit-learn as `inertia_`) against the number of clusters. The error always decreases as clusters are added, so we look for the point after which the decrease flattens out. The optimum number of clusters is where this elbow occurs. (E.g. for the Iris dataset, the elbow is created at cluster number 3, hence we choose 3 clusters.)

In [None]:
# Elbow method: record the K-means error (inertia_) for 1 to 10 clusters
# and plot it against the number of clusters.
Error = []
cluster_counts = range(1, 11)

for k in cluster_counts:
    # Fit each model exactly once (the second fit in the original only repeated
    # the work) and pin n_init/random_state so the curve is the same on every run.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=0).fit(X)
    Error.append(kmeans.inertia_)

plt.plot(cluster_counts, Error)
plt.title("Elbow method")
plt.xlabel("Number of clusters")
plt.ylabel("Error")
plt.grid()
plt.show()

# Case Study
To understand this algorithm better, we will use the popular **Iris Flower Dataset**

In [None]:
# Read the Iris CSV from the Kaggle input directory and preview the first rows.
iris_csv_path = "/kaggle/input/iris-flower-dataset/IRIS.csv"
data = pd.read_csv(iris_csv_path)
data.head()

In [None]:
# Keep only the first four columns of the frame, as a plain NumPy array.
feature_positions = [0, 1, 2, 3]
X = data.iloc[:, feature_positions].to_numpy()

**K-Means Model**

In [None]:
# Three clusters, as suggested by the elbow method.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    max_iter=300,
    n_init=10,
    random_state=0,
)
# Fit the model and get the cluster index assigned to every row of X.
y_kmeans = kmeans.fit_predict(X)

In [None]:
#Visualising the clusters
plt.figure(figsize=(12,6))
plt.scatter(X[y_kmeans == 0, 0], X[y_kmeans == 0, 1], s = 30, c = 'red', label = 'Iris-setosa')
plt.scatter(X[y_kmeans == 1, 0], X[y_kmeans == 1, 1], s = 30, c = 'blue', label = 'Iris-versicolour')
plt.scatter(X[y_kmeans == 2, 0], X[y_kmeans == 2, 1], s = 30, c = 'green', label = 'Iris-virginica')

#Plotting the centroids of the clusters
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:,1], s = 50, c = 'black', label = 'Centroids',marker="X")
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:,1], s = 40000, c = 'purple', alpha=0.5, marker="o")
plt.title("Cluster Diagram for Iris Dataset")
plt.grid()
plt.legend()
plt.show()

**Using Seaborn for the scatter plot**

In [None]:
# Joint scatter plot of sepal length against sepal width, coloured by the
# "species" column of the data frame.
sns.jointplot(
    data=data,
    x="sepal_length",
    y="sepal_width",
    hue="species",
    kind="scatter",
)
plt.grid()
plt.show()

# Hierarchical clustering

Hierarchical clustering is another unsupervised machine learning algorithm, which is used to group the unlabeled datasets into a cluster. In this algorithm, we develop the hierarchy of clusters in the form of a tree, and this tree-shaped structure is known as the dendrogram. Sometimes the results of K-means clustering and hierarchical clustering may look similar, but the two algorithms work differently; in particular, hierarchical clustering does not require the number of clusters to be chosen in advance, as we did in the K-Means algorithm.

So, as we have seen in the K-means clustering that there are some challenges with this algorithm, which are a predetermined number of clusters, and it always tries to create the clusters of the same size. To solve these two challenges, we can opt for the hierarchical clustering algorithm because, in this algorithm, we don't need to have knowledge about the predefined number of clusters.

We will be using the **Agglomerative Hierarchical clustering** technique:

It is a popular example of hierarchical cluster analysis (HCA). To group the data points into clusters, it follows the bottom-up approach: the algorithm considers each data point as a single cluster at the beginning, and then starts combining the closest pair of clusters together. It does this until all the clusters are merged into a single cluster that contains all the data points.

In [None]:
# Two small sets of six 2-D points each, used for the hierarchical clustering demo.
X = np.array([
    [0.4, 0.53],
    [0.22, 0.38],
    [0.35, 0.32],
    [0.26, 0.19],
    [0.08, 0.41],
    [0.45, 0.3],
])
Y = np.array([
    [4, 0.5],
    [0.2, 8],
    [0.35, 0.2],
    [0.26, 0.17],
    [0.8, 0.16],
    [0.43, 0.78],
])

# Show both arrays, with a blank line between them.
print("X:{}\n".format(X))
print("Y:{}".format(Y))

In [None]:
# We use single linkage in our model; the distance between data points is
# calculated using the euclidean distance method.
# Euclidean is the estimator's default distance, so it is not passed
# explicitly: the `affinity` keyword was deprecated in scikit-learn 1.2 and
# removed in 1.4 (replaced by `metric`), and relying on the default keeps this
# cell working on both older and newer versions.
HCA = AgglomerativeClustering(n_clusters=2, linkage="single")
HCA.fit_predict(X)

In [None]:
# Build the single-linkage matrix for each of the two point sets.
linked1 = linkage(X, method='single')
linked2 = linkage(Y, method='single')

# Print both linkage matrices, separated by blank lines.
print("Link 1: \n{}".format(linked1))
print("\n")
print("Link 2: \n{}".format(linked2))

In [None]:
# Dendrograms for link 1 and link 2, drawn one after the other as separate
# figures, each titled with the name of the point set it was built from.
for linkage_matrix, figure_title in ((linked1, "X"), (linked2, "Y")):
    dendrogram(linkage_matrix)
    plt.title(figure_title)
    plt.grid()
    plt.show()

# Wrapping Up
**We implemented two very popular types of clustering algorithms, namely K-Means clustering and Hierarchical Clustering. They both fall under the unsupervised machine learning category as there is no labelling of data initially.**

**There were some drawbacks of K-Means clustering like knowing the number of clusters and every cluster having the same size, but these drawbacks are overcome by the Hierarchical clustering and hence it is a widely used algorithm for all kinds of clustering data.**

# ----------------------------------------------------------------------------------------------------

**I hope this notebook was easy to understand and useful for y'all.**

**If you found it helpful please do upvote!**

**If you have any queries or suggestions for me, feel free to comment down below, I would love to improve!**

# Thank You!!