<b> Note: This notebook has been run using python 3.9.5 </b>

In [None]:
# Print the Python version of the running kernel so it can be compared
# with the version this notebook was written against (see the note above).

from platform import python_version

kernel_python_version = python_version()
print(kernel_python_version)

In [None]:
#Import required packages
import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets, mixture
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

import warnings
from itertools import cycle, islice


# Clustering
Clustering is an unsupervised learning technique that aims to make sense of the underlying structure and pattern of unlabelled data. The task is to group data that are most similar together, and data from different groups should be highly dissimilar in traits.

In this notebook, we'll explore how the four clustering algorithms learnt in the pack (Agglomerative, K-Means, DBSCAN, and EM for GMM) cluster different types of datasets. Below are the datasets that we will cluster.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles, make_moons, make_blobs, make_classification
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator for reproducibility. The circles and
# moons generators below are called without a random_state, so this seed is
# what keeps them repeatable.
np.random.seed(0)

# Generate the four toy datasets. The generation order is kept fixed so the
# seeded generator is consumed in the same sequence on every run.
circles_data, circles_labels = make_circles(n_samples=1500, factor=0.5, noise=0.05)
moons_data, moons_labels = make_moons(n_samples=1500, noise=0.1)
blobs_data, blobs_labels = make_blobs(n_samples=1500, centers=2, cluster_std=1.0, random_state=42)
aniso_data, aniso_labels = make_classification(
    n_samples=1500,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    flip_y=0.1,
    random_state=42,
)

# One (title, points, true labels) entry per panel, left to right
panels = [
    ("Noisy Circles", circles_data, circles_labels),
    ("Noisy Moons", moons_data, moons_labels),
    ("Noisy Blobs", blobs_data, blobs_labels),
    ("Noisy Aniso", aniso_data, aniso_labels),
]

plt.figure(figsize=(12, 4))
for position, (panel_title, points, labels) in enumerate(panels, start=1):
    plt.subplot(1, len(panels), position)  # (nrows, ncols, index)
    plt.scatter(
        points[:, 0],    # all the rows, first column (x)
        points[:, 1],    # all the rows, second column (y)
        c=labels,        # one number per point, mapped to a colour through cmap (linear scaling by default)
        cmap='viridis',  # name of the colormap used to turn those numbers into colours
        s=5,             # marker size
    )
    plt.title(panel_title)
    plt.axis('equal')

plt.tight_layout()
plt.show()


## 1. Agglomerative Clustering
Falling under Hierarchical Clustering, this uses a bottom-up approach to group data points based on their distance from each other – a smaller distance is considered more similar. <p>
The `sklearn.cluster.AgglomerativeClustering` class supports the single, complete, average and ward linkage methods. In this example, we will use Euclidean distance as the distance calculation.

- Apply Agglomerative clustering on all 4 datasets and plot results.
- Try using different linkage methods such as single, complete, average, ward

Hint: one example is done for you

In [None]:
# Example for one dataset
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.cluster import AgglomerativeClustering

def apply_and_plot_agglomerative(linkage, subplot):
    """Cluster the circles dataset with one linkage method and draw the result.

    Parameters
    ----------
    linkage : str
        Linkage method passed to ``AgglomerativeClustering``.
    subplot : int
        1-based position of the panel within the 1 x 4 grid of the current figure.

    Reads the notebook-level ``circles_data`` array and draws on the current
    matplotlib figure; nothing is returned.
    """
    clustering = AgglomerativeClustering(n_clusters=2, linkage=linkage)
    agglomerative_clusters = clustering.fit_predict(circles_data)

    plt.subplot(1, 4, subplot)
    plt.scatter(circles_data[:, 0], circles_data[:, 1], c=agglomerative_clusters, cmap='viridis', s=5)
    plt.title(f'Agglomerative ({linkage})')
    plt.axis('equal')


# Apply and plot Agglomerative Clustering with different linkage methods
linkage_methods = ['single', 'complete', 'average', 'ward']

plt.figure(figsize=(30, 5))
for i, linkage in enumerate(linkage_methods, 1):
    apply_and_plot_agglomerative(linkage, i)

# tight_layout only adjusts axes that already exist, so it has to run after
# the subplots are drawn (it was previously called on the still-empty figure).
plt.tight_layout(w_pad=3.0)
plt.show()


In [None]:
# TO TRY YOURSELF: Fill in the TO ADD sections
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles
from sklearn.cluster import AgglomerativeClustering

plt.figure(figsize=(30, 15))
# NOTE(review): tight_layout is called here before any subplot has been created,
# so it likely has nothing to adjust - consider moving it to just before plt.show().
plt.tight_layout(w_pad=3.0)

# Define a function to apply Agglomerative Clustering with the specified linkage method on given datasets
def apply_and_plot_agglomerative(linkage_methods, dataset):
    """Draw a grid of agglomerative-clustering results on the current figure.

    linkage_methods: sequence of linkage names, one grid column per entry.
    dataset: sequence of 2-column point arrays, one grid row per entry.
    """

    for subplot, linkage in enumerate(linkage_methods, 1):
        for i, data in enumerate(dataset, 1):
            # TO ADD:  function to apply Agglomerative Clustering with the specified linkage method & predict specified dataset
            # (store the predicted labels in `agglomerative_clusters`; the scatter call below uses that name)

            
            # The index (i-1)*4+subplot puts dataset i on row i and the linkage on column `subplot`;
            # the hard-coded 4 assumes there are exactly 4 linkage methods.
            plt.subplot(, , (i-1)*4+subplot) #TO ADD: nrows = no. of datasets, ncols = no. of linkage methods. HINT: use len()
            # Only the first row gets a title, so each column is labelled with its linkage method
            if i == 1:
                plt.title(f'Agglomerative ({linkage})')
            plt.scatter(data[:, 0], data[:, 1], c=agglomerative_clusters, cmap='viridis', s=5)
            plt.axis('equal')

# Apply and plot Agglomerative Clustering with different linkage methods on different datasets
linkage_methods = ['single', 'complete', 'average', 'ward']
dataset = [] #TO ADD: the variables for the 4 different datasets we've created earlier

apply_and_plot_agglomerative(linkage_methods, dataset)


plt.show()

## 2. K-Means Clustering
This is a popular clustering technique that falls under Centroid-based clustering. This iterative method aims to partition data points into k-number of clusters, grouping similar data points together to minimise variance within clusters. For K-Means clustering, each data point only belongs to one cluster.

- Apply Kmeans clustering on all 4 datasets and plot results.
- Try using different parameters such as "init", "n_init", "max_iter", "random_state". Search google on what each parameter does.

Hint: one example is done for you, and the second is started.

In [None]:
# simple example for one dataset
from sklearn.cluster import KMeans

# Number of clusters K-Means should look for (2 for the circles dataset)
n_clusters = 2

# Fit K-Means on the circles dataset and keep the cluster index assigned to each point
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans_clusters = kmeans.fit_predict(circles_data)

# Draw every point, coloured by the cluster it was assigned to
fig, ax = plt.subplots(figsize=(4, 3))
ax.scatter(
    circles_data[:, 0],
    circles_data[:, 1],
    c=kmeans_clusters,
    cmap='viridis',
    s=5,
)
ax.set_title(f'K-Means Clustering (K={n_clusters})')
ax.axis('equal')
plt.show()


In [None]:
# TO TRY YOURSELF: Fill in the TO ADD sections
# moons_data with some parameters

from sklearn.cluster import KMeans

# Define the number of clusters (in this case, 2 for given dataset)
n_clusters = 2

# Define K-Means clustering parameters 
# TO ADD: look up what these parameters mean and play around with changing the parameters and look at the result
# Each value below is passed straight to the matching KMeans keyword argument:
#   init_method  -> init: centroid initialisation strategy, e.g. 'k-means++' or 'random'
#   n_init       -> n_init: how many times the algorithm is run with different centroid seeds
#   max_iter     -> max_iter: maximum number of iterations for a single run
#   random_state -> random_state: integer seed that makes the initialisation repeatable
init_method = 
n_init = 
max_iter = 
random_state = 

# Apply K-Means clustering to the moons_data dataset
kmeans = KMeans(n_clusters=n_clusters, init=init_method, n_init=n_init, max_iter=max_iter, random_state=random_state)
kmeans_clusters = kmeans.fit_predict(moons_data)

# Plot the clustered data
plt.figure(figsize=(4, 3))
plt.scatter(moons_data[:, 0], moons_data[:, 1], c=kmeans_clusters, cmap='viridis', s=5)
plt.title(f'K-Means Clustering (K={n_clusters})')
plt.axis('equal')
plt.show()

In [None]:
# TO ADD: for blobs_data

In [None]:
# TO ADD: for aniso_data

## 3. DBSCAN
Density-Based Spatial Clustering of Applications with Noise (DBSCAN) is a density-based clustering method that requires two parameters:<p>
- Epsilon: radius to create a circle from a datapoint, forming a perimeter around that datapoint
- minPoints: minimum number of data points inside a circle (including the data point that the circle is created around) that must exist for a cluster to be formed. 


- Apply DBSCAN clustering on all 4 datasets and plot results.
- Try changing the value of "eps" and "min_samples" and observe output
- Try using different parameters (changing values) such as "metric", "algorithm". Search google on what each parameter does.

Hint: one example is done for you, and the second is started.

In [None]:
# simple example with one dataset
from sklearn.cluster import DBSCAN
# Run DBSCAN on the circles dataset and keep the label assigned to each point
dbscan = DBSCAN(eps=0.2, min_samples=20)
dbscan_clusters = dbscan.fit_predict(circles_data)

# Draw every point, coloured by the label DBSCAN gave it
fig, ax = plt.subplots(figsize=(4, 3))
ax.scatter(
    circles_data[:, 0],
    circles_data[:, 1],
    c=dbscan_clusters,
    cmap='viridis',
    s=5,
)
ax.set_title("DBSCAN Clustering")
ax.axis('equal')
plt.show()

In [None]:
# TO TRY YOURSELF: Fill in the TO ADD sections
# moons_data with various parameters

from sklearn.cluster import DBSCAN

# Define DBSCAN clustering parameters
# TO ADD: look up what these parameters mean and play around with changing the parameters and look at the result
# eps_values and min_samples_values are looped over below, so each must be a
# list (or other iterable) of candidate values, e.g. several radii to try.
# metric and algorithm are passed straight to DBSCAN(metric=..., algorithm=...),
# e.g. metric 'euclidean' and algorithm 'auto'.
eps_values =
min_samples_values =
metric = 
algorithm = 

# Iterate through different parameter combinations
# (one separate figure is drawn for every eps / min_samples pair)
for eps in eps_values:
    for min_samples in min_samples_values:
        # Apply DBSCAN clustering to the moons dataset
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric=metric, algorithm=algorithm)
        dbscan_clusters = dbscan.fit_predict(moons_data)

        # Plot the clustered data
        plt.figure(figsize=(4, 3))
        plt.scatter(moons_data[:, 0], moons_data[:, 1], c=dbscan_clusters, cmap='gist_rainbow', s=5)
        plt.title(f"DBSCAN Clustering (eps={eps}, min_samples={min_samples})")
        plt.axis('equal')
        plt.show()


In [None]:
# TO ADD: for blobs_data


In [None]:
# TO ADD: for aniso_data

## 4. Expectation-Maximization (EM) Algorithm for Gaussian Mixture Model (GMM)
The Expectation-Maximization (EM) algorithm for the Gaussian Mixture Model (GMM)
is a distribution-based clustering method that assumes the data points are Gaussian distributed, with each Gaussian distribution assigned to a cluster. It uses the Expectation-Maximization algorithm to find the optimised parameters of the Gaussian distribution for each cluster. 

- Apply EM for GMM clustering on all 4 datasets and plot results.
- Try changing the value for n_components and observe the output
- Try using different parameters such as "covariance_type", "max_iter", "init_params". Search google on what each parameter does.

Hint: one example is done for you, and the second is started.

In [None]:
# simple example for one dataset
from sklearn.mixture import GaussianMixture

# Fit a Gaussian mixture to the circles dataset with EM and label every point
n_components = 2  # Number of Gaussian components
gmm = GaussianMixture(n_components=n_components, random_state=0)
gmm_clusters = gmm.fit_predict(circles_data)

# Draw every point, coloured by the component it was assigned to
fig, ax = plt.subplots(figsize=(4, 3))
ax.scatter(
    circles_data[:, 0],
    circles_data[:, 1],
    c=gmm_clusters,
    cmap='viridis',
    s=5,
)
ax.set_title("Gaussian Mixture Model (GMM) Clustering")
ax.axis('equal')
plt.show()

In [None]:
# TO TRY YOURSELF: Fill in the TO ADD sections
# moons_data

from sklearn.mixture import GaussianMixture

# Define GMM clustering parameters
# TO ADD: look up what these parameters mean and play around with changing the parameters and look at the result
# Each value below is passed straight to the matching GaussianMixture keyword argument:
#   n_components    -> number of Gaussian components (clusters) to fit
#   covariance_type -> shape of each component's covariance, e.g. 'full', 'tied', 'diag' or 'spherical'
#   max_iter        -> maximum number of EM iterations
#   init_params     -> how the initial parameters are chosen, e.g. 'kmeans' or 'random'
n_components =
covariance_type =
max_iter =
init_params = 

# Apply GMM clustering to the moons dataset
gmm = GaussianMixture(n_components=n_components, covariance_type=covariance_type, max_iter=max_iter, init_params=init_params, random_state=0)
gmm_clusters = gmm.fit_predict(moons_data)

# Plot the clustered data
plt.figure(figsize=(4, 3))
plt.scatter(moons_data[:, 0], moons_data[:, 1], c=gmm_clusters, cmap='viridis', s=5)
plt.title("Gaussian Mixture Model (GMM) Clustering")
plt.axis('equal')
plt.show()

In [None]:
# TO ADD: for blobs_data

In [None]:
# TO ADD: for aniso_data

## 5. Comparison
Below shows the comparison of the four algorithms we have used. Alter the parameters for each of the algorithms to see how it affects the clusters formed.

- Compare all 4 algorithms for all 4 datasets and decide which algorithms are best for what kind of data

First one is done for you

In [None]:
# Run all four algorithms on the circles dataset and plot the results side by side.
# n_clusters is the single knob for every algorithm that takes a cluster count.
n_clusters = 2

# Agglomerative Clustering
clustering = AgglomerativeClustering(n_clusters=n_clusters, linkage='single') # TO ALTER: change the linkage to see result for that linkage
agglomerative_clusters = clustering.fit_predict(circles_data)

# K-Means
kmeans = KMeans(n_clusters=n_clusters, random_state=0)
kmeans_clusters = kmeans.fit_predict(circles_data)

# DBSCAN (finds the number of clusters itself from eps / min_samples)
dbscan = DBSCAN(eps=0.2, min_samples=20)
dbscan_clusters = dbscan.fit_predict(circles_data)

# EM for GMM
# Use this cell's n_clusters rather than the `n_components` variable left over
# from an earlier cell: that one may be undefined on a fresh kernel or may have
# been changed while experimenting, which would make the comparison unfair.
gmm = GaussianMixture(n_components=n_clusters, random_state=0)
gmm_clusters = gmm.fit_predict(circles_data)

# One (title, predicted labels) entry per panel of the 2 x 2 comparison grid
comparison_panels = [
    ("Agglomerative Clustering", agglomerative_clusters),
    ("Kmeans Clustering", kmeans_clusters),
    ("DBSCAN Clustering", dbscan_clusters),
    ("GMM Clustering", gmm_clusters),
]

# Create subplots to compare the clustering results
plt.figure(figsize=(16, 8))
for position, (panel_title, cluster_labels) in enumerate(comparison_panels, start=1):
    plt.subplot(2, 2, position)
    plt.scatter(circles_data[:, 0], circles_data[:, 1], c=cluster_labels, cmap='viridis', s=5)
    plt.title(panel_title)
    plt.axis('equal')

plt.tight_layout()
plt.show()

In [None]:
# TO ADD: for moons_data

In [None]:
# TO ADD: for blobs_data

In [None]:
# TO ADD: for aniso_data