<h2>Clustering: Food Retailer</h2>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
os.environ['OMP_NUM_THREADS'] = '1'

# Load the dataset from the CSV file into a DataFrame and display it
DATA_PATH = "fr_data_final.csv"
X = pd.read_csv(DATA_PATH)
X

<h3>K-means Clustering (Scikit-Learn)</h3>

<b>Select the number of clusters through SSE (elbow method)</b>

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit K-means for every candidate K and record the SSE (inertia)
range_n_clusters = range(2, 9)
SSE = []
for nclust in range_n_clusters:
    km = KMeans(
        n_clusters=nclust,
        init='random',    # alternative: 'k-means++'
        n_init=10,        # number of runs with different centroid seeds (the best one is kept)
        max_iter=100,
        random_state=0,   # integer seed -> deterministic centroid initialization
    )
    # fit() returns the fitted estimator, so its inertia can be read directly
    SSE.append(km.fit(X).inertia_)

# Plot SSE against K; the "elbow" of the curve suggests a suitable number of clusters
fig, ax = plt.subplots()
ax.plot(range_n_clusters, SSE, marker='o')
ax.set_xlabel('Number of clusters K')
ax.set_ylabel('Sum of Squared Distances (SSE)')
plt.show()

<b>Select the number of clusters through silhouette-based method</b>

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette method: average silhouette coefficient for every candidate K
kmeans_options = dict(init='random',  # alternative: 'k-means++'
                      n_init=10,
                      max_iter=100,
                      random_state=0)
range_n_clusters = range(2, 9)
sil = []
for nclust in range_n_clusters:
    # Cluster the data with the current K and get one label per example
    km = KMeans(n_clusters=nclust, **kmeans_options)
    cluster_labels = km.fit_predict(X)
    # Average silhouette coefficient over all the examples
    silhouette_avg_coeff = silhouette_score(X, cluster_labels)
    sil.append(silhouette_avg_coeff)

# Plot the average silhouette coefficient against K (higher is better)
fig, ax = plt.subplots()
ax.plot(range_n_clusters, sil, marker='o')
ax.set_xlabel('Number of clusters K')
ax.set_ylabel('Average silhouette coefficient')
plt.show()

<b>Display the silhouette plot</b>

In [None]:
from sklearn.metrics import silhouette_samples, silhouette_score

# Silhouette plots: for every candidate K, draw the silhouette coefficient of
# each example, grouped by cluster, together with the average coefficient
range_n_clusters = range(2,9)
for nclust in range_n_clusters:

    # Initialize K-means clustering
    km = KMeans(n_clusters=nclust,
                init='random',
                n_init=10,
                max_iter=100,
                random_state=0)

    # Generate K-means clustering, compute cluster centers and predict the cluster label for each example
    cluster_labels = km.fit_predict(X)
    cluster_labels_set = np.unique(cluster_labels)

    # Compute the silhouette coefficient for each example
    silhouette_example_coeff = silhouette_samples(X, cluster_labels)

    # The average silhouette coefficient is the mean of the per-example values
    # (this is what silhouette_score returns), so the pairwise distances do not
    # have to be computed a second time
    silhouette_avg_coeff = silhouette_example_coeff.mean()
    print("N° of clusters =", nclust, "-> Average silhouette coefficient: ", silhouette_avg_coeff)

    # Always draw on a fresh figure: reusing a numbered figure could stack a new
    # axes on top of a previous plot when the cell is executed again
    fig, ax1 = plt.subplots(figsize=(6, 6))

    y_lower = 0
    for i in cluster_labels_set:
        # Select and sort the silhouette coefficients of the examples belonging to cluster i
        cluster_i_silhouette_coeff = np.sort(silhouette_example_coeff[cluster_labels == i])
        y_upper = y_lower + cluster_i_silhouette_coeff.shape[0]
        ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, cluster_i_silhouette_coeff)
        # Write the cluster label next to the middle of its band
        ax1.text(-0.05, y_lower + 0.5 * cluster_i_silhouette_coeff.shape[0], str(i))
        # Leave a 10-row vertical gap before the band of the next cluster
        y_lower = y_upper + 10

    # Enrich the silhouette plot (the dashed line marks the average coefficient)
    ax1.axvline(x=silhouette_avg_coeff, color="black", linestyle="--")
    ax1.set_title(f"Silhouette plot for K = {nclust}")
    ax1.set_xlabel("Silhouette Coefficients")
    ax1.set_ylabel("Cluster Label")
    # Show the figure now so that it appears right after the printed score of the same K
    plt.show()

<b>Generate the clustering model by K-means</b>

In [None]:
# Generate the final K-means clustering model
# Fit on the feature columns only: this cell stores its result in
# X['Cluster_Label'], so without the drop a second execution of the cell would
# cluster on the labels produced by the previous execution.
X_features = X.drop(columns=['Cluster_Label'], errors='ignore')
km = KMeans(n_clusters=3,
            init='random',
            n_init=10,
            max_iter=100,
            random_state=0)
cluster_labels = km.fit_predict(X_features)
# Attach the labels to the data for inspection
X['Cluster_Label'] = cluster_labels
X

In [None]:
# Scatter plot of the columns at positions 5 and 6, colored by the current cluster labels
# NOTE(review): the axis labels assume those two columns are "Wine" and "Frozen" - confirm against the CSV header
data_matrix = X.values
fig, ax = plt.subplots()
ax.scatter(data_matrix[:, 5], data_matrix[:, 6], c=cluster_labels)
ax.set_xlabel("Wine")
ax.set_ylabel("Frozen")
plt.show()

<b>Agglomerative Hierarchical Clustering (SciPy and Scikit-Learn)</b>

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

# Compute the linkage matrix (Ward linkage on Euclidean distances)
# Use the feature columns only: at this point X also holds the 'Cluster_Label'
# column written by the K-means cell, and those labels must not contribute to
# the distances between examples.
X_features = X.drop(columns=['Cluster_Label'], errors='ignore')
linkage_matrix = linkage(X_features, metric='euclidean', method='ward')

plt.figure(figsize=(10,6))
plt.title('Hierarchical Clustering Dendrogram')
# Show only the last p merged clusters in the dendrogram
dendrogram(linkage_matrix, truncate_mode='lastp', p=12)
plt.show()

In [None]:
# Get the clusters
from scipy.cluster.hierarchy import fcluster
# Cut the hierarchy so that no more than `num_clusters` flat clusters are formed
num_clusters = 3
clusters = fcluster(Z=linkage_matrix, t=num_clusters, criterion='maxclust')
clusters

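<b>What is the linkage matrix?</b> Each row describes one merge step: the two clusters (or single points) that are merged, the distance between them, and the size of the resulting cluster.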

In [None]:
# Keep only the first 5 rows as a small example for inspecting the linkage matrix
X_subsample = X.iloc[:5]
X_subsample

In [None]:
# Linkage matrix of the small subsample (Ward linkage on Euclidean distances)
# Drop 'Cluster_Label' if present: X_subsample is taken from X, which may already
# carry labels from a previous clustering step that must not affect the distances.
# NOTE: this rebinds `linkage_matrix`, replacing the matrix computed on the full data.
linkage_matrix = linkage(X_subsample.drop(columns=['Cluster_Label'], errors='ignore'),
                         metric='euclidean', method='ward')
linkage_matrix

In [None]:
# Show the linkage matrix as a table with one labeled column per field
linkage_columns = ['N° cluster (or point)',
                   'N° cluster (or point)',
                   'Distance',
                   'Cluster Size']
linkage_matrix_df = pd.DataFrame(data=linkage_matrix, columns=linkage_columns)
linkage_matrix_df

In [None]:
from sklearn.cluster import AgglomerativeClustering

# Initialize the hierarchical clustering algorithm
# metric  -> 'euclidean', 'manhattan', 'cosine' or 'precomputed' (a distance matrix is then passed to fit);
#            with linkage='ward' only 'euclidean' is accepted
# linkage -> 'ward' (default), 'complete', 'average', 'single'
hc = AgglomerativeClustering(metric='euclidean', linkage='ward', n_clusters=3)
# Fit on the feature columns only: X may already hold a 'Cluster_Label' column
# written by a previous clustering cell (or by an earlier run of this one), and
# those labels must not be used as an input feature.
X_features = X.drop(columns=['Cluster_Label'], errors='ignore')
# Generate hierarchical clustering and predict the cluster label for each example
cluster_labels = hc.fit_predict(X_features)
X['Cluster_Label'] = cluster_labels
X

<b>Density-Based Clustering -> DBSCAN (Scikit-Learn)</b>

In [None]:
from sklearn.cluster import DBSCAN

# Initialize the density-based clustering algorithm
# Values accepted for `metric`:
#   from scikit-learn: 'cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan'
#   from scipy.spatial.distance: 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice', 'hamming',
#     'jaccard', 'kulsinski', 'mahalanobis', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean',
#     'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule'
#   (the exact list depends on the installed scikit-learn / SciPy versions)
dbs = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
# Fit on the feature columns only: X already holds the 'Cluster_Label' column
# written by the previous clustering cells, and those labels must not be used
# as an input feature when computing neighborhoods.
X_features = X.drop(columns=['Cluster_Label'], errors='ignore')
# Generate density-based clustering and predict the cluster label for each example
# (DBSCAN assigns the label -1 to the examples it considers noise)
cluster_labels = dbs.fit_predict(X_features)
X['Cluster_Label'] = cluster_labels
X

In [None]:
# Display the raw label array produced by the most recent clustering step
cluster_labels