<a href="https://colab.research.google.com/github/roitraining/PythonML/blob/Development/Ch06-ClusterAnalysis/06-01-ClusterAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Let's create a random data set with two features and three clusters

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

# Create a sample dataset: 400 points, 2 features, 3 clusters
# (the comment previously said 4 clusters, which contradicted centers=3).
x, y = make_blobs(n_samples=400, n_features=2, centers=3)
print(x[:5])  # feature coordinates of the first five samples
print(y[:5])  # cluster membership of those same five samples



## Plot the raw data first to see what it looks like

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Use a large default canvas for the figures drawn from here on.
plt.rcParams.update({'figure.figsize': (16, 9)})

# Scatter the first feature against the second, one dot per observation.
plt.plot(x.T[0], x.T[1], 'o')
plt.show()


## Run a cluster model

In [None]:
from sklearn import cluster
# Number of clusters requested from KMeans for the 2-feature data.
CLUSTERS = 3

# fit() returns the estimator itself, so construction and fitting can be chained.
k_means = cluster.KMeans(n_clusters=CLUSTERS).fit(x)
# Echo the fitted estimator as the cell output.
k_means

## Plot using the cluster analysis results

In [None]:
%matplotlib inline
labels = k_means.labels_
centroids = k_means.cluster_centers_

# Iterate over the centroids the fitted model actually has rather than over
# range(CLUSTERS): CLUSTERS is reassigned further down the notebook, so
# re-running this cell afterwards would index past the end of `centroids`.
for i, centroid in enumerate(centroids):
    # Boolean mask selects the observations assigned to cluster i.
    members = x[labels == i]
    # plot the data observations (one colour per cluster)
    plt.plot(members[:, 0], members[:, 1], 'o')
    # plot the centroid as a black cross
    plt.plot(centroid[0], centroid[1], 'kx')
plt.show()



## Use the elbow chart to help figure out how many clusters you should use

In [None]:
def plot_elbow(data, cluster_cnt = 6):
    """Plot the KMeans score for 1..cluster_cnt clusters (an "elbow" chart).

    One KMeans model is fitted per candidate cluster count and its
    ``score(data)`` is plotted against that count.

    Parameters
    ----------
    data : array-like
        Observations passed unchanged to ``KMeans.fit`` and ``KMeans.score``.
    cluster_cnt : int, default 6
        Largest number of clusters to try, inclusive. This matches how
        ``silhouette_plot`` treats its ``count`` argument; previously the
        last value was silently dropped by ``range(1, cluster_cnt)``.

    Returns
    -------
    None
        The chart is shown with ``plt.show()``.
    """
    # Lower-case local name on purpose: the notebook also defines a
    # module-level CLUSTERS constant that the old local variable shadowed.
    cluster_counts = range(1, cluster_cnt + 1)

    # scikit-learn documents KMeans.score as the opposite of the k-means
    # objective, so values are negative and climb towards 0 as k grows.
    scores = [cluster.KMeans(n_clusters=k).fit(data).score(data) for k in cluster_counts]

    plt.plot(cluster_counts, scores)
    plt.xlabel('Number of Clusters')
    plt.ylabel('Score')
    plt.title('Elbow Curve')
    plt.show()

# Elbow chart for the 2-feature blob data `x`, using the default cluster_cnt.
plot_elbow(x)

In [None]:
%matplotlib inline

def silhouette_plot(data, count = 6):
    """Draw silhouette diagnostics for KMeans with 2..count clusters.

    For every candidate cluster count a two-panel figure is created: the left
    panel shows the sorted per-sample silhouette coefficients of each
    cluster, the right panel scatters the first two features of ``data``
    coloured by assigned cluster, with numbered markers on the centroids.
    The average silhouette score of each candidate is also printed.

    Parameters
    ----------
    data : array-like of shape (n_samples, n_features)
        Observations to cluster. At least two feature columns are needed
        because the right panel plots columns 0 and 1 only.
    count : int, default 6
        Largest number of clusters to evaluate, inclusive. No figure is
        created when ``count`` is below 2.

    Returns
    -------
    None
        Figures are shown with ``plt.show()`` after all of them are built.
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score

    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import numpy as np

    # The column slicing below (data[:, 0]) needs an ndarray; this is a no-op
    # for arrays and lets nested lists work too.
    data = np.asarray(data)

    for n_clusters in range(2, count + 1):
        # Create a figure with 1 row and 2 columns
        fig, (ax1, ax2) = plt.subplots(1, 2)
        fig.set_size_inches(18, 7)

        # Fixed random_state so repeated runs on the same data give the
        # same clustering.
        clusterer = KMeans(n_clusters=n_clusters, random_state=10)
        cluster_labels = clusterer.fit_predict(data)

        # The silhouette_score gives the average value for all the samples.
        silhouette_avg = silhouette_score(data, cluster_labels)
        print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)

        # Compute the silhouette scores for each sample
        sample_silhouette_values = silhouette_samples(data, cluster_labels)

        # Silhouette coefficients range over [-1, 1]. Keep the usual -0.1
        # left edge, but widen it when some samples score lower so they are
        # not clipped out of view.
        x_min = min(-0.1, float(sample_silhouette_values.min()))
        ax1.set_xlim([x_min, 1])
        # The (n_clusters+1)*10 is for inserting blank space between the
        # silhouette bands of individual clusters, to demarcate them clearly.
        ax1.set_ylim([0, len(data) + (n_clusters + 1) * 10])

        y_lower = 10
        for i in range(n_clusters):
            # Collect the silhouette scores of the samples in cluster i
            # and sort them so the band has a smooth outline.
            ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
            ith_cluster_silhouette_values.sort()

            size_cluster_i = ith_cluster_silhouette_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_clusters)
            ax1.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_silhouette_values, facecolor=color, edgecolor=color, alpha=0.7)

            # Label the band with its cluster number at mid-height
            ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

            # Leave a gap of 10 rows before the next cluster's band
            y_lower = y_upper + 10

        ax1.set_title("The silhouette plot for the various clusters.")
        ax1.set_xlabel("The silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        # The vertical line marks the average silhouette score
        ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

        ax1.set_yticks([])  # Clear the yaxis labels / ticks
        ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

        # 2nd plot showing the actual clusters formed (first two features)
        colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
        ax2.scatter(data[:, 0], data[:, 1], marker='.', s=30, lw=0, alpha=0.7, c=colors, edgecolor='k')

        # Draw white circles at cluster centers ...
        centers = clusterer.cluster_centers_
        ax2.scatter(centers[:, 0], centers[:, 1], marker='o', c="white", alpha=1, s=200, edgecolor='k')

        # ... and write each cluster's number inside its circle
        for i, c in enumerate(centers):
            ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=50, edgecolor='k')

        ax2.set_title("The visualization of the clustered data.")
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel("Feature space for the 2nd feature")

        # Title this specific figure rather than whatever pyplot considers current.
        fig.suptitle(("Silhouette analysis for KMeans clustering on sample data with n_clusters = %d" % n_clusters), fontsize=14, fontweight='bold')

    plt.show()

# Silhouette figures for the 2-feature data, trying 2 through 6 clusters.
silhouette_plot(x, 6)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

plt.rcParams['figure.figsize'] = (16, 9)

# Create a sample dataset: 400 points, 3 features, 4 clusters
x2, y2 = make_blobs(n_samples=400, n_features=3, centers=4)
# Show only the first five rows, as the 2-feature cell does, instead of
# dumping all 400 samples into the cell output.
print(x2[:5])  # feature coordinates
print(y2[:5])  # cluster membership


In [None]:
%matplotlib inline
fig = plt.figure()
# Axes3D(fig) no longer attaches the axes to the figure in current Matplotlib
# releases, which leaves an empty plot; add_subplot with the 3d projection
# is the supported way to create the axes.
ax = fig.add_subplot(projection='3d')
ax.scatter(x2[:, 0], x2[:, 1], x2[:, 2])
plt.show()


In [None]:
# Elbow chart for the 3-feature blob data `x2`, using the default cluster_cnt.
plot_elbow(x2)

In [None]:
%matplotlib inline
# Number of clusters requested for the 3-feature data.
CLUSTERS = 4
# fit() returns the estimator itself, so no separate re-assignment is needed.
kmeans = cluster.KMeans(n_clusters=CLUSTERS).fit(x2)
labels = kmeans.predict(x2)
C = kmeans.cluster_centers_

fig = plt.figure()
# Axes3D(fig) no longer attaches the axes to the figure in current Matplotlib
# releases, which leaves an empty plot; use the 3d projection instead.
ax = fig.add_subplot(projection='3d')
# Observations, coloured by the cluster they were assigned to
ax.scatter(x2[:, 0], x2[:, 1], x2[:, 2], c=labels)
# Cluster centres as large near-black markers
ax.scatter(C[:, 0], C[:, 1], C[:, 2], c='#050505', s=1000)
plt.show()



In [None]:
# Silhouette figures for the 3-feature data, trying 2 through 6 clusters.
# The scatter panel of each figure shows only the first two features.
silhouette_plot(x2, 6)


In [None]:
%matplotlib inline
from scipy.cluster.hierarchy import dendrogram, linkage

# Only 10 samples here, presumably so the dendrogram leaves stay readable.
# They get their own names so the 400-sample `x` / `y` used by the cells
# above are not overwritten, which would break re-running those cells.
x_small, y_small = make_blobs(n_samples=10, n_features=2, centers=3)
print (x_small)
print (y_small)

# Build the hierarchical-clustering linkage matrix with Ward's method and
# draw it as a dendrogram.
z = linkage(x_small, 'ward')
dendrogram(z, leaf_rotation = 90, leaf_font_size=12)
# Ending on plt.show() keeps the large dict returned by dendrogram() out of
# the cell output.
plt.show()

#End of notebook