# Chateau Winery (A): Unsupervised Learning

## Step 1. Import libraries

In [None]:
#Import all the standard libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Import specific libraries for k-Means clustering
from sklearn.cluster import ? # ? KMeans

from sklearn.metrics import silhouette_score

#suppress warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Step 1: Read the CSV file "wine_data.csv" into the wine_data dataframe
wine_data = pd.?("wine_data.csv") # ? read_csv

# Step 2: Display the first ten rows of the data
wine_data.head(?) # ? 10

In [None]:
# Check the shape of the data (number of rows, number of columns)
wine_data.? # ? shape

## Step 2: Visualize data

Before we start clustering, let's begin by creating a scatterplot of our data. 

In [None]:
# Scatter plot of the raw observations:
# Pinot Noir on the x-axis, Champagne on the y-axis
plt.scatter(wine_data['Pinot Noir'], wine_data['Champagne'])

# Label both axes, then give the chart its title
plt.xlabel("Pinot Noir")
plt.ylabel("Champagne")
plt.title("Exhibit 2A: The Data")

plt.show()

## Step 3. Clustering

### K-Means cluster with k = 2
Next we can easily apply the k-means algorithm in Python using tools from the  `sklearn.cluster`  module we imported at the beginning of the notebook.

Initialize clustering
First we need to create a  KMeans  object (which we'll call  kmeans2 ) and specify our desired value for the number of clusters ( k ). We do this using  `KMeans()`, which takes the following arguments:

* `n_clusters` , our desired value for  k .
* `init` , an optional parameter with the starting points for the cluster centroids. If this parameter is left out, the initial centroids will be chosen automatically. See the documentation here for more details.
* `n_init` , an optional parameter that specifies the number of times the algorithm is run with different starting points. Because we are only running with the initial cluster centroids specified on page 3 of the case, we set this parameter to one.

In [None]:
# k = 2, with the two starting centroids passed in explicitly;
# n_init=1 because only this single set of starting points is run
kmeans2 = KMeans(
    n_clusters=2,
    init=np.array([[8, 17], [27, 1]]),
    n_init=1,
)

In [None]:
# KMeans works on array-like input, so select the two feature columns
# ('Pinot Noir' and 'Champagne') and convert them to a NumPy array using .to_numpy()

X_wine = wine_data[['Pinot Noir', 'Champagne']].? # ? to_numpy()


In [None]:
# Check the first 5 rows of the array (one row per observation)

X_wine[?]    # ? 0:5

### Now perform the clustering...

In [None]:
# Fit the model on X_wine; this is the step that generates the clusters

kmeans2.?(X_wine)  # ? fit

In [None]:
# Cluster membership: get the label assigned to each observation

kmeans2.? # ? labels_

### You can look at the centroid of the clusters formed

In [None]:
# Look at the final cluster centers (one row per cluster)

kmeans2.? # ? cluster_centers_

### Now create a scatterplot with different colors based on the clusters and mark the centroid of the clusters

In [None]:
# Optional: enlarge the chart first with plt.figure(figsize=(10,10))

# Observations, color-coded by the cluster each one was assigned to
plt.scatter(wine_data['Pinot Noir'], wine_data['Champagne'], c=kmeans2.labels_)

# Final cluster centroids as large black X markers.
# Transposing cluster_centers_ gives the row of x-coordinates (column 0)
# followed by the row of y-coordinates (column 1), which unpack into scatter's x and y.
plt.scatter(*kmeans2.cluster_centers_.T, marker="X", color='black', s=200)

# Axis labels and chart title
plt.xlabel("Pinot Noir")
plt.ylabel("Champagne")
plt.title("Exhibit 2D: Final Centroid Locations")

plt.show()

In [None]:
# Cluster label assigned to each customer (one entry per row of X_wine)

kmeans2.? # ? labels_

In [None]:
# Store each customer's cluster in a new 'Cluster' column.
# Adding 1 shifts the k-means labels so that cluster numbering starts at 1.

wine_data['Cluster'] = 1 + kmeans2.labels_

wine_data

In [None]:
# Replace the index by the name.
# The guard keeps this cell safe to re-run: once 'name' has become the index
# it is no longer a column, and a second set_index('name') would raise a KeyError.

if 'name' in wine_data.columns:
    wine_data.set_index('name', inplace = True)

In [None]:
# Generate the silhouette score for the k = 2 solution.
# Score X_wine (the two features the model was fitted on), not wine_data:
# wine_data now also contains the 'Cluster' column, and using the cluster
# assignment itself as a feature would distort the score.

print('Silhouette Score(n=2):',round(silhouette_score(X_wine,kmeans2.labels_),2))

## Step 4. Now you can play a little with different number of clusters and see how they look

In [None]:
k = 2 # <-- replace "2" with your desired number of clusters

# ---- Step 4a + 4c: initialize the model and fit it --------------------------
# (Step 4b, preparing the input, needs no code: X_wine was already created earlier.)
# fit() returns the fitted estimator, so the model can be built and fitted in one expression.
kmeansK = KMeans(n_clusters=k, random_state=123).fit(X_wine)

# ---- Step 4d: view output ----------------------------------------------------
# Observations, color-coded by cluster
plt.scatter(wine_data['Pinot Noir'], wine_data['Champagne'], c=kmeansK.labels_)

# Final cluster centroids in black: the transposed centers unpack into
# the x-coordinates (column 0) and the y-coordinates (column 1)
plt.scatter(*kmeansK.cluster_centers_.T, marker="X", color='black', s=200)

# Chart title and axis labels
plt.title(f"Optimized clusters, k={k}")
plt.xlabel("Pinot Noir")
plt.ylabel("Champagne")

plt.show()

## Step 5. Choosing Clusters through Cluster Quality Measures

Let us try to answer the question of optimum number of clusters, k in this data.

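There are two methods for assessing cluster quality. The first is the Elbow method: WCSS (the within-cluster sum of squares) always decreases as the number of clusters grows, so we look for the "elbow" of the curve, i.e. the number of clusters beyond which adding another cluster no longer reduces WCSS substantially.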

The other method is Silhouette score which is a measure of how similar an object is to its own cluster compared to other clusters.

The overall silhouette score is a measure of cluster separability.

### Method 1: Elbow Method

kmeans.inertia_ => total within-cluster sum of squares (WCSS)

kmeans.inertia_ / i => mean WCSS per cluster (i is the number of clusters)

In [None]:
# Run k-means once per candidate k and record the mean WCSS per cluster.
# range(2, 50) covers k = 2 through 49 (the upper bound is exclusive).

wcss = []  # one entry per value of k, in the same order as range(2, 50)

for i in range(2, 50):
    # Build the model with k = i and fit it; fit() returns the fitted estimator
    kmeansI = KMeans(n_clusters=i, random_state=123).fit(X_wine)

    # inertia_ is the total within-cluster sum of squares;
    # dividing by i stores the mean WCSS per cluster
    wcss.append(kmeansI.inertia_ / i)



In [None]:
# Line chart of the stored WCSS values against k, for k = 2 through 49
# (the x values must match the range used when the list was built).

plt.plot(range(2, 50), wcss)
plt.ylabel('WCSS')
plt.xlabel('Number of clusters')
# Optional: plt.xticks(np.arange(0,50,5)) places an axis tick every 5 clusters
plt.show()

### Method 2: Silhouette Score

In [None]:
silhouette_scores = [] # Initialize empty list to store (k, silhouette score) pairs

# Loop over k = 2 through 49 and calculate the average silhouette score for each k
for i in range(2, 50):

    # Initialize clustering with k = i
    kmeansI = KMeans(n_clusters=i, random_state=123)

    # Fit the model once and get the cluster label of every observation.
    # fit_predict() already fits the model, so a separate fit() call before it
    # would only repeat the same computation.
    labelsI = kmeansI.fit_predict(X_wine)

    # Calculate the silhouette score when k = i
    scoreI = silhouette_score(X_wine, labelsI)

    # Add silhouette score at k = i to silhouette_scores
    silhouette_scores.append((i, scoreI))

In [None]:
# Tabulate the (k, score) pairs: column 0 holds k, column 1 holds the silhouette score
df_scores = pd.DataFrame(data=silhouette_scores)

In [None]:
# Plot the average silhouette score for each value of k
# (column 0 of df_scores is k, column 1 is the score)

plt.plot(df_scores[0],df_scores[1])

# Label the chart and the x and y axes

plt.title("Silhouette Plot")
plt.xlabel("Number of Clusters (k)")
plt.xlim([2,50])
plt.xticks(np.arange(0,51,5))
plt.ylabel("Average Silhouette Score")
plt.show()

## Clustering (An alternative Approach)

### Hierarchical/Agglomerative Clustering

In [None]:
# for hierarchial clustering
# from scipy.cluster.hierarchy import dendrogram, linkage, fcluster

from scipy.cluster.hierarchy import ?, linkage, fcluster # ? dendrogram

In [None]:
# Reload the data from disk and use the 'name' column as the row index,
# giving the hierarchical clustering a fresh copy of the dataframe
wine_data = pd.read_csv("wine_data.csv").set_index('name')

# Preview the dataframe; head() shows the first 5 rows by default
wine_data.head()

In [None]:
# Generate cluster linkages.
# Only the two feature columns are passed in, the same features used for k-means.
# This also keeps the cell safe to re-run after the 'Hierarchical' column has been
# added to wine_data, which would otherwise be picked up as a third feature.

Z = linkage(wine_data[['Pinot Noir', 'Champagne']].to_numpy(), method = 'ward') # method can take different parameter values like 'single', 'complete'

In [None]:
# Cut the linkage tree into at most 3 flat clusters and store each
# row's cluster id in a new "Hierarchical" column of wine_data

wine_data["Hierarchical"] = fcluster(Z, t=3, criterion='maxclust')

# Lift the row display limit so that every row of the dataframe is shown
pd.set_option('display.max_rows', None)

wine_data

In [None]:
# Plot the dendrogram of the linkage matrix Z on a small, high-resolution figure,
# labelling each leaf with the corresponding index value of wine_data
plt.figure(figsize=(5,3),dpi=250)

?(Z, labels = wine_data.index) #? dendrogram

plt.show()

## Compare the output of k-Means and Hierarchical Clustering with 3 clusters
Note: Re-Run the kmeansK with k =3 in step 4 and then execute this code

In [None]:
k = 3 # <-- initialize k with your desired number of clusters

# ---- Step 4a + 4c: initialize the model and fit it --------------------------
# (Step 4b, preparing the input, needs no code: X_wine was already created earlier.)
# fit() returns the fitted estimator, so building and fitting chain into one expression.
kmeansK = KMeans(n_clusters=k, random_state=123).fit(X_wine)

# ---- Step 4d: view output ----------------------------------------------------
# Chart title and axis labels
plt.title(f"Optimized clusters, k={k}")
plt.xlabel("Pinot Noir")
plt.ylabel("Champagne")

# Observations, color-coded by cluster
plt.scatter(wine_data['Pinot Noir'], wine_data['Champagne'], c=kmeansK.labels_)

# Final cluster centroids in black: the transposed centers unpack into
# the x-coordinates (column 0) and the y-coordinates (column 1)
plt.scatter(*kmeansK.cluster_centers_.T, marker="X", color='black', s=200)

plt.show()

In [None]:
# Two side-by-side panels: k-means on the left, hierarchical on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20,10))
fig.suptitle('k-Means vs Hierarchical')

# Left panel: observations colored by their k-means label
ax1.set_title("k-means (k =3)")
ax1.scatter(wine_data['Pinot Noir'], wine_data['Champagne'], c=kmeansK.labels_)

# Right panel: the same observations colored by their hierarchical cluster
ax2.set_title("Hierarchical (n =3)")
ax2.scatter(wine_data['Pinot Noir'], wine_data['Champagne'], c=wine_data["Hierarchical"])

plt.show()

In [None]:
# Keep the k-means labels in the dataframe, next to the "Hierarchical" column,
# so that both assignments can be compared row by row
wine_data["K-Means(k=3)"] = kmeansK.labels_

# Show the first ten rows
wine_data.head(n=10)

### Now plot again with a circle around the differing clusters

In [None]:
# Same side-by-side comparison as before, with circles drawn on the
# hierarchical panel around the differing clusters

fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(20,10))
fig.suptitle('k-Means vs Hierarchical')

# Left panel: observations colored by their k-means label
ax1.set_title("k-means (k =3)")
ax1.scatter(x=wine_data['Pinot Noir'], y=wine_data['Champagne'], c=kmeansK.labels_)

# Right panel: observations colored by their hierarchical cluster
ax2.set_title("Hierarchical (n =3)")
ax2.scatter(x=wine_data['Pinot Noir'], y=wine_data['Champagne'], c=wine_data["Hierarchical"])

# Large hollow blue circles at three hard-coded (Pinot Noir, Champagne) positions
ax2.scatter(x=[24,17,13], y=[24,21,3], facecolors='none', edgecolors='blue', s=300)

plt.show()