# **Unsupervised Machine Learning on Wholesale Customers Data**

In [None]:

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **Reading the CSV data file and creating the data frame**

In [None]:
import pandas as pd

# Load the wholesale customers dataset from the Kaggle input directory.
csv_path = '../input/wholesale-customers-data-set/Wholesale customers data.csv'
wholesale_data = pd.read_csv(csv_path)

# Peek at five randomly chosen rows while every attribute is still present.
print(wholesale_data.sample(5))
print("\n\n")

# Remove the 'Channel' and 'Region' attributes:
#   * 'Channel' represents the hotel, cafe or retail store
#   * 'Region' represents the customer region
# Dropping them won't affect the clustering, because the goal is to relate
# customers to the products they buy in order to maximize the business.
wholesale_data = wholesale_data.drop(columns=['Channel', 'Region'])
print(wholesale_data.head(5))

# **Now to check for null values in the dataset**

In [None]:
# Print a concise summary of the wholesale customer dataset (columns, dtypes
# and non-null counts) so missing values can be spotted at a glance.
wholesale_data.info()

# **Since the above attributes have non-null values we do not need to do null checks for the attributes**

# **Let's take a look at the basic statistical data**


In [None]:
# Summary statistics for every remaining column of the dataset.
wholesale_data.describe()

# Plotting the results of describe: we can observe that mean >> median in almost all cases, i.e. the distributions are skewed


In [None]:
# Horizontal bar chart comparing the mean with the median (50% quantile) of each feature.
summary_stats = wholesale_data.describe().T
summary_stats[['mean', '50%']].plot.barh(figsize=(10, 6))

# **Now let us perform 'Standardization' and 'Decomposition'** **(Pre processing)**

# **Previously we removed the features 'Channel' & 'Region'. Those values have a low magnitude, whereas the remaining features (Fresh, Milk, Grocery, Frozen, Detergents_Paper, Delicassen) have a much higher magnitude. It is necessary to bring these features to the same scale, because the K-means algorithm is distance based and features with a larger magnitude would otherwise dominate the distance computation. Hence a similar magnitude is preferred.**

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize every feature so that its sample mean is 0 and its std is 1.
standard_scaler = StandardScaler()
scaled_data = standard_scaler.fit_transform(wholesale_data)

# Fit a 6-component PCA purely to inspect how much variance each component keeps;
# only the fitted explained_variance_ratio_ is used, so the projection is not needed.
_temp_PCA = PCA(n_components=6)
_temp_PCA.fit(scaled_data)

component_ids = range(1, 7)
plt.bar(component_ids, _temp_PCA.explained_variance_ratio_, color='black')
plt.title('Variance ratio by 6 features')
plt.xlabel('PCA dims')
plt.ylabel('variance preserve')
plt.xticks(component_ids)

# Project the standardized data onto its first two principal components;
# this 2-D representation is what the clustering cells below are trained on.
training_PCA_data = PCA(n_components=2).fit_transform(scaled_data)

# print(scaled_data)

In [None]:
# Summary statistics of the standardized features (means should be ~0, std ~1).
# Column labels come from the frame that was scaled, so they always match the
# data even if the set of dropped/kept attributes changes.
pd.DataFrame(scaled_data, columns=wholesale_data.columns).describe()

In [None]:
# Wrap the 2-D PCA projection in a DataFrame with named components 'd1' and 'd2'.
pca_component_names = ['d1', 'd2']
training_PCA_DF = pd.DataFrame(data=training_PCA_data, columns=pca_component_names)
training_PCA_DF.head()

In [None]:
# Scatter plot of the two PCA components; the low alpha makes dense regions appear darker.
training_PCA_DF.plot.scatter(
    x='d1',
    y='d2',
    alpha=.1,
    s=100,
    color='BLACK',
    figsize=(8, 5),
)

# **Now to determine the number of clusters for the K-means algorithm, this can be calculated in 2 ways:**

# 1. Elbow Method
# 2. Silhouette Method

# **Using Elbow Method**

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

# Using the elbow method to determine the number of clusters for the K-Means algorithm.
# WCSS = Within Cluster Sum of Squares (read from the fitted model's `inertia_`).
cluster_range = range(1, 25)
wcss = []
for i in cluster_range:
    km = KMeans(n_clusters=i, init='k-means++',
                max_iter=300, n_init=10, random_state=0)
    km.fit(training_PCA_DF)
    wcss.append(km.inertia_)

# Create the figure BEFORE plotting: calling plt.figure() after the plot
# commands opens a new, empty figure, so the requested size would never be
# applied to the elbow curve and a blank figure would be rendered as well.
plt.figure(figsize=(10, 5))
plt.plot(cluster_range, wcss)
plt.legend(['wcss'])
plt.title('The Elbow Method', fontsize=20)
plt.xlabel('Number of Clusters')
plt.ylabel('wcss')
plt.show()

# **Using Silhouette Method**

In [None]:
from sklearn.metrics import silhouette_score

# Silhouette method: compute the average silhouette score for several
# candidate cluster counts and plot the scores against the cluster count.
sil_avg = []
cluster_numbers = [4, 5, 6, 7]
print("Average Silhouette Method\n")
for one_cluster in cluster_numbers:
    # Fix random_state / n_init (same values as the elbow-method cell) so the
    # scores are reproducible across runs; an unseeded KMeans can give
    # different silhouette scores every time the cell is executed.
    cluster = KMeans(n_clusters=one_cluster, n_init=10, random_state=0)
    cluster_labels = cluster.fit_predict(training_PCA_DF)
    silhouette_avg = silhouette_score(training_PCA_DF, cluster_labels)
    sil_avg.append([one_cluster, silhouette_avg])
    print(f"For clusters = {one_cluster}")
    print(f"The average silhouette score for {one_cluster} is = {silhouette_avg}")

# Column 0 holds the cluster count, column 1 the matching average score.
sil_avg = np.array(sil_avg)
plt.plot(sil_avg[:, 0], sil_avg[:, 1], linestyle='dashed')
plt.xlabel('Number of Clusters')
plt.ylabel('avg_sil')
plt.legend(['avg_sil'])
plt.show()

# **As we can see from the above averages, most values are closer to 5, so the number of clusters is fixed at 5**

# **Now we perform the K-Means clustering and plot the results in a scatterplot**

In [None]:
# Show ten randomly chosen rows of the 2-D PCA data that will be clustered.
print(training_PCA_DF.sample(10))

In [None]:
# Settings shared by the clustering cells: number of clusters and the seed.
clusters_K_Means = 5
random_state_K_Means = 0

# Fit K-Means on the 2-D PCA data, then keep both the predicted labels and
# the labels stored on the fitted model.
kmean = KMeans(n_clusters=clusters_K_Means, random_state=random_state_K_Means)
kmean.fit(training_PCA_DF)
kmean_Y = kmean.predict(training_PCA_DF)
lab = kmean.labels_

# Scatter plot of the two PCA components, coloured by assigned cluster.
fig, ax = plt.subplots(figsize=(10, 5))
ax.set_title(f"K- Means with cluster value = {clusters_K_Means}", fontsize=15)
ax.scatter(
    training_PCA_DF['d1'],
    training_PCA_DF['d2'],
    c=kmean_Y,
    s=105,
    alpha=0.6,
    marker='o',
)
ax.set_xlabel("X Axis")
ax.set_ylabel("Y Axis")
plt.show()

In [None]:
import scipy.cluster.hierarchy as shc
from scipy.cluster.hierarchy import dendrogram
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering with Ward linkage on the 2-D PCA data.
# The `affinity` keyword is deliberately not passed: it was deprecated in
# scikit-learn 1.2 and removed in 1.4 (passing it raises a TypeError there).
# Ward linkage only works with Euclidean distances, which is the default,
# so omitting the argument gives the same clustering on every version.
agglometric_clustering = AgglomerativeClustering(n_clusters=clusters_K_Means, linkage='ward')
agglometric_clustering_y = agglometric_clustering.fit_predict(training_PCA_DF)

# Scatter plot of the PCA components coloured by agglomerative cluster label.
plt.figure(figsize=(10, 5))
plt.scatter(training_PCA_DF['d1'], training_PCA_DF['d2'], c=agglometric_clustering_y, s=80, alpha=0.6, marker='o')
plt.title('Agglomerative Clustering', fontsize=20)
plt.show()

# Dendrogram of the Ward linkage tree, truncated to the top 5 levels.
plt.figure(figsize=(10, 5))
plt.title('Agglomerative Clustering : Dendrogram', fontsize=20)
dend = shc.dendrogram(shc.linkage(training_PCA_DF, method='ward'), truncate_mode='level', p=5)
plt.show()

In [None]:
# Fit Ward-linkage agglomerative clustering again and display the label array.
# `affinity` is omitted because it was deprecated in scikit-learn 1.2 and
# removed in 1.4; Euclidean distance (the only option for Ward) is the default.
cluster = AgglomerativeClustering(n_clusters=clusters_K_Means, linkage='ward')
cluster.fit_predict(training_PCA_DF)

In [None]:
from sklearn.cluster import Birch


# BIRCH clustering on the 2-D PCA data, using the same cluster count as K-Means.
birch_clustering = Birch(
    branching_factor=500,
    n_clusters=clusters_K_Means,
    threshold=1.5,
)
birch_clustering.fit(training_PCA_DF)
labels = birch_clustering.predict(training_PCA_DF)

# Scatter plot of the PCA components coloured by BIRCH cluster label.
plt.title('Birch Clustering', fontsize=20)
plt.scatter(
    training_PCA_DF['d1'],
    training_PCA_DF['d2'],
    c=labels,
    alpha=0.6,
    marker='o',
    s=150,
)

In [None]:
from sklearn.cluster import MiniBatchKMeans

# Mini-batch K-Means clustering on the 2-D PCA data, using the same cluster
# count and seed as the K-Means cell.
minibatch_clustering = MiniBatchKMeans(n_clusters=clusters_K_Means, random_state=random_state_K_Means)
minibatch_clustering.fit(training_PCA_DF)
labels = minibatch_clustering.predict(training_PCA_DF)

# Create the figure BEFORE drawing: calling plt.figure() after plt.scatter()
# opens a new, empty figure, so the requested size would not apply to the
# scatter plot and an extra blank figure would be shown.
plt.figure(figsize=(20, 15))
plt.title('MiniBatchKMeans clustering', fontsize=20)
plt.scatter(training_PCA_DF['d1'], training_PCA_DF['d2'], c=labels, alpha=0.6, marker='o', s=150)
plt.show()