In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler, MinMaxScaler, PowerTransformer, MaxAbsScaler, Normalizer, QuantileTransformer
import seaborn as sns
from sklearn import decomposition
import plotly.express as px

# Load the wholesale customers CSV from the Kaggle input directory and preview its first rows.
csv_path = "/kaggle/input/wholesale-customers-data-set/Wholesale customers data.csv"
data = pd.read_csv(csv_path)
data.head()

## Look at "Region" categorical composition

In [None]:
# Count the rows per Region code and relabel the codes with readable names.
region_counts = data['Region'].value_counts()
regions = pd.DataFrame(region_counts).rename(index={1: 'Lisbon', 2: 'Oporto', 3: 'Other'})
print('Region Bar Plot')
# Transposed so that every region is drawn as its own bar series.
regions.T.plot.bar()

## Drop "Region" because it is very incomplete: a large share of customers are labelled "Other"

In [None]:
# Working frame: the raw data without the Region column (the raw `data` frame is left untouched).
df = data.drop(columns=['Region'])

## Look for obvious outliers

In [None]:
# One box per column of the working frame, on a wide canvas, to spot extreme values.
outlier_fig, outlier_ax = plt.subplots(figsize=(16, 6))
sns.boxplot(data=df, ax=outlier_ax)

## Drop obvious outliers

In [None]:
# A row is treated as an outlier when any of these columns holds the listed
# hard-coded value; matching rows are removed from the working frame in place.
is_outlier = (
    df['Fresh'].eq(112151)
    | df['Milk'].eq(73498)
    | df['Grocery'].eq(92780)
    | df['Frozen'].eq(60869)
    | df['Delicassen'].eq(47943)
)
df.drop(index=df.index[is_outlier], inplace=True)

## Calculate a PCA on the full standardized data, and look for obvious clusters

In [None]:
# Standardize every column of the working frame, then project it onto 3 principal components.
dfSTD = pd.DataFrame(StandardScaler().fit_transform(df))
pca = decomposition.PCA(n_components=3)

# NumPy 1.19.5 can raise a random non-convergence error on the first pass, so retry.
# The retry is bounded and only catches the linear-algebra error (presumably what the
# non-convergence surfaces as — TODO confirm), so a genuine problem such as a typo,
# bad input or a keyboard interrupt is reported instead of spinning forever.
max_pca_attempts = 5
for pca_attempt in range(1, max_pca_attempts + 1):
    try:
        XSTD = pca.fit_transform(dfSTD)
        break
    except np.linalg.LinAlgError:
        if pca_attempt == max_pca_attempts:
            raise
print("Explained Variance ratio:", pca.explained_variance_ratio_)
XSTD.shape

In [None]:
# Interactive 3-D scatter of the three principal components of the standardized data.
pc1, pc2, pc3 = XSTD[:, 0], XSTD[:, 1], XSTD[:, 2]
fig = px.scatter_3d(x=pc1, y=pc2, z=pc3, width=1200, height=900)
fig.show()

## Look at "Channel" categorical composition

In [None]:
# Count the rows per Channel code and relabel the codes with readable names.
channel_counts = data['Channel'].value_counts()
channels = pd.DataFrame(channel_counts).rename(index={1: 'HoReCa', 2: 'Retail'})
print('Channels Bar Plot')
# Transposed so that every channel is drawn as its own bar series.
channels.T.plot.bar()

# First divide by channel and visualize product mix differences

In [None]:
# Split the working frame by channel (1 = HoReCa, 2 = Retail) and discard the
# Channel column from each part. The HoReCa part is later clustered to try to
# tell Hotels, Restaurants and Cafés apart.
is_horeca = df['Channel'] == 1
is_retail = df['Channel'] == 2
dfHoReCa = df.loc[is_horeca].drop(columns=['Channel'])
dfRetail = df.loc[is_retail].drop(columns=['Channel'])

# Side-by-side boxplots on a shared y-axis so the two channels can be compared directly.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(25, 10))
ax2.set_title('Retail')
ax1.set_title('Hotels / Restaurants / Cafés')
sns.boxplot(data=dfHoReCa, ax=ax1)
sns.boxplot(data=dfRetail, ax=ax2)

# "HoReCa Channel" Analysis and Clustering

In [None]:
# dfcolumns = dfHoReCa.columns
# ### Use QuantileTransformer if we want to force it to suggest a k=3 elbow ###
# dfHoReCa=pd.DataFrame(QuantileTransformer().fit_transform(dfHoReCa))
# dfHoReCa.columns = dfcolumns

In [None]:
# Elbow method for the HoReCa channel: fit K-Means for k = 1..9 and record the
# inertia of each fit. K-Means initialisation is random, so the model is seeded
# (42, the same seed the final models in this notebook use) to make the curve
# reproducible from run to run.
distortions = []
K = range(1, 10)
for k in K:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(dfHoReCa)
    distortions.append(model.inertia_)
print(distortions)
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('Elbows')
plt.show()

## Disregard the suggested k=2 Elbow, and use k=3 in order to *try* to classify into Hotels vs Restaurants vs Cafés

In [None]:
# Final seeded 3-cluster model for the HoReCa channel: fit it, label every row,
# and print the cluster centres.
kmeans = KMeans(n_clusters=3, max_iter=1000, random_state=42).fit(dfHoReCa)
predict = kmeans.predict(dfHoReCa)
centroids = kmeans.cluster_centers_
print(centroids)


## Bar Chart with Totals and Percentages (HoReCa Channel)

In [None]:
# Two panels sharing the y-axis (one row per cluster): the centroid values on
# the left and each centroid's composition in percent on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(25, 10))
ax1.set_title('Totals')
ax2.set_title('Percentages')

# One row per cluster centroid, one column per feature of the HoReCa frame.
group_labels = ['Group 1: Restaurants?', 'Group 2: Cafés?', 'Group 3: Hotels?']
groups = pd.DataFrame(centroids, index=group_labels, columns=dfHoReCa.columns)
stacked_data = groups
stacked_data.plot.barh(stacked=False, ax=ax1)
# Rescale every row so that its values add up to 100.
stacked_data2 = groups.apply(lambda row: row * 100 / sum(row), axis=1)
stacked_data2.plot.barh(stacked=True, ax=ax2)

## Alternative view with boxplots (HoReCa Channel)

In [None]:
# Per-cluster boxplots of the HoReCa features, one panel per K-Means cluster.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(25, 10))
cluster_titles = ('Group 1: Restaurants?', 'Group 2: Cafés?', 'Group 3: Hotels?')
# Rows are picked with a boolean mask on the label array rather than by adding a
# `predict` column to a frame named `data`: that overwrote the raw dataset loaded
# at the top of the notebook (so the Region/Channel cells failed when re-run)
# and drew the cluster label itself as an extra, meaningless box.
for cluster_id, (ax, title) in enumerate(zip((ax1, ax2, ax3), cluster_titles)):
    ax.set_title(title)
    sns.boxplot(data=dfHoReCa[predict == cluster_id], ax=ax)

## PCA (HoReCa)

In [None]:
# Optional standardization, left disabled:
#dfHoReCa=pd.DataFrame(StandardScaler().fit_transform(dfHoReCa))
# Project the HoReCa frame onto 3 principal components.
pca = decomposition.PCA(n_components=3)

# NumPy 1.19.5 can raise a random non-convergence error on the first pass, so retry.
# The retry is bounded and only catches the linear-algebra error (presumably what the
# non-convergence surfaces as — TODO confirm), so a genuine problem such as a typo,
# bad input or a keyboard interrupt is reported instead of spinning forever.
max_pca_attempts = 5
for pca_attempt in range(1, max_pca_attempts + 1):
    try:
        XHoReCa = pca.fit_transform(dfHoReCa)
        break
    except np.linalg.LinAlgError:
        if pca_attempt == max_pca_attempts:
            raise
print("Explained Variance ratio:", pca.explained_variance_ratio_)
XHoReCa.shape

## Visualize HoReCa Clusters

In [None]:
# Interactive 3-D scatter of the HoReCa principal components, coloured by cluster label.
pc1, pc2, pc3 = XHoReCa[:, 0], XHoReCa[:, 1], XHoReCa[:, 2]
fig = px.scatter_3d(x=pc1, y=pc2, z=pc3, color=predict, width=1200, height=900)
fig.show()

# "Retail Channel" Analysis and Clustering

In [None]:
# Apply a power transform and then min-max scaling to the Retail features.
# The transformers return plain arrays, so the original column names are
# captured first and re-attached to the resulting frame.
dfcolumns = dfRetail.columns
retail_power = PowerTransformer().fit_transform(dfRetail)
retail_scaled = MinMaxScaler().fit_transform(retail_power)
dfRetail = pd.DataFrame(retail_scaled, columns=dfcolumns)

In [None]:
# Elbow method for the Retail channel: fit K-Means for k = 1..9 and record the
# inertia of each fit. K-Means initialisation is random, so the model is seeded
# (42, the same seed the final models in this notebook use) to make the curve
# reproducible from run to run.
distortions = []
K = range(1, 10)
for k in K:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(dfRetail)
    distortions.append(model.inertia_)
print(distortions)
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('Elbows')
plt.show()

## Disregard the clearly suggested k=3 Elbow, and use k=2 instead, since this is a channel subset with very few samples

In [None]:
# Final seeded 2-cluster model for the Retail channel: fit it, label every row,
# and print the cluster centres.
kmeans = KMeans(n_clusters=2, max_iter=1000, random_state=42).fit(dfRetail)
predict = kmeans.predict(dfRetail)
centroids = kmeans.cluster_centers_
print(centroids)

## Bar Chart with Totals and Percentages (Retail Channel)

In [None]:
# Two panels sharing the y-axis (one row per cluster): the centroid values on
# the left and each centroid's composition in percent on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(25, 10))
ax1.set_title('Totals')
ax2.set_title('Percentages')

# One row per cluster centroid, one column per feature of the Retail frame.
group_labels = ['Group 1', 'Group 2']
groups = pd.DataFrame(centroids, index=group_labels, columns=dfRetail.columns)
stacked_data = groups
stacked_data.plot.barh(stacked=False, ax=ax1)
# Rescale every row so that its values add up to 100.
stacked_data2 = groups.apply(lambda row: row * 100 / sum(row), axis=1)
stacked_data2.plot.barh(stacked=True, ax=ax2)

## Alternative view with boxplots (Retail Channel)

In [None]:
# Per-cluster boxplots of the (transformed) Retail features, one panel per K-Means cluster.
fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(25, 10))
cluster_titles = ('Group 1', 'Group 2')
# Rows are picked with a boolean mask on the label array rather than by adding a
# `predict` column to a frame named `data`: that overwrote the raw dataset loaded
# at the top of the notebook (so the Region/Channel cells failed when re-run)
# and drew the cluster label itself as an extra, meaningless box.
for cluster_id, (ax, title) in enumerate(zip((ax1, ax2), cluster_titles)):
    ax.set_title(title)
    sns.boxplot(data=dfRetail[predict == cluster_id], ax=ax)

## PCA (Retail)

In [None]:
# Optional standardization, left disabled:
#dfRetail=pd.DataFrame(StandardScaler().fit_transform(dfRetail))
# Project the Retail frame onto 3 principal components.
pca = decomposition.PCA(n_components=3)

# NumPy 1.19.5 can raise a random non-convergence error on the first pass, so retry.
# The retry is bounded and only catches the linear-algebra error (presumably what the
# non-convergence surfaces as — TODO confirm), so a genuine problem such as a typo,
# bad input or a keyboard interrupt is reported instead of spinning forever.
max_pca_attempts = 5
for pca_attempt in range(1, max_pca_attempts + 1):
    try:
        XRetail = pca.fit_transform(dfRetail)
        break
    except np.linalg.LinAlgError:
        if pca_attempt == max_pca_attempts:
            raise
print("Explained Variance ratio:", pca.explained_variance_ratio_)
XRetail.shape

## Visualize Retail Clusters

In [None]:
# Interactive 3-D scatter of the Retail principal components, coloured by cluster label.
pc1, pc2, pc3 = XRetail[:, 0], XRetail[:, 1], XRetail[:, 2]
fig = px.scatter_3d(x=pc1, y=pc2, z=pc3, color=predict, width=1200, height=900)
fig.show()

# "Full Dataset" Analysis and Clustering

In [None]:
# Remove the Channel column before clustering the full dataset.
# errors='ignore' keeps the cell idempotent: re-running it after the column is
# already gone no longer raises a KeyError.
df = df.drop(columns=['Channel'], errors='ignore')

In [None]:
# dfcolumns = df.columns
# # df=pd.DataFrame(StandardScaler().fit_transform(df))
# df=pd.DataFrame(MinMaxScaler().fit_transform(df))
# df.columns = dfcolumns

In [None]:
# Elbow method for the full dataset: fit K-Means for k = 1..9 and record the
# inertia of each fit. K-Means initialisation is random, so the model is seeded
# (42, the same seed the final models in this notebook use) to make the curve
# reproducible from run to run.
distortions = []
K = range(1, 10)
for k in K:
    model = KMeans(n_clusters=k, random_state=42)
    model.fit(df)
    distortions.append(model.inertia_)
print(distortions)
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('Elbows')
plt.show()

## Use the suggested k=3

In [None]:
# Final seeded 3-cluster model for the full dataset: fit it, label every row,
# and print the cluster centres.
kmeans = KMeans(n_clusters=3, max_iter=1000, random_state=42).fit(df)
predict = kmeans.predict(df)
centroids = kmeans.cluster_centers_
print(centroids)

## Bar Chart with Totals and Percentages (Full Dataset)

In [None]:
# Two panels sharing the y-axis (one row per cluster): the centroid values on
# the left and each centroid's composition in percent on the right.
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(25, 10))
ax1.set_title('Totals')
ax2.set_title('Percentages')

# One row per cluster centroid, one column per feature of the working frame.
group_labels = ['Group 1: Big Fresh', 'Group 2: Small Balanced', 'Group 3: Milk/Groceries']
groups = pd.DataFrame(centroids, index=group_labels, columns=df.columns)
stacked_data = groups
stacked_data.plot.barh(stacked=False, ax=ax1)
# Rescale every row so that its values add up to 100.
stacked_data2 = groups.apply(lambda row: row * 100 / sum(row), axis=1)
stacked_data2.plot.barh(stacked=True, ax=ax2)

## Alternative view with boxplots (Full Dataset)

In [None]:
# Per-cluster boxplots of the full-dataset features, one panel per K-Means cluster.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, figsize=(25, 10))
cluster_titles = ('Group 1: Big Fresh', 'Group 2: Small Balanced', 'Group 3: Milk/Groceries')
# Rows are picked with a boolean mask on the label array rather than by adding a
# `predict` column to a frame named `data`: that overwrote the raw dataset loaded
# at the top of the notebook (so the Region/Channel cells failed when re-run)
# and drew the cluster label itself as an extra, meaningless box.
for cluster_id, (ax, title) in enumerate(zip((ax1, ax2, ax3), cluster_titles)):
    ax.set_title(title)
    sns.boxplot(data=df[predict == cluster_id], ax=ax)

## PCA (Full Dataset)

In [None]:
# Optional standardization, left disabled:
#df=pd.DataFrame(StandardScaler().fit_transform(df))
# Project the full working frame onto 3 principal components.
pca = decomposition.PCA(n_components=3)

# NumPy 1.19.5 can raise a random non-convergence error on the first pass, so retry.
# The retry is bounded and only catches the linear-algebra error (presumably what the
# non-convergence surfaces as — TODO confirm), so a genuine problem such as a typo,
# bad input or a keyboard interrupt is reported instead of spinning forever.
max_pca_attempts = 5
for pca_attempt in range(1, max_pca_attempts + 1):
    try:
        XFull = pca.fit_transform(df)
        break
    except np.linalg.LinAlgError:
        if pca_attempt == max_pca_attempts:
            raise
print("Explained Variance ratio:", pca.explained_variance_ratio_)
XFull.shape

## Visualize Full dataset clusters

In [None]:
# Interactive 3-D scatter of the full-dataset principal components, coloured by cluster label.
pc1, pc2, pc3 = XFull[:, 0], XFull[:, 1], XFull[:, 2]
fig = px.scatter_3d(x=pc1, y=pc2, z=pc3, color=predict, width=1200, height=900)
fig.show()

In [None]:
# import sys
# np.set_printoptions(threshold=sys.maxsize)
# print(XFull)