# **🗃️ Data Lab**

Useful dataset: [Customer analysis dataset](https://drive.google.com/file/d/1mTKbGB_PJbnilFuf42RFVhkEzACvQkdd/view?usp=drive_link) and [annotations](https://docs.google.com/spreadsheets/d/1CiAxAvS9nYp5NBFv7vG5XBw39a2KBRRx/edit?usp=sharing&ouid=107921194674515097266&rtpof=true&sd=true)


## Generate samples 🎯

In [None]:
import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.preprocessing import StandardScaler

# @markdown \

# ============
# Parameters
# ============

n_samples = 500 # @param {type:"integer"}
type_dataset = "blobs" # @param ["noisy_circles", "noisy_moons", "blobs", "no_structure", "anisotropic", "varied_var"]
noise = 0.04 # @param {type:"slider", min:0, max:0.5, step:0.01}
angle_aniso = 100 # @param {type:"slider", min:0, max:180, step:10}
random_state = 2 # @param {type:"integer"}


# ============
# Generate the selected synthetic 2-D dataset, standardize it and plot it.
# ============

# Dedicated generator seeded with `random_state`: the extra noise and the
# "no_structure" samples are then reproducible, which the unseeded global
# np.random state would not guarantee.
rng = np.random.RandomState(random_state)


def _add_uniform_noise(points):
  """Return `points` plus uniform [0, 1) noise scaled by `noise * points.min()`."""
  return points + rng.rand(*points.shape) * noise * points.min()


if type_dataset == "noisy_circles":
  X, _ = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=noise, random_state=random_state)

elif type_dataset == "noisy_moons":
  X, _ = datasets.make_moons(n_samples=n_samples, noise=noise, random_state=random_state)

elif type_dataset == "blobs":
  X, _ = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
  X = _add_uniform_noise(X)

elif type_dataset == "no_structure":
  X = rng.rand(n_samples, 2)

elif type_dataset == "anisotropic":
  X, _ = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
  # Shear the blobs: the tangent of the chosen angle is the shear factor.
  t = np.tan(np.radians(angle_aniso))
  transformation = np.array(((1, t), (0, 1))).T
  X = np.dot(X, transformation)
  X = _add_uniform_noise(X)

elif type_dataset == "varied_var":
  X, _ = datasets.make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state)
  X = _add_uniform_noise(X)

else:
  # Without this branch an unknown name would leave X undefined (or stale
  # from a previous run of the cell).
  raise ValueError("Unknown type_dataset: {0!r}".format(type_dataset))

X = StandardScaler().fit_transform(X)

_, ax = plt.subplots(figsize=(5, 4))
ax.scatter(X[:, 0], X[:, 1], edgecolors='k')

print("\nData shape: {0} \n".format(X.shape))

## Load a dataset 📑

In [None]:
# @markdown ---

# @markdown \
# @markdown ### 🔼 Upload your file (first)
# @markdown \

# @markdown ---
# @markdown ### Enter the path to the **.csv** file:
file_path = "/content/customer_segmentation.csv" # @param {type:"string"}

var_h = "Income" # @param {type:"string"}
var_v = "Age" # @param {type:"string"}
labels = "Sex" # @param {type:"string"}
normalization = "None" # @param ["MinMax [0,1]", "MinMax [-1,1]", "Z-Score", "None"]
Load_all_data = True # @param {type:"boolean"}

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

data = pd.read_csv(file_path)

# Feature matrix: every column of the file, or only the two plotted variables.
if Load_all_data:
  X = np.array(data)
else:
  X = np.c_[np.array(data[var_h]), np.array(data[var_v])]

# Label column captured before any normalization is written back into `data`.
y = np.array(data[labels]) if labels != "" else None

# Scaler for each normalization choice; "None" leaves the data untouched.
scalers = {
  "MinMax [0,1]": MinMaxScaler(),
  "MinMax [-1,1]": MinMaxScaler(feature_range=(-1, 1)),
  "Z-Score": StandardScaler(),
}

if normalization in scalers:
  X = scalers[normalization].fit_transform(X)

  # Write the scaled values back so the plot below shows normalized data.
  # Whole columns are replaced (data[name] = ...) rather than set in place
  # with .iloc: putting floats into integer columns in place is a deprecated
  # silent upcast in recent pandas, and warnings are suppressed in this cell.
  scaled_columns = list(data.columns) if Load_all_data else [var_h, var_v]
  for col_idx, col_name in enumerate(scaled_columns):
    data[col_name] = X[:, col_idx]

elif normalization != "None":
  raise ValueError("Unknown normalization: {0!r}".format(normalization))

_, ax = plt.subplots(figsize=(5, 4))
sns.scatterplot(ax=ax, data=data, x=var_h, y=var_v, hue=labels if labels != "" else None, palette='colorblind')
print("\nData Loaded! ✅")
print(" - Shape: {0}\n".format(X.shape))

# **📋 Determine the right number of clusters**

In [None]:
from sklearn import cluster
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import silhouette_score
from IPython.display import clear_output
import scipy.cluster.hierarchy as shc

# @markdown \

method = "Silhouette (Both)" # @param ["Elbow (KMeans only)", "Silhouette (Both)", "Dendogram (Agglom only)"]
min_n_clusters = 2 # @param {type:"integer"}
max_n_clusters = 9 # @param {type:"integer"}
model = "Kmeans" # @param ["Kmeans", "MiniBatchKMeans", "AgglomerativeClustering"]
random_state = 4 # @param {type:"integer"}
metric = "euclidean" # @param ["euclidean", "cityblock", "cosine", "l1", "l2", "chebyshev", "mahalanobis"]
linkage = "ward" # @param ["average", "complete", "single", "ward"]

if min_n_clusters > max_n_clusters:
  raise ValueError("min_n_clusters must not exceed max_n_clusters")


def _make_clusterer(n_cluster):
  """Return an unfitted estimator of the selected `model` with `n_cluster` clusters."""
  if model == "Kmeans":
    return cluster.KMeans(n_clusters=n_cluster, random_state=random_state)
  if model == "MiniBatchKMeans":
    return cluster.MiniBatchKMeans(n_clusters=n_cluster, random_state=random_state)
  if model == "AgglomerativeClustering":
    return cluster.AgglomerativeClustering(n_clusters=n_cluster, linkage=linkage)
  raise ValueError("Unknown model: {0!r}".format(model))


# Candidate cluster counts; both bounds are included.
clusters = np.arange(min_n_clusters, max_n_clusters + 1)
elbow_distances = []
silhouette_distances = []

if method == "Elbow (KMeans only)":
  # The elbow curve needs centroids, which only the KMeans variants expose.
  if model == "AgglomerativeClustering":
    raise ValueError("Model must be either Kmeans or MiniBatchKMeans")

  for n_cluster in clusters:
    algo = _make_clusterer(n_cluster)
    predictions = algo.fit_predict(X)

    # Mean Euclidean distance from each sample to its assigned centroid.
    assigned_centroids = algo.cluster_centers_[predictions]
    elbow_distances.append(np.linalg.norm(X - assigned_centroids, axis=1).mean())

  # Plot the elbow
  _, ax = plt.subplots(figsize=(7, 5))
  ax.plot(clusters, elbow_distances, marker="x")
  ax.set_title("Elbow")
  # Clear the output produced so far in this cell.
  clear_output()

elif method == "Silhouette (Both)":
  for n_cluster in clusters:
    algo = _make_clusterer(n_cluster)
    predictions = algo.fit_predict(X)

    # Labels are passed as the 1-D array returned by fit_predict.
    silhouette_distances.append(silhouette_score(X, predictions, metric=metric))

  # Plot the silhouette
  _, ax = plt.subplots(figsize=(7, 5))
  ax.plot(clusters, silhouette_distances, marker="x")
  ax.set_title("Silhouette")
  # Clear the output produced so far in this cell.
  clear_output()

elif method == "Dendogram (Agglom only)":
  if model != "AgglomerativeClustering":
    raise ValueError("Model must be AgglomerativeClustering")

  plt.figure(figsize=(7, 5))
  plt.title('Dendrogram')
  # Build the hierarchy with the linkage criterion selected in the form.
  Dendrogram = shc.dendrogram(shc.linkage(X, method=linkage))
  plt.show()


print("\nDone! ✅ \n")

 # **🤖 Setup Model**

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# @markdown \

# ============
# Parameters
# ============

model_type = "Kmeans" # @param ["Kmeans", "MiniBatchKMeans", "AgglomerativeClustering"]
n_clusters = 2 # @param {type:"integer"}
max_steps = 300 # @param {type:"integer"}
linkage = "ward" # @param ["average", "complete", "single", "ward"]

# Imported here too, so this cell does not depend on the cluster-count cell
# having been run first.
from sklearn import cluster

# NOTE: `random_state` is not a parameter of this cell; the value used is
# whatever an earlier cell last assigned.
if model_type == "AgglomerativeClustering":
  algo = cluster.AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)
elif model_type == "Kmeans":
  # max_steps is forwarded as the iteration cap so the printed setting is
  # the one actually used.
  algo = cluster.KMeans(n_clusters=n_clusters, max_iter=max_steps, random_state=random_state)
elif model_type == "MiniBatchKMeans":
  algo = cluster.MiniBatchKMeans(n_clusters=n_clusters, max_iter=max_steps, random_state=random_state)
else:
  # Fail loudly instead of silently keeping an `algo` from a previous run.
  raise ValueError("Unknown model_type: {0!r}".format(model_type))


print("\nModel is ready!⚙️🔧\n")
print(" - Model: {0}".format(model_type))
print(" - n_clusters: {0}".format(n_clusters))
if model_type == "AgglomerativeClustering":
  # The agglomerative model is built without an iteration cap, so only its
  # linkage is reported.
  print(" - linkage: {0}".format(linkage))
else:
  print(" - max_steps: {0}".format(max_steps))

# **🦾 Run training!**

In [None]:

# @markdown ### Start now!
# @markdown \

# Fit the estimator held in `algo` on the current feature matrix `X`.
algo.fit(X)

print("\nTraining done! ✅")


# **💡 Analyze and Look for Insights!**

In [None]:

# @markdown ### 📊 Display result
# @markdown \

plot_type = "Boxplot" # @param ["Scatter", "Boxplot"]
h_axis_name = "Occupation" # @param {type:"string"}
v_axis_name = "Age" # @param {type:"string"}

if plot_type not in ("Scatter", "Boxplot"):
  raise ValueError("Unknown plot_type: {0!r}".format(plot_type))

# Reuse the labels of the already-fitted model rather than training it a
# second time. Fall back to fitting when the model has no labels yet or they
# do not match the number of rows in `data`.
cluster_labels = getattr(algo, "labels_", None)
if cluster_labels is None or len(cluster_labels) != len(data):
  cluster_labels = algo.fit_predict(X)

# Plain assignment creates the column at the end or overwrites an existing one.
data["cluster"] = cluster_labels

_, ax = plt.subplots(figsize=(5, 4))

if plot_type == "Scatter":
  sns.scatterplot(ax=ax, data=data, x=h_axis_name, y=v_axis_name, hue="cluster", palette='colorblind')
else:
  sns.boxplot(ax=ax, data=data, x=h_axis_name, y=v_axis_name, hue="cluster", palette='colorblind')


ax.set_title("{0} clusters".format(n_clusters))
plt.show()


In [None]:

# @markdown ### 📊 Plot all variables
# @markdown \

cluster_id = 1 # @param {type:"integer"}

# Select the rows assigned to the requested cluster and plot their histograms.
in_cluster = data['cluster'] == cluster_id
cluster_rows = data[in_cluster]
cluster_rows.hist(figsize=(12, 10), grid=False, alpha=0.5)
plt.show()