# **🗃️ Data Lab**

Useful dataset: [Mall customer dataset](https://drive.google.com/file/d/1jACghC-it0Hlb091tc2BH_RgrFWOXXih/view?usp=sharing)


## Generate samples 🎯

In [None]:
import time
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.preprocessing import StandardScaler

# @markdown \

# ============
# Parameters
# ============

# Number of points to generate; used as `n_samples` by every generator below.
n_samples = 500 # @param {type:"integer"}
# Which synthetic dataset to build; selects one branch of the if/elif chain below.
type_dataset = "blobs" # @param ["noisy_circles", "noisy_moons", "blobs", "no_structure", "anisotropic", "varied_var"]
# Noise level: forwarded to make_circles / make_moons, and used as a factor of
# the uniform perturbation added to the blob-based datasets.
noise = 0.04 # @param {type:"slider", min:0, max:0.5, step:0.01}
# Angle in degrees; its tangent is the shear factor of the "anisotropic" dataset.
angle_aniso = 100 # @param {type:"slider", min:0, max:180, step:10}
# Seed forwarded to the scikit-learn generators.
random_state = 2 # @param {type:"integer"}


# ============
# Generate datasets. The sample size should be large enough to show how the
# algorithms scale, yet small enough to keep running times short.
# ============

# Build the 2-D point cloud `X` for the selected `type_dataset`.
#
# All extra randomness (uniform perturbations, the "no_structure" cloud) is
# drawn from a generator seeded with `random_state`, so re-running the cell
# with the same parameters reproduces exactly the same dataset. Previously
# these draws came from the unseeded global NumPy generator.
rng = np.random.RandomState(random_state)

if type_dataset == "noisy_circles":
  X, _ = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=noise, random_state=random_state)

elif type_dataset == "noisy_moons":
  X, _ = datasets.make_moons(n_samples=n_samples, noise=noise, random_state=random_state)

elif type_dataset == "blobs":
  X, _ = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
  # NOTE(review): the perturbation is scaled by X.min(), so its sign and
  # magnitude follow the smallest coordinate of the data — confirm intended.
  X += rng.rand(n_samples, 2) * noise * X.min()

elif type_dataset == "no_structure":
  # Uniform points in the unit square; `noise` is not used here.
  X = rng.rand(n_samples, 2)

elif type_dataset == "anisotropic":
  X, _ = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
  # Shear the blobs: x' = x + tan(angle) * y, y' = y.
  # NOTE: angle_aniso = 90 makes tan() blow up (~1.6e16), which collapses the
  # first coordinate onto a scaled copy of the second one.
  t = np.tan(np.radians(angle_aniso))
  transformation = np.array(((1, t), (0, 1))).T
  X = np.dot(X, transformation)
  X += rng.rand(n_samples, 2) * noise * X.min()

elif type_dataset == "varied_var":
  # Three blobs with different spreads.
  X, _ = datasets.make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state)
  X += rng.rand(n_samples, 2) * noise * X.min()

else:
  # Fail here instead of leaving X undefined (or silently reusing a stale X
  # from a previous run of the notebook).
  raise ValueError("Unknown type_dataset: {0!r}".format(type_dataset))

# Standardize each coordinate to zero mean and unit variance.
X = StandardScaler().fit_transform(X)

# Quick visual check of the generated, standardized point cloud.
_, ax = plt.subplots(figsize=(5, 4))
x_coords, y_coords = X[:, 0], X[:, 1]
ax.scatter(x_coords, y_coords, edgecolors="k")

print(f"\nData shape: {X.shape} \n")

## Load a dataset 📑

In [None]:
# @markdown ---

# @markdown \
# @markdown ### 🔼 Upload your file (first)
# @markdown \

# @markdown ---
# @markdown ### Enter path to **.csv** file:
# Path of the CSV file read with pd.read_csv below.
file_path = "/content/Mall_Customers.csv" # @param {type:"string"}

# Column used as first feature and as horizontal axis of the scatter plot.
var_h = "Spending Score (1-100)" # @param {type:"string"}
# Column used as second feature and as vertical axis of the scatter plot.
var_v = "Annual Income (k$)" # @param {type:"string"}
# Column used to colour the points; an empty string disables colouring.
labels = "Gender" # @param {type:"string"}
# Scaling applied to the two features; "None" leaves them unchanged.
normalization = "MinMax [0,1]" # @param ["MinMax [0,1]", "MinMax [-1,1]", "Z-Score", "None"]


import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import warnings
warnings.filterwarnings("ignore")

# Read the CSV and stack the two selected columns into a two-column array.
data = pd.read_csv(file_path)
feature_h = np.array(data[var_h])
feature_v = np.array(data[var_v])
X = np.column_stack((feature_h, feature_v))
y = None if labels == "" else np.array(data[labels])

# One scaler per form option; "None" has no entry, so X is left untouched.
scalers = {
  "MinMax [0,1]": MinMaxScaler(),
  "MinMax [-1,1]": MinMaxScaler(feature_range=(-1, 1)),
  "Z-Score": StandardScaler(),
}
scaler = scalers.get(normalization)
if scaler is not None:
  X = scaler.fit_transform(X)

# Write the (possibly rescaled) features back so the plot shows the same
# values that are stored in X.
data[var_h] = X[:, 0]
data[var_v] = X[:, 1]

hue_column = labels if labels != "" else None
_, ax = plt.subplots(figsize=(5, 4))
sns.scatterplot(
  ax=ax,
  data=data,
  x=var_h,
  y=var_v,
  hue=hue_column,
  palette="colorblind",
)
print("\nData Loaded! ✅ \n")

 # **📋 Set up Agglomerative Clustering**

In [None]:
# @markdown ---
# @markdown \

# @markdown ## Compute dendrogram
# @markdown \

# @markdown ---

import scipy.cluster.hierarchy as shc

# Draw the merge tree obtained from a Ward linkage over X.
plt.figure(figsize=(8, 6))
plt.title("Dendrogram")
ward_linkage = shc.linkage(X, method="ward")
Dendrogram = shc.dendrogram(ward_linkage)
plt.show()


In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score
import warnings
warnings.filterwarnings('ignore')

# @markdown \

# ============
# Parameters
# ============

# Number of clusters requested from the agglomerative model.
n_clusters = 2 # @param {type:"integer"}
# NOTE(review): not used anywhere in the visible cells — confirm before removing.
max_steps = 1 # @param {type:"integer"}
# Linkage criterion passed to AgglomerativeClustering.
linkage = "ward" # @param ["average", "complete", "single", "ward"]
# distance_threshold = 1.2 # @param

# Unfitted model; the training cell calls fit() on it.
agglom = AgglomerativeClustering(n_clusters=n_clusters, linkage=linkage)

# Collects one average silhouette score per training run; re-running this
# cell resets the history.
silhouette_scores = []
print ("\nModel is ready!\n")

# **🤖 Run training!**

In [None]:

# @markdown ### Start now 🦾
# @markdown \

# Fit the model once and reuse the fitted labels. Calling fit_predict() after
# fit() rebuilt the whole hierarchy a second time to obtain the same labels.
agglom.fit(X)
cluster_labels = agglom.labels_

# Average silhouette over all samples for this run, appended to the history.
silhouette_average_score = silhouette_score(X, cluster_labels)
silhouette_scores.append(silhouette_average_score)

print ("\nTraining done! ✅")


In [None]:

# @markdown ### 📊 Display result
# @markdown \

# Colour each point by the cluster assigned during training. The fitted
# labels are reused instead of calling fit_predict(), which would silently
# refit the model every time the plot is drawn.
plt.scatter(X[:, 0], X[:, 1], c=agglom.labels_)
# Report the number of clusters of the fitted model, so the title matches
# the plotted labels.
plt.title("{0} clusters".format(agglom.n_clusters_))
plt.show()

