In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
import joblib

In [2]:
# Read the dataset from the CSV file located in the current working directory.
csv_path = "dataset.csv"
df = pd.read_csv(csv_path)
# Leave the preview as the last expression so the first rows render as output.
df.head()


Unnamed: 0,Customer_ID,Group,Satisfaction_Score,Age,Gender,Location,Purchase_History,Support_Contacted,Loyalty_Level,Satisfaction_Factor
0,81-237-4704,A,6,55,Male,Phoenix.AZ,Yes,No,Low,Price
1,14-117-0504,A,7,46,Female,Los Angeles.CA,Yes,No,Low,Product Variety
2,21-336-6416,A,7,55,Female,Houston.TX,Yes,Yes,Low,Packaging
3,59-781-3650,A,1,42,Male,Los Angeles.CA,Yes,No,Medium,Price
4,52-712-5734,B,9,37,Male,Austin.TX,Yes,Yes,Medium,Packaging


In [3]:
# Restrict the frame to its int64/float64 columns; these are the columns that
# get scaled and clustered further down.
numeric_dtypes = ["int64", "float64"]
numeric_df = df.select_dtypes(include=numeric_dtypes)

In [4]:
# Drop every row that has a missing value in any of the numeric columns so that
# only complete rows reach scaling and clustering.
# The name is rebound instead of using `inplace=True`: the result is the same
# frame contents, but the cell reads as an explicit transformation and stays
# safe to re-run. Surviving rows keep their original index labels.
numeric_df = numeric_df.dropna()

In [5]:
# Standardize the numeric features: fit the scaler on the data first, then
# transform the very same rows with the fitted scaler.
scaler = StandardScaler()
scaler.fit(numeric_df)
scaled_data = scaler.transform(numeric_df)

In [6]:
# Configure hierarchical (agglomerative) clustering; this cell only builds the
# estimator, it does not fit it.
clustering_options = {
    "n_clusters": 3,  # number of clusters to find; can be changed later
    "linkage": "ward",
}
model = AgglomerativeClustering(**clustering_options)

In [7]:
# Fit the clustering model on the standardized features. The call is kept as the
# cell's last expression so the estimator it returns is rendered as the output.
model.fit(X=scaled_data)

0,1,2
,"n_clusters  n_clusters: int or None, default=2 The number of clusters to find. It must be ``None`` if ``distance_threshold`` is not ``None``.",3
,"metric  metric: str or callable, default=""euclidean"" Metric used to compute the linkage. Can be ""euclidean"", ""l1"", ""l2"", ""manhattan"", ""cosine"", or ""precomputed"". If linkage is ""ward"", only ""euclidean"" is accepted. If ""precomputed"", a distance matrix is needed as input for the fit method. If connectivity is None, linkage is ""single"" and affinity is not ""precomputed"" any valid pairwise distance metric can be assigned. For an example of agglomerative clustering with different metrics, see :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_clustering_metrics.py`. .. versionadded:: 1.2",'euclidean'
,"memory  memory: str or object with the joblib.Memory interface, default=None Used to cache the output of the computation of the tree. By default, no caching is done. If a string is given, it is the path to the caching directory.",
,"connectivity  connectivity: array-like, sparse matrix, or callable, default=None Connectivity matrix. Defines for each sample the neighboring samples following a given structure of the data. This can be a connectivity matrix itself or a callable that transforms the data into a connectivity matrix, such as derived from `kneighbors_graph`. Default is ``None``, i.e, the hierarchical clustering algorithm is unstructured. For an example of connectivity matrix using :class:`~sklearn.neighbors.kneighbors_graph`, see :ref:`sphx_glr_auto_examples_cluster_plot_ward_structured_vs_unstructured.py`.",
,"compute_full_tree  compute_full_tree: 'auto' or bool, default='auto' Stop early the construction of the tree at ``n_clusters``. This is useful to decrease computation time if the number of clusters is not small compared to the number of samples. This option is useful only when specifying a connectivity matrix. Note also that when varying the number of clusters and using caching, it may be advantageous to compute the full tree. It must be ``True`` if ``distance_threshold`` is not ``None``. By default `compute_full_tree` is ""auto"", which is equivalent to `True` when `distance_threshold` is not `None` or that `n_clusters` is inferior to the maximum between 100 or `0.02 * n_samples`. Otherwise, ""auto"" is equivalent to `False`.",'auto'
,"linkage  linkage: {'ward', 'complete', 'average', 'single'}, default='ward' Which linkage criterion to use. The linkage criterion determines which distance to use between sets of observation. The algorithm will merge the pairs of cluster that minimize this criterion. - 'ward' minimizes the variance of the clusters being merged. - 'average' uses the average of the distances of each observation of  the two sets. - 'complete' or 'maximum' linkage uses the maximum distances between  all observations of the two sets. - 'single' uses the minimum of the distances between all observations  of the two sets. .. versionadded:: 0.20  Added the 'single' option For examples comparing different `linkage` criteria, see :ref:`sphx_glr_auto_examples_cluster_plot_linkage_comparison.py`.",'ward'
,"distance_threshold  distance_threshold: float, default=None The linkage distance threshold at or above which clusters will not be merged. If not ``None``, ``n_clusters`` must be ``None`` and ``compute_full_tree`` must be ``True``. .. versionadded:: 0.21",
,"compute_distances  compute_distances: bool, default=False Computes distances between clusters even if `distance_threshold` is not used. This can be used to make dendrogram visualization, but introduces a computational and memory overhead. .. versionadded:: 0.24 For an example of dendrogram visualization, see :ref:`sphx_glr_auto_examples_cluster_plot_agglomerative_dendrogram.py`.",False


In [8]:
# Persist the results of this run to "model.pkl" as a single dict:
#   scaler   - the fitted StandardScaler
#   features - names of the columns that were scaled and clustered
#   labels   - cluster label of each clustered row
#   index    - index label (as in `df`) of each clustered row, in the same
#              order as `labels`
# Rows with missing values are dropped before clustering, so `labels` can be
# shorter than `df`; without `index`, entry i of `labels` could not be traced
# back to the row it belongs to. The pre-existing keys are unchanged, so code
# that already reads this file keeps working.
joblib.dump(
    {
        "scaler": scaler,
        "features": numeric_df.columns.tolist(),
        "labels": model.labels_,
        "index": numeric_df.index.tolist(),
    },
    "model.pkl"
)

['model.pkl']

In [9]:

# Print a short confirmation message for the training run.
completion_message = "✅ Hierarchical clustering training completed"
print(completion_message)

✅ Hierarchical clustering training completed


In [10]:
%%writefile requirements.txt
numpy
pandas
scikit-learn
streamlit
matplotlib
scipy
joblib

Writing requirements.txt


In [11]:
%%writefile runtime.txt
python-3.10

Writing runtime.txt
