## HDBSCAN for mixed data

In [None]:
import random

import numpy as np
import pandas as pd

from sklearn.preprocessing import RobustScaler
from sklearn.metrics import silhouette_score

import seaborn as sns
import matplotlib.pyplot as plt

import umap
import umap.plot as uplot

import hdbscan
import gower

# Fix the seeds for reproducibility. random.seed() only seeds Python's
# built-in `random` module; NumPy's global generator is separate and has to
# be seeded on its own.
random.seed(42)
np.random.seed(42)

# Apply seaborn's default theme to all subsequent matplotlib figures
sns.set_theme()

### The dataset 

**Used Phones & Tablets Pricing Dataset**

This dataset provides comprehensive details on various used mobile phones and tablets.

1. **device_brand**: The manufacturer or brand name of the device.
2. **os**: The operating system running on the device.
3. **screen_size**: The size of the device's screen in cm.
4. **4g**: Boolean indicating whether the device supports 4G connectivity.
5. **5g**: Boolean indicating whether the device supports 5G connectivity.
6. **rear_camera_mp**: The resolution of the rear camera, measured in megapixels.
7. **front_camera_mp**: The resolution of the front camera, also in megapixels.
8. **internal_memory**: Storage capacity of the device in gigabytes.
9. **ram**: The amount of random-access memory (RAM) in gigabytes.
10. **battery**: Battery capacity, typically measured in milliampere-hours (mAh).
11. **weight**: The weight of the device, usually in grams.
12. **release_year**: The year the device was released.
13. **days_used**: The number of days the device has been used.
14. **normalized_used_price**: The price of the used device, normalized to a standard scale.
15. **normalized_new_price**: The original price of the device when new, normalized to a standard scale.

Citation: https://www.kaggle.com/datasets/ahsan81/used-handheld-device-data/data

In [None]:
# Read the used-device table from disk and preview its first rows
csv_path = 'data/phone_data/used_device_data.csv'
phone_df = pd.read_csv(csv_path)
phone_df.head()

In [None]:
# Size of the freshly loaded table as (n_rows, n_columns)
phone_df.shape

In [None]:
# Get feature info: column names, dtypes and non-null counts
phone_df.info()

In [None]:
# Count the missing entries in each column
missing_per_column = phone_df.isna().sum(axis='index')
missing_per_column

In [None]:
# Preview up to 20 rows that contain at least one missing value
has_missing = phone_df.isna().any(axis=1)
phone_df.loc[has_missing].head(20)

In [None]:
# Keep only fully observed rows: any row with a missing value is removed
phone_df = phone_df.dropna(axis='index', how='any')

In [None]:
# Categorical features are the object- and bool-typed columns
cat_cols = phone_df.select_dtypes(include=['object', 'bool']).columns

# Two panels per row; round up so an odd number of columns still fits
n_rows = (len(cat_cols) + 1) // 2

# Grid of subplots, flattened so panels can be paired with columns directly
fig, axs = plt.subplots(n_rows, 2, figsize=(15, 5*n_rows))
axs = axs.ravel()

# One count plot per categorical column
for ax, col in zip(axs, cat_cols):
    sns.countplot(x=col, data=phone_df, ax=ax)
    ax.set_title(f'Bar plot of {col}')
    # Rotate the tick labels so long category names stay readable
    ax.tick_params(axis='x', rotation=75)

# With an odd number of columns the last panel stays empty: remove it
for spare_ax in axs[len(cat_cols):]:
    fig.delaxes(spare_ax)

# Tidy the spacing between panels and render the figure
plt.tight_layout()
plt.show()

In [None]:
# Plot numerical features: one histogram per numeric column, before any
# scaling is applied. The return value is bound to `_` (presumably so the
# cell output is only the figure).
_ = phone_df.hist(figsize=(10,10))

In [None]:
# Prices are excluded from clustering so that the clusters reflect phone
# characteristics only. An untouched copy (prices included) is kept for
# later use.
price_cols = ['normalized_used_price', 'normalized_new_price']
original_phone_df = phone_df.copy()
phone_df = phone_df.drop(columns=price_cols)

In [None]:
# Names of the numeric columns that get rescaled
numeric_col_names = list(phone_df.select_dtypes(include='number').columns)

# Robust-scale the numeric columns in place, then re-plot their histograms
scaler = RobustScaler()
phone_df[numeric_col_names] = scaler.fit_transform(phone_df[numeric_col_names])
_ = phone_df.hist(figsize=(10, 10))

### HDBSCAN clustering

In [None]:
# Boolean mask over phone_df's columns: True where the column is one of the
# categorical columns collected earlier in cat_cols
categ_cool_bool = [col in cat_cols for col in phone_df.columns]

# Pairwise Gower distances over the mixed numeric/categorical features
dist_matrix = gower.gower_matrix(phone_df, cat_features=categ_cool_bool)

# UMAP embedding of the precomputed distances. random_state is fixed so the
# embedding (and every plot built on it) is the same on each run; seeding
# Python's `random` module alone does not make UMAP deterministic.
game_umap = umap.UMAP(
    metric='precomputed',
    n_neighbors=30,
    min_dist=0.3,
    random_state=42,
).fit(dist_matrix)

uplot.points(game_umap)

In [None]:
# Run HDBSCAN directly on the precomputed Gower distances.
# The matrix is cast to float64 before fitting
# (presumably required by hdbscan's precomputed path; confirm).
hdbscan_params = dict(
    min_samples=10,
    min_cluster_size=30,
    gen_min_span_tree=True,
    metric='precomputed',
)
clusterer = hdbscan.HDBSCAN(**hdbscan_params)
clusterer.fit(dist_matrix.astype(np.double))

# Cluster labels and how many points carry each one
np.unique(clusterer.labels_, return_counts=True)

In [None]:
# Evaluate the clustering.
# DBCV validity index, computed from the full distance matrix and all labels.
# With per_cluster_scores=True the result is indexable: element 0 is the
# overall score printed below, element 1 holds the per-cluster scores.
dbcv = hdbscan.validity.validity_index(
    dist_matrix.astype(np.double),
    labels=clusterer.labels_,
    metric='precomputed',
    d=phone_df.shape[1],
    per_cluster_scores=True,
)

# The silhouette is computed on clustered points only (label -1 marks noise)
non_noise_idx = np.where(clusterer.labels_ != -1)[0]
non_noise_labels = clusterer.labels_[non_noise_idx]

# silhouette_score raises ValueError unless at least two distinct clusters
# are present, so report NaN instead of crashing when HDBSCAN finds fewer.
if len(set(non_noise_labels)) >= 2:
    sil_score = silhouette_score(
        dist_matrix[non_noise_idx, :][:, non_noise_idx],
        non_noise_labels,
        metric='precomputed',
    )
else:
    sil_score = np.nan

print('DBCV : {}'.format(dbcv[0]))
print('Silouethe : {}'.format(sil_score))

# Colour the UMAP embedding by cluster label (noise included)
uplot.points(game_umap, labels=clusterer.labels_)

In [None]:
from ch_10_funcs import plot_silouethes_dens

# Silhouette plot from the project helper, fed the float64 Gower matrix as
# precomputed distances together with the HDBSCAN labels.
# NOTE(review): the helper is defined in ch_10_funcs; its handling of noise
# labels is not visible here, so confirm there.
plot_silouethes_dens(
    figsize=(12, 20),
    distance_measure='precomputed',
    clusters=clusterer.labels_,
    data_df=dist_matrix.astype(np.double),
)

In [None]:
# Per-cluster DBCV scores: the second element returned by validity_index
# when it is called with per_cluster_scores=True
per_cluster_dbcv = dbcv[1]
pd.DataFrame(per_cluster_dbcv)

### Characterize clusters

In [None]:
# Characterise clusters on the original (unscaled) features.
# The numeric column names are collected first, so the 'cluster' label column
# added below is not treated as a numeric feature.
numeric_col_names = list(original_phone_df.select_dtypes(include='number').columns)

# Attach the HDBSCAN labels, then keep only clustered rows (label -1 is noise)
is_clustered = clusterer.labels_ != -1
original_phone_df['cluster'] = clusterer.labels_
original_phone_df = original_phone_df.loc[is_clustered]

In [None]:
# Plot every feature against the cluster labels:
#   - numeric features     -> one box plot per cluster
#   - categorical features -> stacked bars of within-cluster proportions

# Hatch patterns used to tell bars apart when a feature has many categories
patterns = ["|", "\\", "/", "+", ".", "*", "x", "o"]

for feature in original_phone_df.columns:

    # The label column itself is not a feature
    if feature == 'cluster':
        continue

    if feature in numeric_col_names:
        plt.figure(figsize=(14, 6))
        # Pass the frame as data= (as the count plots above do) rather than
        # relying on the positional parameter order of sns.boxplot
        sns.boxplot(data=original_phone_df, x='cluster', y=feature)
        plt.title(feature)
        # Render immediately, like the categorical branch below
        plt.show()
        continue

    # Share of each category within every cluster (rows: clusters,
    # columns: categories)
    proportions = (
        original_phone_df
        .groupby('cluster')[feature]
        .value_counts(normalize=True)
        .unstack()
    )

    # Create stacked bar plot
    ax = proportions.plot(kind='bar', stacked=True, figsize=(14, 6))
    ax.set_title(f'Cluster distribution for {feature}')
    ax.set_ylabel('Proportion')

    # With more than 15 categories, add hatches on top of the colours
    if proportions.shape[1] > 15:
        for i, bar_container in enumerate(ax.containers):
            hatch = patterns[i % len(patterns)]  # Cycle through patterns
            for bar in bar_container:
                bar.set_hatch(hatch)

    # Add legend, anchored just outside the right edge of the axes
    ax.legend(title=feature, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()