## HDBSCAN for categorical data

In [None]:
# !{sys.executable} -m pip install hdbscan
import pandas as pd
import numpy as np
import seaborn as sns

import hdbscan
from hdbscan.validity import validity_index

import umap
import umap.plot as uplot

from sklearn.metrics import silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score

import matplotlib.pyplot as plt

# Apply seaborn's default theme to the plots produced in this notebook
sns.set_theme()

### Load animal zoo dataset

The Zoo Dataset is a comprehensive collection of data about various animals found in zoos worldwide. The dataset is composed of several attributes related to these animals, such as:

- `animal_name`: The name of the animal.

- `hair`: Indicates if the animal has hair (1 for yes, 0 for no).
- `feathers`: Indicates if the animal has feathers (1 for yes, 0 for no).
- `eggs`: Indicates if the animal lays eggs (1 for yes, 0 for no).
- `milk`: Indicates if the animal produces milk (1 for yes, 0 for no).
- `airborne`: Indicates if the animal can fly (1 for yes, 0 for no).
- `aquatic`: Indicates if the animal lives in water (1 for yes, 0 for no).
- `predator`: Indicates if the animal is a predator (1 for yes, 0 for no).
- `toothed`: Indicates if the animal has teeth (1 for yes, 0 for no).
- `backbone`: Indicates if the animal has a backbone (1 for yes, 0 for no).
- `breathes`: Indicates if the animal breathes air (1 for yes, 0 for no).
- `venomous`: Indicates if the animal is venomous (1 for yes, 0 for no).
- `fins`: Indicates if the animal has fins (1 for yes, 0 for no).
- `legs`: Number of legs the animal has (integer value).
- `tail`: Indicates if the animal has a tail (1 for yes, 0 for no).
- `domestic`: Indicates if the animal is domesticated (1 for yes, 0 for no).
- `catsize`: Indicates if the animal is cat-sized or larger (1 for yes, 0 for no).
- `class_type`: Numerical code indicating the animal's taxonomic class.


Citation: Forsyth, Richard. (1990). Zoo. UCI Machine Learning Repository. https://doi.org/10.24432/C5R59V.

In [None]:
# Locations of the animal feature table and of the class-id lookup table
zoo_path = 'data/hierarchical/zoo/zoo.csv'
class_mappings = 'data/hierarchical/zoo/class.csv'

# Read both CSV files (features first, then the class mapping)
zoo_df, class_mapper = (
    pd.read_csv(csv_path) for csv_path in (zoo_path, class_mappings)
)

In [None]:
# Preview the first rows of the feature table
zoo_df.head()

In [None]:
# Display the class lookup table loaded from class.csv
class_mapper

In [None]:
# (number of rows, number of columns) of the raw table
zoo_df.shape

In [None]:
# Count missing values per column
zoo_df.isna().sum(axis=0)

In [None]:
# Check the number of unique animal names (compare with the row count above
# to detect duplicated animals)
zoo_df['animal_name'].nunique()

In [None]:
# Drop duplicates since there should be one species per row.
# Rows are compared on 'animal_name' only, so rows sharing a name are
# collapsed even if their other attributes differ.
zoo_df = zoo_df.drop_duplicates('animal_name')
# Show the shape after de-duplication
zoo_df.shape

In [None]:
# Map class id to class name
class_map_dict = class_mapper.set_index('Class_Number')['Class_Type'].to_dict()

# Build the class-name labels as a separate Series rather than writing the
# mapped column back into zoo_df first: zoo_df is the result of a row filter
# (drop_duplicates), and assigning a column to it can raise pandas'
# SettingWithCopyWarning. The column is removed from zoo_df right away anyway.
animal_class = zoo_df['class_type'].map(class_map_dict)

# Keep only the features in zoo_df (the class is the held-out reference label)
zoo_df = zoo_df.drop(columns=['class_type'])

In [None]:
# Set animal name as index; drop=True removes it from the columns so that
# zoo_df.values holds only the feature columns
zoo_df = zoo_df.set_index('animal_name', drop=True)

In [None]:
# Histogram of every feature column; the result is bound to `_` so the
# returned axes objects are not echoed as the cell output
_ = zoo_df.hist(figsize=(7,7))
# Adjust subplot spacing so titles and tick labels do not overlap
plt.tight_layout()

In [None]:
# Fit a UMAP embedding of the feature matrix using Hamming distance and
# 20 neighbours; the fitted object is reused below to plot class labels and
# cluster labels on the same layout.
# NOTE(review): no random_state is set, so the layout may differ between runs.
zoo_umap = umap.UMAP(metric='hamming', n_neighbors=20).fit(zoo_df.values)

In [None]:
# Plot the embedding coloured by the reference animal class
uplot.points(zoo_umap, labels=animal_class)

### HDBSCAN clustering

In [None]:
def print_clustering_stats(clusterer, data_df, data_labels, metric):
    """Print agreement and quality metrics for a fitted clusterer.

    Points whose cluster label is ``-1`` are treated as noise. Metrics
    marked "sub" are computed on the non-noise points only; the others use
    every point, with the noise label counted as a label of its own.

    Parameters
    ----------
    clusterer
        Fitted object exposing a ``labels_`` sequence, one label per row.
    data_df
        Array-like of samples (rows), passed to ``silhouette_score`` and
        ``validity_index``. Converted with ``np.asarray``.
    data_labels
        Array-like of reference labels, one per row of ``data_df``.
    metric
        Distance metric forwarded to ``silhouette_score`` and
        ``validity_index``.

    Returns
    -------
    None
        The statistics are printed, not returned.
    """
    cluster_labels = np.asarray(clusterer.labels_)
    samples = np.asarray(data_df)
    reference_labels = np.asarray(data_labels)

    # Boolean mask of the points that were assigned to a cluster (not noise)
    clustered = cluster_labels != -1

    clustered_labels = cluster_labels[clustered]
    clustered_reference = reference_labels[clustered]
    clustered_samples = samples[clustered]
    noise_size = cluster_labels.shape[0] - clustered_labels.shape[0]

    print('ARI : {}'.format(adjusted_rand_score(cluster_labels, reference_labels)))
    print('ARI sub : {}'.format(adjusted_rand_score(clustered_labels, clustered_reference)))
    print('noise size : {}'.format(noise_size))
    print('Silouethe : {}'.format(silhouette_score(samples, cluster_labels, metric=metric)))
    # Score the *cluster* labels on the non-noise subset, so this is the same
    # statistic as the line above restricted to clustered points (scoring the
    # reference labels here would not describe the clustering at all).
    print('Silouethe sub : {}'.format(silhouette_score(clustered_samples, clustered_labels, metric=metric)))
    print('DBCV : {}'.format(validity_index(samples, cluster_labels, metric=metric)))
    
    

In [None]:
# First run: clusters of at least 10 points, min_samples=5,
# Hamming distance on the animal features
clusterer = hdbscan.HDBSCAN(
    metric='hamming',
    min_cluster_size=10,
    min_samples=5,
    gen_min_span_tree=True,
).fit(zoo_df.values)

# Compare the clustering with the reference animal classes
print_clustering_stats(
    clusterer,
    zoo_df.values,
    animal_class.values,
    metric='hamming',
)

# Show the cluster assignment on the UMAP embedding fitted earlier
uplot.points(zoo_umap, labels=clusterer.labels_)

In [None]:
# Second run: smaller clusters allowed (at least 7 points) and
# min_samples=2, Hamming distance on the animal features
clusterer = hdbscan.HDBSCAN(
    metric='hamming',
    min_cluster_size=7,
    min_samples=2,
    gen_min_span_tree=True,
).fit(zoo_df.values)

# Compare the clustering with the reference animal classes
print_clustering_stats(
    clusterer,
    zoo_df.values,
    animal_class.values,
    metric='hamming',
)

# Show the cluster assignment on the UMAP embedding fitted earlier
uplot.points(zoo_umap, labels=clusterer.labels_)