## Animal zoo dataset - DBSCAN clustering


In [None]:
# Zoo dataset : https://www.kaggle.com/datasets/uciml/zoo-animal-classification/code?datasetId=586&searchQuery=hier&select=zoo.csv

import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

import umap
import umap.plot as uplot

from sklearn.cluster import DBSCAN 
from sklearn.neighbors import NearestNeighbors
from kneed import KneeLocator
from sklearn.metrics import adjusted_rand_score

from hdbscan.validity import validity_index

# Apply seaborn's default theme to the matplotlib figures drawn in this notebook
sns.set_theme()

The Zoo Dataset is a comprehensive collection of data about various animals found in zoos worldwide. Each row describes one animal using the following attributes:

- `animal_name`: The name of the animal.

- `hair`: Indicates if the animal has hair (1 for yes, 0 for no).
- `feathers`: Indicates if the animal has feathers (1 for yes, 0 for no).
- `eggs`: Indicates if the animal lays eggs (1 for yes, 0 for no).
- `milk`: Indicates if the animal produces milk (1 for yes, 0 for no).
- `airborne`: Indicates if the animal can fly (1 for yes, 0 for no).
- `aquatic`: Indicates if the animal lives in water (1 for yes, 0 for no).
- `predator`: Indicates if the animal is a predator (1 for yes, 0 for no).
- `toothed`: Indicates if the animal has teeth (1 for yes, 0 for no).
- `backbone`: Indicates if the animal has a backbone (1 for yes, 0 for no).
- `breathes`: Indicates if the animal breathes air (1 for yes, 0 for no).
- `venomous`: Indicates if the animal is venomous (1 for yes, 0 for no).
- `fins`: Indicates if the animal has fins (1 for yes, 0 for no).
- `legs`: Number of legs the animal has (integer value).
- `tail`: Indicates if the animal has a tail (1 for yes, 0 for no).
- `domestic`: Indicates if the animal is domesticated (1 for yes, 0 for no).
- `catsize`: Indicates if the animal is cat-sized or larger (1 for yes, 0 for no).
- `class_type`: Numerical code indicating the animal's taxonomic class.


Citation: Forsyth, Richard. (1990). Zoo. UCI Machine Learning Repository. https://doi.org/10.24432/C5R59V.

In [None]:
# Locations of the animal feature table and of the class-id -> class-name lookup
zoo_path = 'data/hierarchical/zoo/zoo.csv'
class_mappings = 'data/hierarchical/zoo/class.csv'

# Read both CSV files into DataFrames (animal records, then the class lookup)
zoo_df, class_mapper = (
    pd.read_csv(csv_path) for csv_path in (zoo_path, class_mappings)
)

In [None]:
# Preview the first rows to check that the columns were parsed as expected
zoo_df.head()

In [None]:
# Display the class lookup table loaded from class.csv
class_mapper

In [None]:
# (rows, columns) of the table as loaded, before any cleaning
zoo_df.shape

In [None]:
# Count missing values in each column (row-wise sum of the null mask)
zoo_df.isnull().sum()

In [None]:
# Check number of unique animals: a value below the row count reported by
# zoo_df.shape means at least one animal name appears on more than one row
zoo_df['animal_name'].nunique()

In [None]:
# There should be one species per row: for each repeated animal name keep the
# first occurrence and discard the rest, then confirm the new row count
zoo_df = zoo_df.drop_duplicates(subset=['animal_name'], keep='first')
zoo_df.shape

In [None]:
# Build the lookup {class id -> class name} from the class table
class_map_dict = class_mapper.set_index('Class_Number')['Class_Type'].to_dict()

# Translate the ids into a standalone Series instead of writing the result back
# into zoo_df: zoo_df is the output of drop_duplicates(), and assigning a column
# to it can raise SettingWithCopyWarning on pandas versions without copy-on-write.
animal_class = zoo_df['class_type'].map(class_map_dict)

# .map() silently produces NaN for ids that are absent from the lookup; fail
# loudly here rather than carrying NaN labels into the plots and scores below
unmapped_ids = zoo_df.loc[animal_class.isna(), 'class_type'].unique()
assert len(unmapped_ids) == 0, f"class ids without a name in class.csv: {unmapped_ids}"

# Keep only the features in zoo_df; the class is held out as the ground truth
zoo_df = zoo_df.drop(columns=['class_type'])

In [None]:
# Move the animal name out of the columns and use it as the row label
# (set_index drops the source column by default)
zoo_df = zoo_df.set_index(keys='animal_name')

In [None]:
# Draw one histogram per column of zoo_df; the result is assigned to `_` so the
# returned axes array is not echoed as cell output
_ = zoo_df.hist(figsize=(7,7))
# Re-space the subplots so titles and tick labels do not overlap
plt.tight_layout()

### DBSCAN

In [None]:
# 2-D UMAP embedding used for the scatter plots in this notebook. The metric is
# Hamming (fraction of features on which two animals differ); every other UMAP
# hyper-parameter is left at its default value.
# UMAP's optimisation is stochastic, so random_state is pinned to make the
# layout identical on every Restart & Run All.
umap_obj = umap.UMAP(metric='hamming', random_state=42)
embedding = umap_obj.fit_transform(zoo_df.to_numpy())

# Colour each embedded animal by its true class
uplot.points(umap_obj, labels=animal_class)

In [None]:
# MinPts heuristic: number of feature columns plus one
min_pts = zoo_df.shape[1] + 1

# k-distance curve. The fitted data is also the query set, so every animal is
# counted among its own neighbours (distance 0); the last column is therefore
# the distance to the min_pts'th point of the neighbourhood, self included,
# which is the same counting convention DBSCAN's min_samples uses.
knn = NearestNeighbors(metric='hamming').fit(zoo_df)
distances, _ = knn.kneighbors(zoo_df, n_neighbors=min_pts)
sort_dist = np.sort(distances[:, -1])

# Find and plot the knee of the ascending curve as a candidate value for eps
kneedle = KneeLocator(
    x=range(len(sort_dist)),
    y=sort_dist,
    S=1.0,
    curve="convex",
    direction="increasing",
)
kneedle.plot_knee()

In [None]:
# Cluster on Hamming distance with the current min_pts; eps is hard-coded
# (presumably read off the knee plot for this min_pts - confirm)
dbscan = DBSCAN(eps=0.12, min_samples=min_pts, metric='hamming').fit(zoo_df)

# Colour the UMAP embedding by DBSCAN label (label -1 marks noise points)
uplot.points(umap_obj, labels=dbscan.labels_)

In [None]:
# Alternative MinPts heuristic: natural log of the number of animals,
# truncated to an integer
min_pts = int(np.log(len(zoo_df)))
min_pts

In [None]:
# k-distance curve for the current min_pts. The fitted data is also the query
# set, so every animal is counted among its own neighbours (distance 0) and the
# last column is the distance to the min_pts'th point of its neighbourhood.
knn = NearestNeighbors(metric='hamming').fit(zoo_df)
distances, _ = knn.kneighbors(zoo_df, n_neighbors=min_pts)

# Sort the distances to the min_pts'th neighbor
sort_dist = np.sort(distances[:, -1])

# Find and plot the knee of the ascending curve as a candidate value for eps
kneedle = KneeLocator(
    x=range(len(sort_dist)),
    y=sort_dist,
    S=1.0,
    curve="convex",
    direction="increasing",
)
kneedle.plot_knee()

In [None]:
# Re-cluster with the current min_pts and a larger, hard-coded eps
# (presumably read off the knee plot for this min_pts - confirm)
dbscan = DBSCAN(eps=0.19, min_samples=min_pts, metric='hamming').fit(zoo_df)

# Colour the UMAP embedding by DBSCAN label (label -1 marks noise points)
uplot.points(umap_obj, labels=dbscan.labels_)

In [None]:
# Re-cluster with a tighter eps; this is the labelling scored in the cells below.
# NOTE(review): if zoo_df holds the 16 feature columns listed in the dataset
# description, Hamming distances are multiples of 1/16 = 0.0625, so eps=0.124
# admits at most one differing feature (2/16 = 0.125 is excluded), exactly like
# the earlier eps=0.12 - confirm this is intended.
dbscan = DBSCAN(eps=0.124, min_samples=min_pts, metric='hamming').fit(zoo_df)

# Colour the UMAP embedding by DBSCAN label (label -1 marks noise points)
uplot.points(umap_obj, labels=dbscan.labels_)

In [None]:
# Score the most recent DBSCAN labelling with hdbscan's validity index, using
# the same Hamming metric as the clustering; per_cluster_scores=True also
# returns one score per cluster alongside the overall value
validity_index(
    zoo_df.to_numpy(),
    dbscan.labels_,
    metric='hamming',
    per_cluster_scores=True,
)

In [None]:
# Re-draw the embedding coloured by the true class, for visual comparison with
# the DBSCAN-coloured plot
uplot.points(umap_obj, labels=animal_class)

In [None]:
# Agreement between the true classes and the most recent DBSCAN labelling.
# The noise label -1 is treated like any other cluster id by this score.
adjusted_rand_score(labels_true=animal_class, labels_pred=dbscan.labels_)