### Louvain clustering on categorical data

In [None]:
import random

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
import seaborn as sns

from sklearn.metrics.cluster import adjusted_rand_score
from sklearn.neighbors import kneighbors_graph

import igraph as ig

from ch_11_funcs import (
    print_clustering_stats, 
    display_network_clusters_labels,
    plot_clust_stats
)

# Seed Python's global random number generator so repeated runs draw the same numbers
random.seed(2)

# Apply seaborn's default theme, then make 10x10 the default figure size
sns.set_theme()
plt.rcParams["figure.figsize"] = (10, 10)

### Load and preprocess the zoo dataset

The Zoo Dataset is a comprehensive collection of data about various animals found in zoos worldwide. The dataset is composed of several attributes related to these animals, such as:

- `animal_name`: The name of the animal.

- `hair`: Indicates if the animal has hair (1 for yes, 0 for no).
- `feathers`: Indicates if the animal has feathers (1 for yes, 0 for no).
- `eggs`: Indicates if the animal lays eggs (1 for yes, 0 for no).
- `milk`: Indicates if the animal produces milk (1 for yes, 0 for no).
- `airborne`: Indicates if the animal can fly (1 for yes, 0 for no).
- `aquatic`: Indicates if the animal lives in water (1 for yes, 0 for no).
- `predator`: Indicates if the animal is a predator (1 for yes, 0 for no).
- `toothed`: Indicates if the animal has teeth (1 for yes, 0 for no).
- `backbone`: Indicates if the animal has a backbone (1 for yes, 0 for no).
- `breathes`: Indicates if the animal breathes air (1 for yes, 0 for no).
- `venomous`: Indicates if the animal is venomous (1 for yes, 0 for no).
- `fins`: Indicates if the animal has fins (1 for yes, 0 for no).
- `legs`: Number of legs the animal has (integer value).
- `tail`: Indicates if the animal has a tail (1 for yes, 0 for no).
- `domestic`: Indicates if the animal is domesticated (1 for yes, 0 for no).
- `catsize`: Indicates if the animal is cat-sized or larger (1 for yes, 0 for no).
- `class_type`: Numerical code indicating the animal's taxonomic class.


Citation: Forsyth, Richard. (1990). Zoo. UCI Machine Learning Repository. https://doi.org/10.24432/C5R59V.

In [None]:
# Locations of the animal feature table and of the class-number -> class-name table
zoo_path = 'data/hierarchical/zoo/zoo.csv'
class_mappings = 'data/hierarchical/zoo/class.csv'

# Read both CSV files into DataFrames (features first, class mapping second)
zoo_df, class_mapper = (
    pd.read_csv(csv_path) for csv_path in (zoo_path, class_mappings)
)

In [None]:
# Preview the first rows of the animal feature table
zoo_df.head()

In [None]:
# Display the class lookup table loaded from class.csv
class_mapper

In [None]:
# (number of rows, number of columns) of the table as loaded
zoo_df.shape

In [None]:
# Count missing values in each column
zoo_df.isna().sum(axis=0)

In [None]:
# Check number of unique animals; a count below the number of rows means
# that at least one animal name appears more than once
zoo_df['animal_name'].nunique()

In [None]:
# There should be one row per species: keep only the first row for each animal name,
# then display the resulting shape
zoo_df = zoo_df.drop_duplicates(subset='animal_name', keep='first')
zoo_df.shape

In [None]:
# Build a lookup from numeric class code to class name
class_map_dict = class_mapper.set_index('Class_Number')['Class_Type'].to_dict()

# Translate the class codes straight into a separate Series (it keeps the index
# and the name of the original column). Mapping into `animal_class` instead of
# assigning the result back into `zoo_df` avoids writing to the frame returned
# by `drop_duplicates` (which can trigger pandas' SettingWithCopyWarning) and
# avoids rewriting a column that is dropped immediately afterwards.
animal_class = zoo_df['class_type'].map(class_map_dict)

# The class label must not be part of the features, so drop it from zoo_df
zoo_df = zoo_df.drop(columns=['class_type'])

In [None]:
# Use the animal name as the row index; set_index removes the column from the
# data by default, so only feature columns remain
zoo_df = zoo_df.set_index('animal_name')

In [None]:
# Plot a histogram for every column of zoo_df on a 7x7 figure; the result is
# bound to `_` because it is not needed afterwards
_ = zoo_df.hist(figsize=(7,7))
# Adjust subplot spacing so the individual histograms do not overlap
plt.tight_layout()

### Encode and cluster the data

In [None]:
# Number of nearest neighbours requested per animal when the KNN graph is built
N_NEIGHBORS = 10

In [None]:
# Create KNN graph: every animal is linked to its N_NEIGHBORS nearest animals and
# each stored entry holds the Hamming distance between the two feature vectors
distance_matrix = kneighbors_graph(zoo_df, n_neighbors=N_NEIGHBORS, mode='distance', metric='hamming')

# Transform distances into similarities (edge weights) BEFORE pruning zeros.
# A stored distance of 0 means two animals have identical features, i.e. the
# strongest possible link (similarity 1.0). Pruning zeros on the distances, as
# was done previously, silently removed exactly those edges.
distance_matrix.data = 1.0 - distance_matrix.data

# Eliminate zeros: after the conversion a stored zero is a zero-similarity
# entry (distance 1.0), which should not become an edge
distance_matrix.eliminate_zeros()

# NOTE: `adj_matrix` is the same object as `distance_matrix`, which from this
# point on holds similarities rather than distances
adj_matrix = distance_matrix

# NOTE(review): keeping the identical-animal edges changes the graph, so the
# clustering statistics and the resolution picked further down may need revisiting

# Create graph from adjacency matrix
g = ig.Graph.Weighted_Adjacency(adj_matrix, mode="undirected")

In [None]:
# Give every class name an integer id, numbered in order of first appearance.
# Iterating over `set(animal_class)` made the ids (and therefore the node and
# legend colours) depend on string hash randomisation, so they could differ
# from one interpreter session to the next; `unique()` yields a stable order.
map_dict = {
    species_name: species_id
    for species_id, species_name in enumerate(animal_class.unique())
}
map_dict

In [None]:
# One palette entry per class id
collor_palette = ig.ClusterColoringPalette(n=len(map_dict))

# Colour each vertex according to the class of the corresponding animal
node_colors = [collor_palette[map_dict[species]] for species in animal_class]
g.vs['color'] = node_colors

# Legend entries: one coloured line handle and one text label per class,
# both in the iteration order of map_dict
legend_colors = [
    Line2D([0], [0], color=collor_palette[class_id], lw=4)
    for class_id in map_dict.values()
]
legend_text = [str(class_name) for class_name in map_dict]

In [None]:
# Compute a Fruchterman-Reingold layout that takes the edge weights into account;
# the same layout object is reused by the later plots
lf_layout = g.layout_fruchterman_reingold(weights='weight')

# Draw the graph (vertices carry the colours assigned above) on a 10x10 figure
_, ax = plt.subplots(figsize=(10, 10))
ig.plot(g, target=ax, layout=lf_layout, vertex_size=0.2, edge_width=0.1)

# Legend linking each colour to its class name
ax.legend(handles=legend_colors, labels=legend_text)

plt.show()

In [None]:
# Multilevel (Louvain) community detection on the weighted graph at resolution 1
ig_clusters = g.community_multilevel(weights='weight', resolution=1)

# Report statistics about the clustering found
print_clustering_stats(ig_clusters=ig_clusters, min_cluster_size=0)

In [None]:
# Draw the clustering with the project helper, reusing the layout computed earlier
# so the picture is comparable with the class-coloured plot.
# NOTE(review): `color_edges` and `min_size` are interpreted inside ch_11_funcs;
# presumably min_size=0 means no cluster is filtered out - confirm against the helper.
display_network_clusters_labels(
    ig_clusters,
    vertex_size=0.25, 
    edge_width=0.05,
    layout=lf_layout,
    color_edges=True,
    min_size=0
)

In [None]:
# Adjusted Rand index between the cluster membership and the animal classes
ari_score = adjusted_rand_score(ig_clusters.membership, animal_class)
print('ARI : {}'.format(ari_score))

In [None]:
# Call the project helper with a resolution range of 0.1 to 1.5 in steps of 0.1,
# passing the graph, the raw feature matrix, the class labels and the Hamming metric.
# NOTE(review): what is plotted (and whether `end_res` is inclusive) is decided
# inside ch_11_funcs.plot_clust_stats - confirm against the helper.
plot_clust_stats(
    start_res=0.1,
    end_res=1.5,
    step=0.1,
    graph=g,
    original_data=zoo_df.to_numpy(),
    original_labels=animal_class,
    metric='hamming'
)

In [None]:
# Re-run multilevel (Louvain) community detection with the resolution lowered to 0.6
ig_clusters = g.community_multilevel(weights='weight', resolution=0.6)

# Show the new clustering on the same layout as before
display_network_clusters_labels(
    ig_clusters,
    layout=lf_layout,
    vertex_size=0.25,
    edge_width=0.05,
    color_edges=True,
    min_size=0,
)

In [None]:
# Redraw the graph with its class-based vertex colours on the shared layout,
# for comparison with the clustering shown above
_, ax = plt.subplots(figsize=(10, 10))
ig.plot(g, target=ax, layout=lf_layout, vertex_size=0.2, edge_width=0.1)

# Legend linking each colour to its class name
ax.legend(handles=legend_colors, labels=legend_text)

plt.show()