## Agglomerative clustering - mixed data

In [None]:
import math
import random
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent
from scipy.spatial.distance import squareform

from sklearn.preprocessing import StandardScaler, OrdinalEncoder

from ch_9_funcs import plot_cluster_dendrogram, plot_silouethes_agglomer, cophenetic_corr
from ch_9_consts import CAR_85_COLUMN_DESCRIPTION

import umap 
import umap.plot as uplot

import gower

# Seed Python's built-in `random` module.
# NOTE(review): this does not seed NumPy's random generator or any
# library-specific `random_state` - confirm whether those need fixing too.
random.seed(1)
# Apply seaborn's default theme to subsequent plots
sns.set_theme()

### Load and preprocess

The 1985 Automobile Dataset is a collection of data capturing specifications and other details of automobiles from that year, including a range of characteristics of each car. The dataset contains the following columns:

- `symboling`: Insurance risk rating, ranges from -3 to 3.

- `normalized-losses`: Average loss payment per insured vehicle, continuous from 65 to 256.
- `make`: Car manufacturer, e.g., BMW, Audi.
- `fuel-type`: Type of fuel used, diesel or gas.
- `aspiration`: Type of aspiration, standard (std) or turbo.
- `num-of-doors`: Number of doors, either four or two.
- `body-style`: Car body style, e.g., sedan, hatchback.
- `drive-wheels`: Type of drive wheels, 4WD, FWD, RWD.
- `engine-location`: Location of the engine, front or rear.
- `wheel-base`: Distance between front and rear wheels, continuous from 86.6 to 120.9.
- `length`: Length of the car, continuous from 141.1 to 208.1.
- `width`: Width of the car, continuous from 60.3 to 72.3.
- `height`: Height of the car, continuous from 47.8 to 59.8.
- `curb-weight`: Weight of the car without occupants, continuous from 1488 to 4066.
- `engine-type`: Type of engine, e.g., DOHC, OHCV.
- `num-of-cylinders`: Number of cylinders, e.g., four, six.
- `engine-size`: Size of the engine, continuous from 61 to 326.
- `fuel-system`: Type of fuel system, e.g., 1bbl, mpfi.
- `bore`: Diameter of each cylinder, continuous from 2.54 to 3.94.
- `stroke`: Distance piston travels in cylinder, continuous from 2.07 to 4.17.
- `compression-ratio`: Compression ratio of the engine, continuous from 7 to 23.
- `horsepower`: Engine power, continuous from 48 to 288.
- `peak-rpm`: Maximum engine speed, continuous from 4150 to 6600.
- `city-mpg`: City mileage, continuous from 13 to 49.
- `highway-mpg`: Highway mileage, continuous from 16 to 54.
- `price`: Price of the car, continuous from 5118 to 45400.

Citation: Schlimmer, Jeffrey. (1987). Automobile. UCI Machine Learning Repository. https://doi.org/10.24432/C5B01C.

In [None]:
# Load the dataset from CSV into `car_df` and preview its first rows
data_path = 'data/hierarchical/auto_clean/auto_86_dataset.csv'
car_df = pd.read_csv(data_path)
car_df.head()

In [None]:
# Dataset dimensions as (rows, columns)
car_df.shape

In [None]:
# Treat the '?' placeholder as a missing value
car_df = car_df.replace('?', np.nan)

In [None]:
# Check data for missing values: number of NaN entries per column
car_df.isna().sum(axis=0)

In [None]:
# Exclude 'normalized-losses' from the analysis
# (presumably because of its missing values - confirm against the counts above)
car_df = car_df.drop(columns=['normalized-losses'])

In [None]:
# Print column dtypes and non-null counts
car_df.info()

In [None]:
# Lift the display limit on columns so the whole frame is shown
pd.set_option('display.max_columns', None)
car_df

In [None]:
# Columns treated as categorical features
categ_cols = [
    'make',
    'fuel-type',
    'aspiration',
    'body-style',
    'drive-wheels',
    'engine-location',
    'engine-type',
    'fuel-system',
]

In [None]:
# Mapping from spelled-out counts to integers, used to convert the
# 'num-of-doors' and 'num-of-cylinders' columns.
# The mapping goes up to 'twelve': `Series.map` turns any word missing
# from this dict into NaN, and such rows would later be dropped silently.
num_mapper = {
    'one': 1,
    'two': 2,
    'three': 3,
    'four': 4,
    'five': 5,
    'six': 6,
    'seven': 7,
    'eight': 8,
    'nine': 9,
    'ten': 10,
    'eleven': 11,
    'twelve': 12,
}

# Convert the spelled-out counts in both columns via `num_mapper`
for count_col in ('num-of-doors', 'num-of-cylinders'):
    car_df[count_col] = car_df[count_col].map(num_mapper)

In [None]:
# Every column that is not categorical is treated as numerical.
# A comprehension (instead of a set difference) keeps the DataFrame's
# column order, so `num_cols` is identical on every run.
num_cols = [col for col in car_df.columns if col not in categ_cols]

# Cast the numerical columns to float
car_df[num_cols] = car_df[num_cols].astype(float)

In [None]:
# Re-check column dtypes and non-null counts after the conversions
car_df.info()

In [None]:
# Drop every row that contains a missing value and renumber the index
car_df = car_df.dropna().reset_index(drop=True)

In [None]:
# Visual inspection of numerical variables: one 20-bin histogram per column.
# The figure size is set globally through rcParams.
plt.rcParams['figure.figsize'] = [12, 14]
_ = car_df.hist(bins=20)

In [None]:
# Plot value counts of every categorical column on a 3-column grid
num_plots = len(categ_cols)

# Determine number of rows needed for the grid
num_rows = math.ceil(num_plots / 3.0)

# Create subplots
fig, axs = plt.subplots(num_rows, 3, figsize=(15, num_rows*5))

# Flatten the axes array so it can be walked as a simple sequence
axs = axs.flatten()

# One bar chart per categorical column
for ax, column in zip(axs, categ_cols):
    car_df[column].value_counts().plot(
        kind='bar',
        ax=ax,
        title=column,
        xlabel=''
    )

# Hide grid cells left over when the number of columns is not a
# multiple of 3, so no empty axes frame is drawn
for unused_ax in axs[num_plots:]:
    unused_ax.set_visible(False)

plt.tight_layout()

In [None]:
# Five equally spaced bin edges spanning the observed price range
bins = np.linspace(
    start=car_df['price'].min(),
    stop=car_df['price'].max(),
    num=5,
)
bins

In [None]:
# Hand-adjusted price bin edges
custom_bins = [
    5000,
    10000,
    15188,
    25259,
    35329,
    45500,
]

In [None]:
# Keep a copy of the data with a binned price column, so price bins can
# be used as labels on the dendrogram
car_sec_info = car_df.copy()
price_bin_names = ["P1", "P2", "P3", "P4", "P5"]
car_sec_info['price_bins'] = pd.cut(
    car_sec_info['price'],
    custom_bins,
    labels=price_bin_names,
)

# Remove the price from the frame used for clustering
car_df = car_df.drop(columns=['price'])

In [None]:
# Number of cars per price bin
car_sec_info['price_bins'].value_counts()

In [None]:
# Categorical columns are those still stored with object dtype
categ_cols = list(car_df.select_dtypes(include=['object']).columns)

# All remaining columns are numerical. The comprehension keeps the
# DataFrame's column order (a set difference would yield an arbitrary
# order that changes between runs).
num_cols = [col for col in car_df.columns if col not in categ_cols]

# Boolean mask over all columns: True where the column is categorical
categ_bool = [col in categ_cols for col in car_df.columns]

In [None]:
# Scale numerical columns: standardize them in place with StandardScaler
scaler = StandardScaler()
car_df[num_cols] = scaler.fit_transform(car_df[num_cols])

In [None]:
# Visual inspection of numerical variables (re-plotted at this point of
# the notebook, after the scaling step)
plt.rcParams['figure.figsize'] = [12, 14]
_ = car_df.hist(bins=20)

In [None]:
# Encode categorical variables: replace each category with a numeric code
ord_enc = OrdinalEncoder()
car_df[categ_cols] = ord_enc.fit_transform(car_df[categ_cols])

### Perform clustering based on Gower distance



In [None]:
# Calculate pairwise distances based on both numerical and categorical
# variables; `categ_bool` flags, per column, which features are categorical
gow_distances = gower.gower_matrix(car_df, cat_features=categ_bool)

In [None]:
# Perform UMAP dimensionality reduction on the precomputed distance matrix.
# A fixed `random_state` makes the embedding reproducible between runs;
# seeding Python's `random` module does not control UMAP's randomness.
umap_obj = umap.UMAP(n_neighbors=30, metric='precomputed', random_state=1)
embedding_mtx = umap_obj.fit_transform(gow_distances)

# Scatter plot of the embedded points
uplot.points(umap_obj)

In [None]:
# Connectivity plot of the fitted UMAP model, drawn with the points
uplot.connectivity(umap_obj, show_points=True)

In [None]:
# PCA-based diagnostic plot of the fitted UMAP model
uplot.diagnostic(umap_obj, diagnostic_type='pca')

In [None]:
# Convert the square distance matrix to its condensed form
gow_distances_condens = squareform(gow_distances)

# Average-linkage hierarchical clustering on the precomputed distances
linkage_matrix = linkage(gow_distances_condens, method='average')

In [None]:
# Call the project helper with the linkage and the precomputed distances
# (presumably reports the cophenetic correlation - see ch_9_funcs to confirm)
cophenetic_corr(
    linkage_matrix=linkage_matrix,
    data_mtx = gow_distances,
    distance_measure='precomputed'
)

In [None]:
# Horizontal dendrogram with leaves labelled by row index
plt.rcParams['figure.figsize'] = [12, 30]
_ = dendrogram(linkage_matrix, labels=car_df.index, orientation='left', leaf_font_size=7)

In [None]:
# Horizontal dendrogram with leaves labelled by price bin
plt.rcParams['figure.figsize'] = [12, 30]
price_bin_labels = car_sec_info['price_bins'].to_numpy()
_ = dendrogram(linkage_matrix, labels=price_bin_labels, orientation='left', leaf_font_size=7)

In [None]:
# Compute the inconsistency matrix of the linkage for the chosen depth
depth = 4
incosistent_mtx = inconsistent(linkage_matrix, depth)

# Last column of the matrix is plotted as the inconsistency score,
# one value per row, as a line with markers
inconsistency_scores = incosistent_mtx[:, -1]
row_positions = range(len(incosistent_mtx))

plt.figure(figsize=(10, 7))
plt.plot(inconsistency_scores)
plt.scatter(row_positions, inconsistency_scores)
plt.title(f'Inconsistency values at depth = {depth}')
plt.xlabel('Index')
plt.ylabel('Inconsistency coefficient')
plt.show()

In [None]:
# Perform clustering based on inconsistency: cut the tree with the
# 'inconsistent' criterion at threshold t=1.6, reusing the inconsistency
# matrix computed earlier, and print the resulting cluster labels
clusters = fcluster(Z=linkage_matrix, t=1.6, criterion='inconsistent', R=incosistent_mtx)
clusters

In [None]:
# Leaf labels combining the assigned cluster with the car's price bin
leaf_price_bins = car_sec_info['price_bins'].to_numpy()
labels = [
    f'clust {clust} - price bin {price}'
    for clust, price in zip(clusters, leaf_price_bins)
]

# Draw the dendrogram with the project helper and keep what it returns
# for later plots
dendr_colors = plot_cluster_dendrogram(
    linkage_matrix=linkage_matrix,
    dataset_df=car_df,
    clusters=clusters,
    labels=labels,
    leaf_font_size=7,
)

In [None]:
# Call the project helper with the precomputed distances, the cluster
# labels, the dendrogram colors and the UMAP embedding
# (presumably draws silhouette plots - see ch_9_funcs to confirm)
_ = plot_silouethes_agglomer(
    data_df=gow_distances,
    clusters=clusters,
    dendr_colors=dendr_colors,
    embedding_mtx=embedding_mtx,
    distance_measure='precomputed',
    figsize=(16,12)
)

In [None]:
# Attach the cluster label to every car and count cars per cluster,
# listed by descending cluster id
car_sec_info['cluster'] = clusters
clust_size = car_sec_info['cluster'].value_counts().sort_index(ascending=False)
clust_size

In [None]:
# Keep only the ids of clusters with more than two members
keep_clust = clust_size[clust_size>2].index.to_list()
keep_clust

In [None]:
# Restrict the data to the clusters listed in `keep_clust`
car_sec_info = car_sec_info[car_sec_info['cluster'].isin(keep_clust)]
numeric_col_names = car_sec_info.select_dtypes(include=[np.number]).columns.tolist()

# Hatch patterns that help tell bars apart when a feature has many categories
patterns = ["|" , "\\" , "/" , "+" , "-", ".", "*","x", "o", "O" ]

# Plot cluster feature values:
#   - numerical features -> one boxplot per cluster
#   - other features     -> stacked bar chart of category proportions per cluster
for feature in car_sec_info.columns:

    # The cluster label itself is the grouping key, not a feature
    if feature == 'cluster':
        continue

    # NOTE(review): raises KeyError if a column has no entry in
    # CAR_85_COLUMN_DESCRIPTION - confirm derived columns are covered
    print(CAR_85_COLUMN_DESCRIPTION[feature])

    if feature in numeric_col_names:
        plt.figure(figsize=(14,5))
        sns.boxplot(car_sec_info, x='cluster', y=feature)
        plt.title(feature)
        plt.show()
        continue

    # Share of each category within every cluster (rows: clusters,
    # columns: categories)
    proportions = car_sec_info.groupby(
        'cluster'
    )[feature].value_counts(
        normalize=True
    ).unstack()

    # With more than 15 categories use the 'hsv' colormap plus hatching,
    # otherwise the 'Set1' colormap alone
    many_categories = proportions.shape[1] > 15

    # Create stacked bar plot
    ax = proportions.plot(
        kind='bar',
        stacked=True,
        figsize=(14, 6),
        colormap='hsv' if many_categories else 'Set1'
    )
    plt.title(f'Cluster distribution for {feature}')
    plt.ylabel('Proportion')

    if many_categories:
        # Apply hatch patterns, one per category, cycling through the list
        for i, bar_container in enumerate(ax.containers):
            hatch = patterns[i % len(patterns)]
            for bar in bar_container:
                bar.set_hatch(hatch)

    # Add legend (after hatching, so legend entries show the hatches)
    plt.legend(title=feature, bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=13)
    plt.show()