## Car dataset clustering

In [None]:
#  kaggle : https://www.kaggle.com/datasets/uciml/autompg-dataset?datasetId=1489
#  original link : https://archive.ics.uci.edu/dataset/9/auto+mpg

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt

from scipy.cluster.hierarchy import dendrogram, linkage, fcluster, inconsistent

from sklearn.preprocessing import StandardScaler

import umap
import umap.plot as uplot

from ch_9_funcs import plot_cluster_dendrogram, plot_silouethes_agglomer, cophenetic_corr

sns.set_theme()

### Load and preprocess the data

The car MPG (Miles Per Gallon) dataset, often known as the Auto MPG dataset, is a popular collection of data on cars from the 1970s and early 1980s. It provides detailed insight into various attributes of the cars that were on the market during that period.

The dataset contains the following columns:

1. `mpg`: Stands for Miles Per Gallon. This measures the distance in miles that a car can travel per gallon of fuel.

2. `cylinders`: Indicates the number of cylinders in the car's engine. This can be related to the power output of the engine.

3. `displacement`: A measure of the total volume of all the cylinders in an engine, typically measured in cubic inches or cubic centimeters.

4. `horsepower`: The power output of the car's engine, typically measured in horsepower.

5. `weight`: The total weight of the car, typically measured in pounds.

6. `acceleration`: A measure of how quickly the car can increase its speed, typically represented in seconds to go from 0 to 60 miles per hour.

7. `model year`: The year when the car model was released, typically represented as a two-digit number from 70 to 82 (for 1970 to 1982).

8. `origin`: A categorical variable representing the region where the car was manufactured. This is usually represented as a number: 1 for America, 2 for Europe, and 3 for Asia.

9. `car name`: The full name of the car model, typically in the format of "Manufacturer Model" (e.g., "ford torino").

Citation :  Quinlan,R.. (1993). Auto MPG. UCI Machine Learning Repository. https://doi.org/10.24432/C5859H.



In [None]:
# Read the Auto MPG table from its CSV file and show its (rows, columns) shape
car_df_path = 'data/hierarchical/car_mpg/auto-mpg.csv'
car_df = pd.read_csv(filepath_or_buffer=car_df_path)
car_df.shape

In [None]:
# Show the first five rows of the raw table
car_df.head(n=5)

In [None]:
# Count NaN entries per column (textual placeholders are not counted as NaN)
car_df.isna().sum()

In [None]:
# Preview column datatypes
# (summary of each column's dtype and non-null count)
car_df.info()

In [None]:
# List the distinct raw values found in the horsepower column
car_df.loc[:, 'horsepower'].unique()

In [None]:
# Keep only the rows whose horsepower is not the '?' placeholder, then
# renumber the rows from zero
has_horsepower = car_df['horsepower'] != '?'
car_df = car_df.loc[has_horsepower].reset_index(drop=True)
car_df.shape

In [None]:
# How many distinct car names are there (missing names are not counted)?
car_df['car name'].nunique(dropna=True)

In [None]:
# Give every car a unique name by appending suffixes:
#   <car name>[_<n>]_<model year>
# where <n> is the 0-based occurrence number of that name and is omitted for
# the first occurrence.
#
# `cumcount()` returns a Series that shares `car_df`'s index, and `map` keeps
# that index, so the string concatenation below stays row-aligned whatever the
# current index of `car_df` looks like.
diversifier = car_df.groupby('car name').cumcount().map(
    lambda occurrence: f'_{occurrence}' if occurrence else ''
)
car_df['car name'] = (
    car_df['car name'] + diversifier + '_' + car_df['model year'].astype(str)
)

# Set car name as index
car_df = car_df.set_index('car name', drop=True)

In [None]:
# Convert every remaining column to 32-bit floats
car_df = car_df.astype('float32')

In [None]:
# Plot variable value distribution
# One histogram per column; the result is bound to `_` so the notebook does
# not echo the returned object as cell output.
_ = car_df.hist(figsize=(12,12))

In [None]:
# Extract the origin codes as a separate Series and translate them into
# region names (the column itself is still present in car_df after this cell)
origin_mapping = {1: 'USA', 2: 'Europe', 3: 'Asia'}
origin = car_df['origin'].map(origin_mapping)

In [None]:
# Keep an unscaled copy of the data for later use, with the origin codes
# replaced by their region names
car_df_copy = car_df.copy()
car_df_copy['origin'] = origin

# The working frame keeps every column except origin
car_df = car_df.drop('origin', axis=1)


In [None]:
# Scale the data
# Fit a StandardScaler on all remaining columns and write the transformed
# values back through `car_df[:]`, so the existing frame (with its car-name
# index and column labels) is reused instead of being replaced by a bare array.
scaler = StandardScaler()
car_df[:] = scaler.fit_transform(car_df)

## Agglomerative clustering

In [None]:
# Build the hierarchical clustering (Ward linkage) from the scaled features
scaled_features = car_df.to_numpy()
linkage_matrix = linkage(scaled_features, 'ward')

In [None]:
# Evaluate the linkage against the scaled data with the project helper
# (presumably the cophenetic correlation coefficient - see ch_9_funcs)
cophenetic_corr(data_mtx=car_df.to_numpy(), linkage_matrix=linkage_matrix)

In [None]:
# Draw the full dendrogram with one leaf per car, labelled by car name.
# NOTE: the figure size is set through rcParams, so it stays in effect for
# later figures that do not pass their own size.
plt.rcParams['figure.figsize'] = [12, 50]
name_dendrogram_style = dict(orientation='left', leaf_font_size=7)
_ = dendrogram(linkage_matrix, labels=car_df.index, **name_dendrogram_style)

In [None]:
# Draw the same dendrogram, this time labelling each leaf with the region of
# origin of the car.
# NOTE: the figure size is set through rcParams, so it stays in effect for
# later figures that do not pass their own size.
plt.rcParams['figure.figsize'] = [12, 50]
origin_dendrogram_style = dict(orientation='left', leaf_font_size=7)
_ = dendrogram(linkage_matrix, labels=origin.values, **origin_dendrogram_style)

### Clustering using the height method

In [None]:
# Perform UMAP dimensionality reduction on the scaled features and keep the
# resulting embedding for the cluster plots further down
umap_obj = umap.UMAP(n_neighbors=15)
embedding_mtx = umap_obj.fit_transform(X=car_df)

# Scatter plot of the fitted embedding
uplot.points(umap_obj)

In [None]:
# Show the connectivity plot of the fitted UMAP model with the points overlaid
uplot.connectivity(umap_obj, show_points=True)

In [None]:
# PCA-based diagnostic plot of the embedding, drawn on a square 8x8 axes
fig, ax = plt.subplots(figsize=(8, 8))
uplot.diagnostic(umap_obj, diagnostic_type='pca', ax=ax)

In [None]:
# Cut the dendrogram at a fixed height: the 'distance' criterion of fcluster
# uses t as the distance threshold for forming flat clusters
cut_height = 18
clusters = fcluster(linkage_matrix, cut_height, criterion='distance')

# Draw the dendrogram for these flat clusters with the project helper and keep
# its return value for the silhouette plot
dendr_colors = plot_cluster_dendrogram(
    clusters=clusters,
    dataset_df=car_df,
    linkage_matrix=linkage_matrix,
    leaf_font_size=7,
)

In [None]:
# Silhouette / embedding plots for the height-based clusters (project helper)
silhouette_inputs = dict(
    data_df=car_df,
    clusters=clusters,
    dendr_colors=dendr_colors,
    embedding_mtx=embedding_mtx,
)
_ = plot_silouethes_agglomer(**silhouette_inputs)

In [None]:
# Attach the cluster assignment of every car to the unscaled copy of the data
car_df_copy['cluster'] = clusters
numeric_col_names = car_df_copy.select_dtypes(include=[np.number]).columns.tolist()

# Plot every feature against the cluster labels: box plots for numeric
# features, stacked proportion bars for the non-numeric ones.
for feature in car_df_copy.columns:

    # The cluster label is the grouping variable, not a feature to plot
    if feature == 'cluster':
        continue

    if feature in numeric_col_names:
        plt.figure(figsize=(14, 5))
        # `data` is passed by keyword: seaborn releases before 0.12 do not
        # accept it as a positional argument
        sns.boxplot(data=car_df_copy, x='cluster', y=feature)
        plt.title(feature)
        plt.show()
        continue

    # Share of each category inside every cluster
    proportions = (
        car_df_copy.groupby('cluster')[feature]
        .value_counts(normalize=True)
        .unstack()
    )

    # Create stacked bar plot
    proportions.plot(kind='bar', stacked=True, figsize=(14, 6))
    plt.title(f'Cluster distribution for {feature}')
    plt.ylabel('Proportion')

    # Place the legend outside the axes so it does not cover the bars
    plt.legend(title=feature, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

### Clustering using the inconsistency method

In [None]:
# Compute the inconsistency statistics of the linkage, looking `depth` levels
# below each merge
depth = 4
incosistent_mtx = inconsistent(linkage_matrix, d=depth)

# The last column holds the inconsistency coefficient of each merge; draw it
# as a line with a marker on every point
coefficients = incosistent_mtx[:, -1]
merge_positions = range(incosistent_mtx.shape[0])

plt.figure(figsize=(10, 7))
plt.plot(coefficients)
plt.scatter(merge_positions, coefficients)
plt.title('Inconsistency values at depth = {}'.format(depth))
plt.xlabel('Index')
plt.ylabel('Inconsistency coefficient')
plt.show()

In [None]:
# Form flat clusters with the 'inconsistent' criterion, reusing the
# precomputed inconsistency matrix, and display the resulting labels
inconsistency_threshold = 2.7
clusters = fcluster(
    linkage_matrix,
    inconsistency_threshold,
    criterion='inconsistent',
    R=incosistent_mtx,
)
clusters

In [None]:
# Label every dendrogram leaf as "clust <cluster id> - <car name>".
# The car name is written exactly once per label.
label_list = [
    f'clust {clust} - {name}'
    for name, clust in zip(car_df.index, clusters)
]

# Draw the dendrogram for the inconsistency-based clusters with the project
# helper and keep its return value for the silhouette plot
dendr_colors = plot_cluster_dendrogram(
    linkage_matrix=linkage_matrix,
    dataset_df=car_df,
    clusters=clusters,
    leaf_font_size=8,
    labels=label_list
)

In [None]:
# Silhouette / embedding plots for the inconsistency-based clusters
# (project helper)
silhouette_inputs = dict(
    data_df=car_df,
    clusters=clusters,
    dendr_colors=dendr_colors,
    embedding_mtx=embedding_mtx,
)
_ = plot_silouethes_agglomer(**silhouette_inputs)

### Cluster characterization

In [None]:
# Attach the current cluster assignment to the unscaled copy of the data
car_df_copy['cluster'] = clusters
# Store cylinders as object dtype so it is excluded from the numeric columns
# below and is plotted as a categorical feature
car_df_copy['cylinders'] = car_df_copy['cylinders'].astype(object)
numeric_col_names = car_df_copy.select_dtypes(include=[np.number]).columns.tolist()

# Plot every feature against the cluster labels: box plots for numeric
# features, stacked proportion bars for the non-numeric ones.
for feature in car_df_copy.columns:

    # The cluster label is the grouping variable, not a feature to plot
    if feature == 'cluster':
        continue

    if feature in numeric_col_names:
        plt.figure(figsize=(14, 5))
        # `data` is passed by keyword: seaborn releases before 0.12 do not
        # accept it as a positional argument
        sns.boxplot(data=car_df_copy, x='cluster', y=feature)
        plt.title(feature)
        plt.show()
        continue

    # Share of each category inside every cluster
    proportions = (
        car_df_copy.groupby('cluster')[feature]
        .value_counts(normalize=True)
        .unstack()
    )

    # Create stacked bar plot
    proportions.plot(kind='bar', stacked=True, figsize=(14, 6))
    plt.title(f'Cluster distribution for {feature}')
    plt.ylabel('Proportion')

    # Place the legend outside the axes so it does not cover the bars
    plt.legend(title=feature, bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.show()

In [None]:
# Show the cars assigned to cluster 23
in_cluster_23 = car_df_copy['cluster'] == 23
car_df_copy.loc[in_cluster_23]

In [None]:
# Show the cars assigned to cluster 2
in_cluster_2 = car_df_copy['cluster'] == 2
car_df_copy.loc[in_cluster_2]