In [29]:
import pandas as pd
import numpy as np
import os
import itertools

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.cluster import AgglomerativeClustering

In [30]:
def preprocess_adult(train_path='datasets/adult/adult.data',
                     test_path='datasets/adult/adult.test'):
    adult_names = ["age", "workclass", "fnlwgt", "education", "education-num",
                    "marital-status", "occupation", "relationship", "race","sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"]
    adult_data = pd.read_csv(train_path, header=None, 
                            names=adult_names, na_values=' ?')
    adult_data = adult_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x) # strip whitespace
    adult_test = pd.read_csv(test_path, header=None, 
                             names=adult_names, na_values=' ?')
    adult_test = adult_test.apply(lambda x: x.str.strip() if x.dtype == "object" else x) # strip whitespace
    dfs = [adult_data, adult_test]
    adult_df = pd.concat(dfs)
    adult_df = adult_df.dropna()
    adult_df = adult_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x) # strip whitespace

    # last column in adult has some textual discrepancy
    adult_df = adult_df.replace(">50K.", ">50K")
    adult_df = adult_df.replace("<=50K.", "<=50K")

    # Split into X and y
    X, y = adult_df.drop("income", axis=1), adult_df["income"]

    # Select categorical and numerical features
    cat_idx = X.select_dtypes(include=["object", "bool"]).columns
    num_idx = X.select_dtypes(include=['int64', 'float64']).columns
    steps = [('cat', OneHotEncoder(handle_unknown='ignore'), cat_idx), ('num', StandardScaler(), num_idx)]
    col_transf = ColumnTransformer(steps)

    # label encoder to target variable so we have classes 0 and 1
    assert(len(np.unique(y)) == 2)
    X = col_transf.fit_transform(X)
    y = LabelEncoder().fit_transform(y)
    return X, y

In [34]:
X, y = preprocess_adult()
clustering = AgglomerativeClustering(n_clusters=30, compute_distances=True)
clustering.fit(X.toarray())

In [45]:
clustering.children_[0][1]

43493

In [46]:
clustering.children_

array([[13601, 43493],
       [28123, 40169],
       [18133, 41043],
       ...,
       [90433, 90439],
       [90300, 90440],
       [90418, 90441]])

In [48]:
from scipy.cluster.hierarchy import dendrogram
def plot_dendrogram(model, **kwargs):
    # Create linkage matrix and then plot the dendrogram

    # create the counts of samples under each node
    counts = np.zeros(model.children_.shape[0])
    n_samples = len(model.labels_)
    for i, merge in enumerate(model.children_):
        current_count = 0
        for child_idx in merge:
            if child_idx < n_samples:
                current_count += 1  # leaf node
            else:
                current_count += counts[child_idx - n_samples]
        counts[i] = current_count

    linkage_matrix = np.column_stack(
        [model.children_, model.distances_, counts]
    ).astype(float)

    # Plot the corresponding dendrogram
    dendrogram(linkage_matrix, **kwargs)

In [50]:
plot_dendrogram(clustering)

AttributeError: 'AgglomerativeClustering' object has no attribute 'distances_'

In [None]:
clustering