In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency

# --- Load raw data ---
# Assuming you have a dataset with columns: ID, Age, Income, Spending
df = pd.read_csv("sample_1d_2d_clusters.csv")
print("Raw dataset:")
print(df.head())

# --- Clustering ---
# Cluster age into 3 groups
kmeans_age = KMeans(n_clusters=3, random_state=42)
df['Cluster_age'] = kmeans_age.fit_predict(df[['Age']])

# Cluster income + spending into 3 groups
kmeans_income = KMeans(n_clusters=3, random_state=42)
df['Cluster_income'] = kmeans_income.fit_predict(df[['Income', 'Spending']])

print("\nClustered dataset:")
print(df.head())

# --- Crosstab ---
crosstab = pd.crosstab(df['Cluster_age'], df['Cluster_income'])
print("\nCrosstab between clusters:")
print(crosstab)

# Heatmap
sns.heatmap(crosstab, annot=True, fmt="d", cmap="Blues")
plt.title("Age vs Income+Spending Cluster Crosstab")
plt.show()

# --- Correlation Metrics ---
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2, _, _, _ = chi2_contingency(confusion_matrix)
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

cramers_v_score = cramers_v(df['Cluster_age'], df['Cluster_income'])
nmi = normalized_mutual_info_score(df['Cluster_age'], df['Cluster_income'])
ari = adjusted_rand_score(df['Cluster_age'], df['Cluster_income'])

print("\nCluster Correlation Metrics:")
print(f"Cramér's V: {cramers_v_score:.3f}")
print(f"Normalized Mutual Information (NMI): {nmi:.3f}")
print(f"Adjusted Rand Index (ARI): {ari:.3f}")


Raw dataset:
   ID  Age  Income  SpendingScore
0   1   22   15000             25
1   2   25   16000             28
2   3   24   15500             26
3   4   45   60000             70
4   5   46   62000             68


KeyError: "['Spending'] not in index"