In [7]:
import pandas as pd
from sklearn.metrics import normalized_mutual_info_score, adjusted_rand_score
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the datasets
df_age = pd.read_csv("age_clusters.xlsx")
df_income = pd.read_csv("income_spending_clusters.xlsx")

# Merge them on ID
df_merged = pd.merge(df_age, df_income, on="ID", suffixes=("_age", "_income"))

print("Merged dataset:")
print(df_merged)

# Crosstab
crosstab = pd.crosstab(df_merged['Cluster_age'], df_merged['Cluster_income'])
print("\nCrosstab between clusters:")
print(crosstab)

# Heatmap
sns.heatmap(crosstab, annot=True, fmt="d", cmap="Blues")
plt.title("CSV (Age) vs CSV (Income+Spending) Cluster Crosstab")
plt.show()

# --- Correlation Metrics ---

# Cramér's V function
def cramers_v(x, y):
    confusion_matrix = pd.crosstab(x, y)
    chi2 = np.sum((confusion_matrix - confusion_matrix.mean())**2 / confusion_matrix.mean())
    n = confusion_matrix.sum().sum()
    phi2 = chi2 / n
    r, k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min((kcorr-1), (rcorr-1)))

# Calculate metrics
cramers_v_score = cramers_v(df_merged['Cluster_age'], df_merged['Cluster_income'])
nmi = normalized_mutual_info_score(df_merged['Cluster_age'], df_merged['Cluster_income'])
ari = adjusted_rand_score(df_merged['Cluster_age'], df_merged['Cluster_income'])

print("\nCluster Correlation Metrics:")
print(f"Cramér's V: {cramers_v_score:.3f}")
print(f"Normalized Mutual Information (NMI): {nmi:.3f}")
print(f"Adjusted Rand Index (ARI): {ari:.3f}")


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xad in position 41: invalid start byte