# Multiple Correspondence Analysis (MCA) – Restaurant Consumer Profiles

This notebook reproduces the full pipeline for:

1. Loading the **restaurant consumer profile dataset**
2. Cleaning and engineering features (BMI categories, birth-decade age groups)
3. Converting categorical variables to a **binary indicator matrix**
4. Running **Multiple Correspondence Analysis (MCA)**
5. Visualising:
   - Scree plot (explained inertia)
   - MCA biplot (Dim 1 vs Dim 2)
   - Clusters in MCA space
   - Heatmap of indicators by personality type

Data files are stored in:

- `data/raw/` – original CSV
- `data/processed/` – cleaned data & MCA matrix
- `data/processed/mca/` – MCA outputs

In [None]:
from pathlib import Path
import json

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import prince  # MCA implementation

# Default size and resolution applied to figures that do not set their own.
plt.rcParams.update({"figure.figsize": (6, 4), "figure.dpi": 120})

# Input / output locations (relative to repo root)
PROCESSED_DIR = Path("data/processed")
MCA_DIR = PROCESSED_DIR / "mca"
RAW_CSV_PATH = Path("data/raw/restaurant-data-and-consumer-ratings/userprofile.csv")

# Analysis settings
N_COMPONENTS = 5              # upper bound on the number of MCA dimensions
N_CLUSTERS = 3                # number of KMeans clusters fitted in MCA space
ROLE_COLUMN = "personality"   # grouping variable for heatmap

# Create the output folders up front; existing folders are left untouched.
for out_dir in (PROCESSED_DIR, MCA_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the raw consumer-profile CSV into `df_raw`; later cells work on copies of it.
print("Loading raw data from:", RAW_CSV_PATH)
df_raw = pd.read_csv(filepath_or_buffer=RAW_CSV_PATH)

# Cell output: the (rows, columns) tuple together with a preview of the first rows.
(df_raw.shape, df_raw.head())

In [None]:
# Build the cleaned, fully categorical profile table `df` from `df_raw`.
# The cell starts from a fresh copy, so it can be re-run without side effects on `df_raw`.
df = df_raw.copy()

# Drop irrelevant columns if present
df = df.drop(columns=["latitude", "longitude", "the_geom_meter", "color"], errors="ignore")
print("After dropping lat/long/geom/color:", df.shape)

# Clean text columns: strip & lowercase.
# Missing entries are kept as NaN: a blanket astype(str) would turn them into the
# literal string "nan", which the dropna() further down could never remove.
# NOTE(review): if the raw file marks missing values with a sentinel such as "?",
# those still pass through as ordinary categories - confirm against the data.
for col in df.select_dtypes(include="object").columns:
    normalised = df[col].astype(str).str.strip().str.lower()
    df[col] = normalised.where(df[col].notna())

# Ensure birth_year is numeric (unparseable values become NaN)
if "birth_year" in df.columns:
    df["birth_year"] = pd.to_numeric(df["birth_year"], errors="coerce")

# Filter unrealistic adult heights.
# .copy() makes the filtered frame independent, so the column assignments below
# do not raise SettingWithCopyWarning.
df = df[df["height"] >= 1.6].copy()
print("After filtering height >= 1.6m:", df.shape)

# BMI clinical categories (weight / height squared, binned; only the category is kept)
bmi = df["weight"] / (df["height"] ** 2)
df["bmi_group"] = pd.cut(
    bmi,
    bins=[0, 18.5, 25, 30, 35, 40, np.inf],
    labels=["underweight", "normal", "overweight", "obesity_i", "obesity_ii", "obesity_iii"],
    include_lowest=True,
)

# Birth-decade groups. When birth_year is absent the column is simply not created:
# an all-NaN age_group would make the dropna() below discard every row.
if "birth_year" in df.columns:
    age_bins = np.arange(1930, 2031, 10)  # 1930–2030 by decade
    age_labels = [f"{y}s" for y in range(1930, 2030, 10)]
    df["age_group"] = pd.cut(df["birth_year"], bins=age_bins, labels=age_labels, right=False)

# Drop now-unused numeric fields and the user identifier (if present)
df = df.drop(columns=["birth_year", "height", "weight", "bmi", "userID"], errors="ignore")

# Drop rows with missing values
df = df.dropna().copy()
print("After dropna:", df.shape)

# pd.cut keeps every bin as a category even when no row falls into it; such
# unobserved categories would later become all-zero indicator columns.
for col in df.select_dtypes(include="category").columns:
    df[col] = df[col].cat.remove_unused_categories()

df.head()

In [None]:
# Persist the cleaned profile table (row index omitted) under data/processed/.
cleaned_path = PROCESSED_DIR.joinpath("userprofiles_cleaned.csv")
df.to_csv(path_or_buf=cleaned_path, index=False)
print("Saved cleaned dataset to:", cleaned_path)

In [None]:
# Build the 0/1 indicator matrix `X` (one column per observed category).
# "cluster" is excluded because the clustering cell further down adds it to `df`;
# re-running this cell afterwards must not leak it into the MCA input.
categorical_cols = [col for col in df.columns if col != "cluster"]

# Passing `columns=` forces every listed column to be one-hot encoded; by default
# get_dummies would pass non-object/category columns through unchanged.
X = pd.get_dummies(df[categorical_cols], columns=categorical_cols, drop_first=False)
X = X.astype(int)  # ensure numeric 0/1

# Remove indicators that are never active (e.g. unobserved bins of a categorical
# column): an all-zero column carries no information for the analysis.
X = X.loc[:, X.sum(axis=0) > 0]

print("MCA matrix shape:", X.shape)
X.head()

In [None]:
# Write the indicator matrix to disk (row index omitted).
mca_matrix_path = PROCESSED_DIR.joinpath("userprofiles_mca_matrix.csv")
X.to_csv(path_or_buf=mca_matrix_path, index=False)
print("Saved MCA matrix to:", mca_matrix_path)

# Independent copy of the matrix plus its column names, used by the MCA,
# clustering and heatmap cells.
X_binary = X.copy()
binary_cols = X_binary.columns.tolist()

In [None]:
# Fit the MCA on the indicator matrix and collect coordinates and inertia figures.
n_components = min(N_COMPONENTS, len(binary_cols))
print(f"Running MCA with {n_components} dimensions...")

# NOTE(review): recent prince releases appear to one-hot encode their input
# internally (a `one_hot` option); if that applies to the installed version,
# feeding this pre-encoded 0/1 matrix would encode it a second time - confirm
# against the prince documentation for the version in use.
mca = prince.MCA(
    n_components=n_components,
    n_iter=5,
    copy=True,
    check_input=True,
    random_state=42,
).fit(X_binary)

row_coords = mca.row_coordinates(X_binary)
col_coords = mca.column_coordinates(X_binary)
# Coerce to an array: depending on the prince version this may be a plain list.
eigenvalues = np.asarray(mca.eigenvalues_, dtype=float)

# Explained inertia (handle prince version differences).
# Each option is a share of the TOTAL inertia; dividing by the sum of the retained
# eigenvalues is only a last resort because it always adds up to 1 and therefore
# overstates how much the kept dimensions explain.
if hasattr(mca, "explained_inertia_"):
    explained_attr = mca.explained_inertia_
    explained = explained_attr() if callable(explained_attr) else explained_attr
elif hasattr(mca, "percentage_of_variance_"):
    # presumably expressed in percent - verify for the installed prince version
    explained = np.asarray(mca.percentage_of_variance_, dtype=float) / 100.0
else:
    total = getattr(mca, "total_inertia_", None)
    if total is None:
        total = eigenvalues.sum()
    explained = eigenvalues / total if total > 0 else np.zeros_like(eigenvalues)
explained = np.asarray(explained, dtype=float)

eigs_df = pd.DataFrame({
    "dimension": np.arange(1, len(eigenvalues) + 1),
    "eigenvalue": eigenvalues,
    "explained_inertia": explained
})

eigs_df

In [None]:
# Persist the MCA results: coordinates, eigenvalue table and the category lookup.
# Label respondent coordinates with the row labels of the cleaned table.
row_coords.index = df.index

row_coords.to_csv(MCA_DIR / "row_coordinates.csv")
col_coords.to_csv(MCA_DIR / "column_coordinates.csv")
eigs_df.to_csv(MCA_DIR / "eigenvalues_explained_inertia.csv", index=False)

# Category lookup per profile column.
# - "cluster" is skipped: the clustering cell adds it to `df`, and on a re-run its
#   numpy integer labels would make json.dump fail.
# - values are converted to str so that any non-string category is serialisable
#   and sortable alongside the others.
category_cols = [col for col in df.columns if col != "cluster"]
categories = {col: sorted(str(value) for value in df[col].unique()) for col in category_cols}
with open(MCA_DIR / "userprofiles_categories.json", "w", encoding="utf-8") as f:
    json.dump(categories, f, indent=4)

print("Saved MCA outputs to:", MCA_DIR)

In [None]:
# Scree plot: explained inertia of each MCA dimension, in dimension order.
scree_fig, scree_ax = plt.subplots(figsize=(6, 4))
scree_ax.plot(eigs_df["dimension"], eigs_df["explained_inertia"], marker="o")
scree_ax.set_title("MCA – Scree Plot")
scree_ax.set_xlabel("Dimension")
scree_ax.set_ylabel("Explained inertia")
scree_ax.grid(True, linestyle="--", alpha=0.6)
scree_fig.tight_layout()
plt.show()

In [None]:
# Save the scree plot to disk.
# The figure is redrawn here on purpose: by the time this cell runs, the figure
# from the plotting cell has already been shown and released, so a bare
# plt.savefig() would write a new, empty canvas.
scree_path = MCA_DIR / "mca_scree_plot.png"

scree_save_fig, scree_save_ax = plt.subplots(figsize=(6, 4))
scree_save_ax.plot(eigs_df["dimension"], eigs_df["explained_inertia"], marker="o")
scree_save_ax.set_title("MCA – Scree Plot")
scree_save_ax.set_xlabel("Dimension")
scree_save_ax.set_ylabel("Explained inertia")
scree_save_ax.grid(True, linestyle="--", alpha=0.6)
scree_save_fig.tight_layout()
scree_save_fig.savefig(scree_path, dpi=300)
plt.close(scree_save_fig)  # already displayed by the plotting cell; avoid a duplicate

print("Saved scree plot to:", scree_path)

In [None]:
# Biplot of the first two MCA dimensions: respondents as dots, categories as
# labelled crosses. Skipped when fewer than two dimensions were fitted.
if n_components < 2:
    print("Not enough dimensions for 2D biplot.")
else:
    fig, ax = plt.subplots(figsize=(8, 8))

    # Respondents
    resp_x, resp_y = row_coords.iloc[:, 0], row_coords.iloc[:, 1]
    ax.scatter(resp_x, resp_y, s=10, alpha=0.3, label="Respondents")

    # Categories
    cat_x, cat_y = col_coords.iloc[:, 0], col_coords.iloc[:, 1]
    ax.scatter(cat_x, cat_y, marker="x", s=60, label="Variables")

    # Write each category name centred on its marker.
    for label, x_pos, y_pos in zip(col_coords.index, cat_x, cat_y):
        ax.text(x_pos, y_pos, label, fontsize=7, ha="center", va="center")

    # Thin reference lines through the origin.
    ax.axhline(0, linewidth=0.5, color="black")
    ax.axvline(0, linewidth=0.5, color="black")

    ax.set_xlabel("Dimension 1")
    ax.set_ylabel("Dimension 2")
    ax.set_title("MCA Biplot – Dim 1 vs Dim 2")
    ax.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Save the biplot drawn by the previous cell.
# That cell only creates `fig` when at least two MCA dimensions exist, so the
# same condition is checked here; otherwise `fig` would be undefined (or refer
# to some other figure).
biplot_path = MCA_DIR / "mca_biplot_dim1_dim2.png"
if n_components >= 2:
    fig.savefig(biplot_path, dpi=300)
    print("Saved biplot to:", biplot_path)
else:
    print("Not enough dimensions for 2D biplot; nothing saved.")

In [None]:
# KMeans on the respondents' MCA coordinates, followed by a Dim 1 vs Dim 2
# scatter coloured by cluster label. Skipped when fewer than two dimensions exist.
if n_components < 2:
    print("Not enough dimensions for clustering plot.")
else:
    # All fitted dimensions are used for clustering, not only the two plotted ones.
    X_mca = row_coords.iloc[:, :n_components].to_numpy()
    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
    df["cluster"] = kmeans.fit_predict(X_mca)

    # Save cluster assignments (the row index is written alongside the label)
    cluster_csv_path = MCA_DIR / "cluster_assignments.csv"
    df[["cluster"]].to_csv(cluster_csv_path)
    print("Saved cluster assignments to:", cluster_csv_path)

    # Plot clusters on Dim 1 vs Dim 2
    fig, ax = plt.subplots(figsize=(8, 8))
    dim1, dim2 = row_coords.iloc[:, 0], row_coords.iloc[:, 1]
    scatter = ax.scatter(dim1, dim2, c=df["cluster"], cmap="tab10", s=20, alpha=0.7)

    # Thin reference lines through the origin.
    ax.axhline(0, linewidth=0.5, color="black")
    ax.axvline(0, linewidth=0.5, color="black")

    ax.set_xlabel("Dimension 1")
    ax.set_ylabel("Dimension 2")
    ax.set_title("MCA – Respondents clustered in MCA space")

    cbar = plt.colorbar(scatter)
    cbar.set_label("Cluster")
    plt.tight_layout()
    plt.show()

In [None]:
# Save the cluster scatter drawn by the previous cell.
# That cell only creates its figure when at least two MCA dimensions exist;
# without this check `fig` would be undefined or point at an unrelated figure.
cluster_plot_path = MCA_DIR / "mca_clusters_dim1_dim2.png"
if n_components >= 2:
    fig.savefig(cluster_plot_path, dpi=300)
    print("Saved cluster plot to:", cluster_plot_path)
else:
    print("Not enough dimensions for clustering plot; nothing saved.")

In [None]:
# Heatmap of the mean of every 0/1 indicator within each ROLE_COLUMN group,
# i.e. the share of respondents in the group for whom the indicator is 1.
if ROLE_COLUMN not in df.columns:
    print(f"ROLE_COLUMN '{ROLE_COLUMN}' not found; skipping heatmap.")
else:
    # Pair the grouping column with the indicators by position (both indexes reset).
    role_labels = df[[ROLE_COLUMN]].reset_index(drop=True)
    indicators = X_binary.reset_index(drop=True)
    df_role = pd.concat([role_labels, indicators], axis=1)

    role_theme = df_role.groupby(ROLE_COLUMN)[binary_cols].mean()

    # Scale the canvas with the number of indicator columns and groups.
    n_groups, n_indicators = len(role_theme), len(binary_cols)
    heat_fig, heat_ax = plt.subplots(
        figsize=(max(8, n_indicators * 0.4), max(6, n_groups * 0.4))
    )

    heat_img = heat_ax.imshow(role_theme.values, aspect="auto")
    heat_fig.colorbar(heat_img, ax=heat_ax, label="Proportion = 1")
    heat_ax.set_xticks(np.arange(n_indicators))
    heat_ax.set_xticklabels(binary_cols, rotation=90)
    heat_ax.set_yticks(np.arange(len(role_theme.index)))
    heat_ax.set_yticklabels(role_theme.index)
    heat_ax.set_title(f"{ROLE_COLUMN} → Indicator Heatmap")
    heat_fig.tight_layout()
    plt.show()

In [None]:
# Save the ROLE_COLUMN → indicator heatmap to disk.
# The heatmap is redrawn here on purpose: the figure from the plotting cell has
# already been shown and released, so a bare plt.savefig() would write an empty
# canvas. Nothing is saved when the grouping column is missing.
heatmap_path = MCA_DIR / f"{ROLE_COLUMN}_indicator_heatmap.png"

if ROLE_COLUMN in df.columns:
    # Mean of each 0/1 indicator per group; rows of `df` and `X_binary` are
    # matched by position.
    heat_table = X_binary.groupby(df[ROLE_COLUMN].to_numpy())[binary_cols].mean()

    heat_save_fig, heat_save_ax = plt.subplots(
        figsize=(max(8, len(binary_cols) * 0.4), max(6, len(heat_table) * 0.4))
    )
    heat_save_img = heat_save_ax.imshow(heat_table.values, aspect="auto")
    heat_save_fig.colorbar(heat_save_img, ax=heat_save_ax, label="Proportion = 1")
    heat_save_ax.set_xticks(np.arange(len(binary_cols)))
    heat_save_ax.set_xticklabels(binary_cols, rotation=90)
    heat_save_ax.set_yticks(np.arange(len(heat_table.index)))
    heat_save_ax.set_yticklabels(heat_table.index)
    heat_save_ax.set_title(f"{ROLE_COLUMN} → Indicator Heatmap")
    heat_save_fig.tight_layout()
    heat_save_fig.savefig(heatmap_path, dpi=300)
    plt.close(heat_save_fig)  # already displayed by the plotting cell; avoid a duplicate

    print("Saved heatmap to:", heatmap_path)
else:
    print(f"ROLE_COLUMN '{ROLE_COLUMN}' not found; no heatmap saved.")

## Interpretation Notes

### Dimension 1
- Describe which categories lie at the positive vs negative ends.
- Example: Extroverted, social drinkers, high budget vs introverted, abstemious, low budget.

### Dimension 2
- Describe another behavioural contrast.
- Example: Family-oriented vs romantic ambience preferences.

### Clusters
- Briefly describe what type of consumer each cluster seems to represent.

### Heatmap (Personality → Indicators)
- Which personalities favour social drinking?
- Which groups have higher BMI categories?
- How does budget vary by personality type?