# Test Pipeline

In [None]:
# !pip install --quiet pandas numpy matplotlib seaborn torch torchvision facenet-pytorch insightface pytorch-metric-learning xgboost scikit-learn opencv-python onnxruntime mozuma

In [23]:
#!/usr/bin/env python3
"""
bmi_face_predictor.py

Pipeline:
 1. Detect & align faces via MTCNN
 2. Extract embeddings via ArcFace backbone (torch_arcface_insightface)
 3. Add projection head and fine‑tune embeddings with Triplet Loss
 4. Train an XGBoost regressor
 5. Predict BMI on new images
"""

import os
import pandas as pd
import cv2
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from facenet_pytorch import MTCNN
from pytorch_metric_learning import losses, miners
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from mozuma.models.arcface.pretrained import torch_arcface_insightface

# 0. Device setup (CUDA > MPS > CPU)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

# 1. Face Detection & Alignment
mtcnn = MTCNN(image_size=112, margin=0, keep_all=False, device=device)

# Helper: load & align
def load_and_align(img_path):
    if not os.path.isfile(img_path):
        raise FileNotFoundError(f"Image not found: {img_path}")
    img = cv2.imread(img_path)
    if img is None:
        raise IOError(f"Failed to read image: {img_path}")
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    aligned = mtcnn(img_rgb)
    if aligned is None:
        # fallback zero tensor
        return torch.zeros(3, 112, 112, device=device)
    return aligned.to(device)

# 2. ArcFace backbone + projector
class ArcFaceEmbedder(nn.Module):
    def __init__(self, device):
        super().__init__()
        # Load pre-trained ArcFace (ResNet100) via Mozuma
        self.backbone = torch_arcface_insightface(device=device)
        # Freeze backbone parameters
        for p in self.backbone.parameters():
            p.requires_grad = False
        # Projection head to 128-d
        self.projector = nn.Sequential(
            nn.Linear(512, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
        )

    def forward(self, x):
        # x: (B,3,112,112)
        with torch.no_grad():
            emb = self.backbone(x)  # (B,512)
        out = self.projector(emb)  # (B,128)
        return nn.functional.normalize(out, p=2, dim=1)

# 3. Dataset + filtering missing
class BMIDataset(Dataset):
    def __init__(self, df, images_dir):
        self.df = df.reset_index(drop=True)
        self.images_dir = images_dir

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.loc[idx]
        path = os.path.join(self.images_dir, row['name'])
        aligned = load_and_align(path)
        bmi = torch.tensor(row['bmi'], dtype=torch.float32, device=device)
        return aligned, bmi

# Filter entries with missing files
def filter_existing(df, images_dir):
    exists = df['name'].apply(lambda f: os.path.isfile(os.path.join(images_dir, f)))
    missing = df.loc[~exists, 'name'].tolist()
    if missing:
        print(f"Filtering {len(missing)} missing images: {missing}")
    return df.loc[exists].reset_index(drop=True)

# Load and preprocess annotations
df = pd.read_csv('data/data.csv')
df['bmi'] = df['bmi'].astype(float)
df['is_training'] = df['is_training'].astype(int)
df['name'] = df['name'].astype(str)
images_dir = os.path.join('data', 'Images')
df = filter_existing(df, images_dir)
print(f"Dataset contains {len(df)} valid entries.")

# Split
train_df = df[df['is_training'] == 1]
val_df   = df[df['is_training'] == 0]

# DataLoaders
train_ds = BMIDataset(train_df, images_dir)
val_ds   = BMIDataset(val_df, images_dir)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4, pin_memory=True)

# 4. Model, loss, optimizer
embedder = ArcFaceEmbedder(device).to(device)
miner     = miners.TripletMarginMiner(margin=0.2, type_of_triplets='semi-hard')
criterion = losses.TripletMarginLoss(margin=0.2)
optimizer = torch.optim.AdamW(embedder.projector.parameters(), lr=1e-4, weight_decay=1e-5)

# Fine‑tune projection head
print("Fine-tuning embeddings...")
for epoch in range(10):
    embedder.train()
    running = 0.0
    for imgs, bmis in train_loader:
        feats = embedder(imgs)
        labels = torch.round(bmis).long()
        triplets = miner(feats, labels)
        loss = criterion(feats, labels, triplets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running += loss.item()
    print(f"Epoch {epoch+1}, Loss: {running/len(train_loader):.4f}")

# 5. Extract embeddings
def get_embeddings(df):
    embedder.eval()
    all_embs = []
    for _, row in df.iterrows():
        img = load_and_align(os.path.join(images_dir, row['name']))
        with torch.no_grad():
            emb = embedder(img.unsqueeze(0)).cpu().numpy().ravel()
        all_embs.append(emb)
    return np.vstack(all_embs)

print("Extracting embeddings...")
X_train = get_embeddings(train_df)
X_val   = get_embeddings(val_df)
y_train = train_df['bmi'].values
y_val   = val_df['bmi'].values

# 6. Train XGBoost
xgb_model = xgb.XGBRegressor(max_depth=6, learning_rate=0.05, n_estimators=200, objective='reg:squarederror')
print("Training regressor...")
xgb_model.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=10, verbose=True)

# 7. Evaluate
preds = xgb_model.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, preds))

# 8. Save
torch.save(embedder.state_dict(), 'arcface_bmi_embedder.pt')
xgb_model.save_model('bmi_xgb.json')

# 9. Inference
def predict_bmi(path):
    img = load_and_align(path)
    with torch.no_grad():
        emb = embedder(img.unsqueeze(0)).cpu().numpy()
    return xgb_model.predict(emb)[0]

if __name__ == '__main__':
    sample = train_df.iloc[0]['name']
    print("Example BMI:", predict_bmi(os.path.join(images_dir, sample)))


Using device: mps
Filtering 244 missing images: ['img_4.bmp', 'img_5.bmp', 'img_40.bmp', 'img_86.bmp', 'img_93.bmp', 'img_112.bmp', 'img_113.bmp', 'img_238.bmp', 'img_239.bmp', 'img_300.bmp', 'img_301.bmp', 'img_309.bmp', 'img_436.bmp', 'img_437.bmp', 'img_530.bmp', 'img_531.bmp', 'img_579.bmp', 'img_617.bmp', 'img_670.bmp', 'img_671.bmp', 'img_734.bmp', 'img_735.bmp', 'img_742.bmp', 'img_820.bmp', 'img_824.bmp', 'img_860.bmp', 'img_861.bmp', 'img_1036.bmp', 'img_1037.bmp', 'img_1070.bmp', 'img_1099.bmp', 'img_1121.bmp', 'img_1127.bmp', 'img_1164.bmp', 'img_1172.bmp', 'img_1173.bmp', 'img_1300.bmp', 'img_1301.bmp', 'img_1324.bmp', 'img_1325.bmp', 'img_1326.bmp', 'img_1327.bmp', 'img_1472.bmp', 'img_1473.bmp', 'img_1524.bmp', 'img_1525.bmp', 'img_1612.bmp', 'img_1613.bmp', 'img_1618.bmp', 'img_1619.bmp', 'img_1704.bmp', 'img_1718.bmp', 'img_1719.bmp', 'img_1720.bmp', 'img_1721.bmp', 'img_1822.bmp', 'img_1823.bmp', 'img_1830.bmp', 'img_1846.bmp', 'img_1847.bmp', 'img_1848.bmp', 'img_1849

Traceback (most recent call last):
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "<string>", line 1, in <module>
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
Traceback (most recent call last):
  File "<string>", line 1, in <module>
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
  File "<string>", line 1, in <module>
  File "/Library/Frameworks/Python.framework/Versions/3.12/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
    exitcode = _main(fd, parent_sentinel)
          exitcode = _main(fd, parent_sentinel) 
                        ^ ^ ^ ^ ^ ^ ^ ^ ^ ^ ^^^^^^^^^^^    ^^exitcode =

RuntimeError: DataLoader worker (pid(s) 38973, 38975) exited unexpectedly

In [None]:
# Filter out rows with missing image files
images_dir = os.path.join('data', 'Images')
df = filter_existing_images(df, images_dir)
print(f"Loaded {len(df)} entries with images from {images_dir}")

# Split train/validation based on 'is_training'
train_df = df[df['is_training'] == 1]
val_df   = df[df['is_training'] == 0]

# Create datasets
datasets = {
    'train': BMIDataset(train_df, images_dir),
    'val':   BMIDataset(val_df, images_dir)
}

# 4. Fine‑tune embeddings with Triplet Loss
miner     = miners.TripletMarginMiner(margin=0.2, type_of_triplets='semi-hard')
criterion = losses.TripletMarginLoss(margin=0.2)
embedder  = ArcFaceEmbedder().to(device)
optimizer = torch.optim.AdamW(embedder.parameters(), lr=1e-4, weight_decay=1e-5)

print("Training with Triplet Loss...")
train_loader = DataLoader(datasets['train'], batch_size=32, shuffle=True)
for epoch in range(10):
    embedder.train()
    total_loss = 0.0
    for imgs, bmis in train_loader:
        feats = embedder(imgs)
        labels = torch.round(bmis).long()
        hard_pairs = miner(feats, labels)
        loss = criterion(feats, labels, hard_pairs)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1} – Triplet Loss: {total_loss/len(train_loader):.4f}")

# 5. Extract embeddings for regression
def extract_embeddings(df, images_dir):
    embedder.eval()
    embeddings = []
    for _, row in df.iterrows():
        img_path = os.path.join(images_dir, row['name'])
        aligned = load_and_align(img_path).to(device)
        with torch.no_grad():
            emb = embedder(aligned.unsqueeze(0)).cpu().numpy().ravel()
        embeddings.append(emb)
    return np.vstack(embeddings)

# Extract embeddings for training and validation sets
print("Extracting embeddings for training and validation sets...")
X_train = extract_embeddings(train_df, images_dir)
X_val   = extract_embeddings(val_df, images_dir)
y_train = train_df['bmi'].values
y_val   = val_df['bmi'].values

# 6. Train XGBoost Regressor
xgb_model = xgb.XGBRegressor(
    max_depth=6,
    learning_rate=0.05,
    n_estimators=200,
    objective='reg:squarederror',
)

print("Training XGBoost Regressor...")
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True
)

# 7. Evaluate
print("Evaluating on validation set...")
preds = xgb_model.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, preds))

# 8. Save models
torch.save(embedder.state_dict(), 'arcface_bmi_embedder.pth')
xgb_model.save_model('bmi_xgb.json')

# 9. Inference helper
def predict_bmi(image_path):
    aligned = load_and_align(image_path).to(device)
    with torch.no_grad():
        emb = embedder(aligned.unsqueeze(0)).cpu().numpy()
    return xgb_model.predict(emb)[0]

if __name__ == '__main__':
    sample_name = train_df.iloc[0]['name']
    sample_path = os.path.join(images_dir, sample_name)
    print("Example prediction:", predict_bmi(sample_path))
