In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm

In [4]:
import math

class ArcMarginProduct(nn.Module):
    """ArcFace-style additive angular margin head.

    Computes the cosine between L2-normalised inputs and L2-normalised class
    weights, replaces the target-class cosine ``cos(theta)`` by
    ``cos(theta + m)`` and multiplies every logit by ``s``.

    Args:
        in_features: size of each input embedding.
        out_features: number of classes (rows of the weight matrix).
        s: scale applied to the final logits.
        m: angular margin, in radians, added to the target-class angle.
        easy_margin: if True, the margin is only applied where ``cos(theta) > 0``;
            otherwise it is applied where ``cos(theta) > cos(pi - m)`` and the
            linear fallback ``cos(theta) - sin(pi - m) * m`` is used elsewhere.
    """

    # Lower bound for ``1 - cos^2`` before the square root. Rounding can push
    # the value to 0 or slightly below it, which makes sqrt return NaN or
    # produce an infinite gradient.
    _SINE_SQ_EPS = 1e-7

    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        # torch.empty only allocates; the values come from the Xavier init below.
        self.weight = nn.Parameter(torch.empty(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Cosine threshold below which theta + m would go past pi.
        self.th = math.cos(math.pi - m)
        # Constant subtracted from the cosine in that fallback region.
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        """Return the scaled margin logits.

        Args:
            input: batch of embeddings; rows are normalised before use.
            label: integer class index per row; any shape that flattens to
                one entry per row of ``input``.

        Returns:
            Tensor with one row per input and one column per class, ready for
            ``nn.CrossEntropyLoss``.
        """
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine_sq = torch.clamp(1.0 - cosine.pow(2), min=self._SINE_SQ_EPS, max=1.0)
        sine = torch.sqrt(sine_sq)
        # cos(theta + m) = cos(theta) cos(m) - sin(theta) sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m

        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)

        # zeros_like keeps dtype and device in sync with the logits.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1.0)

        # Margin logit for the target class, plain cosine for the others.
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output = output * self.s

        return output


In [5]:
data_dir = "../data/preprocessed"

# Preprocessing: resize to 112x112, convert to a tensor, then normalise with
# mean 0.5 and std 0.5.
transform = transforms.Compose(
    [
        transforms.Resize((112, 112)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

# One sub-folder per class under <data_dir>/train.
train_dataset = datasets.ImageFolder(data_dir + "/train", transform)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)

num_classes = len(train_dataset.classes)
print("Clases:", train_dataset.classes)


Clases: ['Abir Ahmed', 'Adriana Sanchez', 'Adriana Solanilla', 'Alejandro Tulipano', 'Amy Olivares', 'Blas de Leon', 'Carlos Beitia', 'Carlos Hernandez', 'Cesar Rodriguez', 'Javier Bustamante', 'Jeremy Sanchez', 'Jonathan Peralta', 'Kevin Rodriguez', 'Mahir Arcia', 'Michael Jordan']


In [None]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained ResNet-50 whose classifier is replaced by a 512-D embedding layer.
backbone = models.resnet50(pretrained=True)
backbone.fc = nn.Linear(in_features=backbone.fc.in_features, out_features=512)
backbone = backbone.to(device)

# Margin head mapping the 512-D embeddings to one logit per class.
arc_margin = ArcMarginProduct(in_features=512, out_features=num_classes).to(device)

criterion = nn.CrossEntropyLoss()

# A single optimizer updates the backbone and the margin head together.
optimizer = optim.SGD(
    [*backbone.parameters(), *arc_margin.parameters()],
    lr=0.01,
    momentum=0.9,
    weight_decay=5e-4,
)

In [None]:
num_epochs = 10
epoch_losses = []  # one mean training loss per completed epoch

for epoch in range(num_epochs):
    backbone.train()
    arc_margin.train()
    running_loss = 0.0
    batches_seen = 0
    pbar = tqdm(train_loader)
    for inputs, labels in pbar:
        inputs, labels = inputs.to(device), labels.to(device)

        features = backbone(inputs)
        outputs = arc_margin(features, labels)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        batches_seen += 1
        # Mean over the batches processed so far, so the number shown is
        # meaningful before the epoch finishes.
        pbar.set_description(f"Epoch {epoch+1}/{num_epochs} Loss: {running_loss/batches_seen:.4f}")

    # End of epoch: record exactly one value per epoch. This must sit outside
    # the batch loop, otherwise a partial average is appended for every batch.
    epoch_loss = running_loss / max(batches_seen, 1)
    epoch_losses.append(epoch_loss)
        

In [None]:
import os

# torch.save does not create missing directories; make sure the target folder
# exists so a finished training run is not lost to a FileNotFoundError.
os.makedirs("../models", exist_ok=True)

# Only the weights (state dicts) are stored, not the module objects.
torch.save(backbone.state_dict(), "../models/arcface_backbone.pth")
torch.save(arc_margin.state_dict(), "../models/arcface_margin.pth")

In [None]:
# No earlier cell defines `test_loader`, so build it here from the test split,
# using the same preprocessing as training.
# NOTE(review): ImageFolder derives label indices from the folder names; they
# match the training indices only if both splits contain the same classes.
test_dataset = datasets.ImageFolder(root=f"{data_dir}/test", transform=transform)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

backbone.eval()
embedding_batches = []
label_batches = []
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        features = backbone(inputs)
        # `features` are the 512-D embeddings; keep them on the CPU instead of
        # discarding them at the end of each iteration.
        embedding_batches.append(features.cpu())
        label_batches.append(labels)

# Stack the per-batch results; stays empty if the test split has no images.
test_embeddings = torch.cat(embedding_batches) if embedding_batches else torch.empty(0, 512)
test_labels = torch.cat(label_batches) if label_batches else torch.empty(0, dtype=torch.long)


## Celda para guardar métricas en JSON

In [None]:
import json

# Gather the training run summary in a single dictionary
metrics = dict(
    loss_per_epoch=epoch_losses,
    num_epochs=num_epochs,
    batch_size=train_loader.batch_size,
    num_classes=num_classes,
)

# Serialise it to JSON in the current working directory
with open("metrics.json", "w") as f:
    f.write(json.dumps(metrics, indent=4))

print("✅ Métricas guardadas en metrics.json")


## Crear la galería de embeddings

In [None]:
import os
import random
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image

# Configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
TEST_DIR = "../data/preprocessed/test"
MODEL_PATH = "../models/arcface_backbone.pth"
OUTPUT_EMBEDDINGS = "../models/gallery_embeddings.pth"
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png")

# Same preprocessing as in training
transform = transforms.Compose([
    transforms.Resize((112,112)),
    transforms.ToTensor(),
    transforms.Normalize([0.5], [0.5])
])

# Load the backbone. map_location lets a checkpoint saved from a GPU run be
# loaded on a CPU-only machine.
backbone = models.resnet50(pretrained=False)
backbone.fc = nn.Linear(backbone.fc.in_features, 512)
backbone.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
backbone = backbone.to(DEVICE)
backbone.eval()

# Maps person name (folder name) -> L2-normalised embedding on the CPU
embeddings = {}

# Walk every class folder; sorted so the processing order does not depend on
# the file system.
for person in sorted(os.listdir(TEST_DIR)):
    person_dir = os.path.join(TEST_DIR, person)
    if not os.path.isdir(person_dir):
        continue

    # Candidate images for this person
    images = sorted(f for f in os.listdir(person_dir) if f.lower().endswith(IMAGE_EXTENSIONS))
    if not images:
        print(f"⚠️ No hay imágenes en {person_dir}")
        continue

    # One image per person, picked at random. No seed is set, so the gallery
    # can differ from run to run.
    img_name = random.choice(images)
    img_path = os.path.join(person_dir, img_name)
    print(f"Procesando {person}: {img_name}")

    # Preprocess; the context manager closes the file handle right away.
    with Image.open(img_path) as raw_img:
        img = raw_img.convert("RGB")
    input_tensor = transform(img).unsqueeze(0).to(DEVICE)

    with torch.no_grad():
        emb = backbone(input_tensor)
        emb = nn.functional.normalize(emb, dim=1).squeeze(0).cpu()

    embeddings[person] = emb

# Save the gallery
torch.save(embeddings, OUTPUT_EMBEDDINGS)
print(f"\n✅ Embeddings de galería guardados en {OUTPUT_EMBEDDINGS}")


## Inferencia

In [1]:
import cv2
import torch
import numpy as np
import mediapipe as mp
from torchvision import transforms
from torch.nn.functional import normalize
from torchvision import models

# Configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load ResNet50 with the 512-D embedding head used during training.
# map_location lets a checkpoint saved from a GPU run load on a CPU-only machine.
# NOTE(review): a recorded run of this cell failed with unexpected keys
# "backbone" and "embedding", so the checkpoint on disk at that time was
# apparently not written by this notebook's training cell. Re-save it first.
backbone = models.resnet50(pretrained=False)
backbone.fc = torch.nn.Linear(backbone.fc.in_features, 512)
backbone.load_state_dict(torch.load("../models/arcface_backbone.pth", map_location=DEVICE))
backbone = backbone.to(DEVICE)
backbone.eval()

# Same preprocessing as in training; ToPILImage is needed because the input is
# a NumPy frame crop rather than a PIL image.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((112,112)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# Load the gallery embeddings. The gallery cell writes "gallery_embeddings.pth"
# (not ".pt"), so read that exact file. The tensors stay on the CPU because
# recognize_face converts them with .numpy().
reference_db = torch.load("../models/gallery_embeddings.pth", map_location="cpu")

# Mediapipe Face Detection
mp_face_detection = mp.solutions.face_detection
face_detector = mp_face_detection.FaceDetection(model_selection=0, min_detection_confidence=0.5)

# Embedding extraction
def get_embedding(face_img):
    """Return the L2-normalised backbone embedding of one face crop.

    The crop goes through the module-level ``transform``, is given a batch
    dimension and moved to ``DEVICE``; the result comes back as a 1-D NumPy
    array on the CPU.
    """
    batch = transform(face_img)[None].to(DEVICE)
    with torch.no_grad():
        unit_emb = normalize(backbone(batch), dim=1)
    return unit_emb.squeeze(0).cpu().numpy()

# Match an embedding against the reference gallery
def recognize_face(embedding, threshold=0.91):
    """Find the gallery entry most similar to ``embedding``.

    Every entry of the module-level ``reference_db`` is scored with a dot
    product against ``embedding``. Returns ``(name, score)`` for the best
    entry when its score is at least ``threshold``, otherwise
    ``("Unknown", score)``. An empty gallery yields ``("Unknown", -1)``.
    """
    top_name, top_score = None, -1
    for candidate, reference in reference_db.items():
        similarity = np.dot(embedding, reference.numpy())
        if similarity > top_score:
            top_name, top_score = candidate, similarity

    label = top_name if top_score >= threshold else "Unknown"
    return label, top_score

# ------------------------------
# Change the video source here:
#   0            -> live webcam
#   "video.mp4"  -> video file
# ------------------------------
VIDEO_SOURCE = "../data/crudo/Abir1.mp4"  # was "..data/...", missing the slash

cap = cv2.VideoCapture(VIDEO_SOURCE)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video source: {VIDEO_SOURCE!r}")

print("[INFO] Starting video stream...")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_detector.process(rgb_frame)
        ih, iw, _ = frame.shape

        for det in results.detections or []:
            bbox = det.location_data.relative_bounding_box
            left = int(bbox.xmin * iw)
            top = int(bbox.ymin * ih)
            # Clamp the box to the frame: a negative start index would wrap
            # around in NumPy slicing and select the wrong region.
            x1, y1 = max(left, 0), max(top, 0)
            x2 = min(left + int(bbox.width * iw), iw)
            y2 = min(top + int(bbox.height * ih), ih)
            if x2 <= x1 or y2 <= y1:
                continue

            # Crop from the RGB frame: the transform's ToPILImage reads the
            # array as RGB, while `frame` itself is BGR.
            face_img = rgb_frame[y1:y2, x1:x2]

            emb = get_embedding(face_img)
            name, score = recognize_face(emb)

            # Draw on the BGR frame that is displayed
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
            cv2.putText(frame, f"{name} ({score:.2f})", (x1, y1-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255,255,255), 2)

        cv2.imshow("ArcFace Recognition", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and close the windows even if a frame fails.
    cap.release()
    cv2.destroyAllWindows()




RuntimeError: Error(s) in loading state_dict for ResNet:
	Missing key(s) in state_dict: "conv1.weight", "bn1.weight", "bn1.bias", "bn1.running_mean", "bn1.running_var", "layer1.0.conv1.weight", "layer1.0.bn1.weight", "layer1.0.bn1.bias", "layer1.0.bn1.running_mean", "layer1.0.bn1.running_var", "layer1.0.conv2.weight", "layer1.0.bn2.weight", "layer1.0.bn2.bias", "layer1.0.bn2.running_mean", "layer1.0.bn2.running_var", "layer1.0.conv3.weight", "layer1.0.bn3.weight", "layer1.0.bn3.bias", "layer1.0.bn3.running_mean", "layer1.0.bn3.running_var", "layer1.0.downsample.0.weight", "layer1.0.downsample.1.weight", "layer1.0.downsample.1.bias", "layer1.0.downsample.1.running_mean", "layer1.0.downsample.1.running_var", "layer1.1.conv1.weight", "layer1.1.bn1.weight", "layer1.1.bn1.bias", "layer1.1.bn1.running_mean", "layer1.1.bn1.running_var", "layer1.1.conv2.weight", "layer1.1.bn2.weight", "layer1.1.bn2.bias", "layer1.1.bn2.running_mean", "layer1.1.bn2.running_var", "layer1.1.conv3.weight", "layer1.1.bn3.weight", "layer1.1.bn3.bias", "layer1.1.bn3.running_mean", "layer1.1.bn3.running_var", "layer1.2.conv1.weight", "layer1.2.bn1.weight", "layer1.2.bn1.bias", "layer1.2.bn1.running_mean", "layer1.2.bn1.running_var", "layer1.2.conv2.weight", "layer1.2.bn2.weight", "layer1.2.bn2.bias", "layer1.2.bn2.running_mean", "layer1.2.bn2.running_var", "layer1.2.conv3.weight", "layer1.2.bn3.weight", "layer1.2.bn3.bias", "layer1.2.bn3.running_mean", "layer1.2.bn3.running_var", "layer2.0.conv1.weight", "layer2.0.bn1.weight", "layer2.0.bn1.bias", "layer2.0.bn1.running_mean", "layer2.0.bn1.running_var", "layer2.0.conv2.weight", "layer2.0.bn2.weight", "layer2.0.bn2.bias", "layer2.0.bn2.running_mean", "layer2.0.bn2.running_var", "layer2.0.conv3.weight", "layer2.0.bn3.weight", "layer2.0.bn3.bias", "layer2.0.bn3.running_mean", "layer2.0.bn3.running_var", "layer2.0.downsample.0.weight", "layer2.0.downsample.1.weight", "layer2.0.downsample.1.bias", "layer2.0.downsample.1.running_mean", "layer2.0.downsample.1.running_var", "layer2.1.conv1.weight", 
"layer2.1.bn1.weight", "layer2.1.bn1.bias", "layer2.1.bn1.running_mean", "layer2.1.bn1.running_var", "layer2.1.conv2.weight", "layer2.1.bn2.weight", "layer2.1.bn2.bias", "layer2.1.bn2.running_mean", "layer2.1.bn2.running_var", "layer2.1.conv3.weight", "layer2.1.bn3.weight", "layer2.1.bn3.bias", "layer2.1.bn3.running_mean", "layer2.1.bn3.running_var", "layer2.2.conv1.weight", "layer2.2.bn1.weight", "layer2.2.bn1.bias", "layer2.2.bn1.running_mean", "layer2.2.bn1.running_var", "layer2.2.conv2.weight", "layer2.2.bn2.weight", "layer2.2.bn2.bias", "layer2.2.bn2.running_mean", "layer2.2.bn2.running_var", "layer2.2.conv3.weight", "layer2.2.bn3.weight", "layer2.2.bn3.bias", "layer2.2.bn3.running_mean", "layer2.2.bn3.running_var", "layer2.3.conv1.weight", "layer2.3.bn1.weight", "layer2.3.bn1.bias", "layer2.3.bn1.running_mean", "layer2.3.bn1.running_var", "layer2.3.conv2.weight", "layer2.3.bn2.weight", "layer2.3.bn2.bias", "layer2.3.bn2.running_mean", "layer2.3.bn2.running_var", "layer2.3.conv3.weight", "layer2.3.bn3.weight", "layer2.3.bn3.bias", "layer2.3.bn3.running_mean", "layer2.3.bn3.running_var", "layer3.0.conv1.weight", "layer3.0.bn1.weight", "layer3.0.bn1.bias", "layer3.0.bn1.running_mean", "layer3.0.bn1.running_var", "layer3.0.conv2.weight", "layer3.0.bn2.weight", "layer3.0.bn2.bias", "layer3.0.bn2.running_mean", "layer3.0.bn2.running_var", "layer3.0.conv3.weight", "layer3.0.bn3.weight", "layer3.0.bn3.bias", "layer3.0.bn3.running_mean", "layer3.0.bn3.running_var", "layer3.0.downsample.0.weight", "layer3.0.downsample.1.weight", "layer3.0.downsample.1.bias", "layer3.0.downsample.1.running_mean", "layer3.0.downsample.1.running_var", "layer3.1.conv1.weight", "layer3.1.bn1.weight", "layer3.1.bn1.bias", "layer3.1.bn1.running_mean", "layer3.1.bn1.running_var", "layer3.1.conv2.weight", "layer3.1.bn2.weight", "layer3.1.bn2.bias", "layer3.1.bn2.running_mean", "layer3.1.bn2.running_var", "layer3.1.conv3.weight", "layer3.1.bn3.weight", "layer3.1.bn3.bias", 
"layer3.1.bn3.running_mean", "layer3.1.bn3.running_var", "layer3.2.conv1.weight", "layer3.2.bn1.weight", "layer3.2.bn1.bias", "layer3.2.bn1.running_mean", "layer3.2.bn1.running_var", "layer3.2.conv2.weight", "layer3.2.bn2.weight", "layer3.2.bn2.bias", "layer3.2.bn2.running_mean", "layer3.2.bn2.running_var", "layer3.2.conv3.weight", "layer3.2.bn3.weight", "layer3.2.bn3.bias", "layer3.2.bn3.running_mean", "layer3.2.bn3.running_var", "layer3.3.conv1.weight", "layer3.3.bn1.weight", "layer3.3.bn1.bias", "layer3.3.bn1.running_mean", "layer3.3.bn1.running_var", "layer3.3.conv2.weight", "layer3.3.bn2.weight", "layer3.3.bn2.bias", "layer3.3.bn2.running_mean", "layer3.3.bn2.running_var", "layer3.3.conv3.weight", "layer3.3.bn3.weight", "layer3.3.bn3.bias", "layer3.3.bn3.running_mean", "layer3.3.bn3.running_var", "layer3.4.conv1.weight", "layer3.4.bn1.weight", "layer3.4.bn1.bias", "layer3.4.bn1.running_mean", "layer3.4.bn1.running_var", "layer3.4.conv2.weight", "layer3.4.bn2.weight", "layer3.4.bn2.bias", "layer3.4.bn2.running_mean", "layer3.4.bn2.running_var", "layer3.4.conv3.weight", "layer3.4.bn3.weight", "layer3.4.bn3.bias", "layer3.4.bn3.running_mean", "layer3.4.bn3.running_var", "layer3.5.conv1.weight", "layer3.5.bn1.weight", "layer3.5.bn1.bias", "layer3.5.bn1.running_mean", "layer3.5.bn1.running_var", "layer3.5.conv2.weight", "layer3.5.bn2.weight", "layer3.5.bn2.bias", "layer3.5.bn2.running_mean", "layer3.5.bn2.running_var", "layer3.5.conv3.weight", "layer3.5.bn3.weight", "layer3.5.bn3.bias", "layer3.5.bn3.running_mean", "layer3.5.bn3.running_var", "layer4.0.conv1.weight", "layer4.0.bn1.weight", "layer4.0.bn1.bias", "layer4.0.bn1.running_mean", "layer4.0.bn1.running_var", "layer4.0.conv2.weight", "layer4.0.bn2.weight", "layer4.0.bn2.bias", "layer4.0.bn2.running_mean", "layer4.0.bn2.running_var", "layer4.0.conv3.weight", "layer4.0.bn3.weight", "layer4.0.bn3.bias", "layer4.0.bn3.running_mean", "layer4.0.bn3.running_var", "layer4.0.downsample.0.weight", 
"layer4.0.downsample.1.weight", "layer4.0.downsample.1.bias", "layer4.0.downsample.1.running_mean", "layer4.0.downsample.1.running_var", "layer4.1.conv1.weight", "layer4.1.bn1.weight", "layer4.1.bn1.bias", "layer4.1.bn1.running_mean", "layer4.1.bn1.running_var", "layer4.1.conv2.weight", "layer4.1.bn2.weight", "layer4.1.bn2.bias", "layer4.1.bn2.running_mean", "layer4.1.bn2.running_var", "layer4.1.conv3.weight", "layer4.1.bn3.weight", "layer4.1.bn3.bias", "layer4.1.bn3.running_mean", "layer4.1.bn3.running_var", "layer4.2.conv1.weight", "layer4.2.bn1.weight", "layer4.2.bn1.bias", "layer4.2.bn1.running_mean", "layer4.2.bn1.running_var", "layer4.2.conv2.weight", "layer4.2.bn2.weight", "layer4.2.bn2.bias", "layer4.2.bn2.running_mean", "layer4.2.bn2.running_var", "layer4.2.conv3.weight", "layer4.2.bn3.weight", "layer4.2.bn3.bias", "layer4.2.bn3.running_mean", "layer4.2.bn3.running_var", "fc.weight", "fc.bias". 
	Unexpected key(s) in state_dict: "backbone", "embedding". 

## PRUEBA DAVID


In [None]:
import torch
from torchvision import transforms, models
from torch.nn.functional import normalize
from PIL import Image

import numpy as np

# Configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Backbone model. map_location lets a checkpoint saved from a GPU run load on
# a CPU-only machine.
backbone = models.resnet50(pretrained=False)
backbone.fc = torch.nn.Linear(backbone.fc.in_features, 512)
backbone.load_state_dict(torch.load("../models/arcface_backbone.pth", map_location=DEVICE))
backbone = backbone.to(DEVICE)
backbone.eval()

# Preprocessing
transform = transforms.Compose([
    transforms.Resize((112,112)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# Path of the reference image
IMG_PATH = "../models/davidtest.jpeg"

# Load the image; the context manager closes the file handle right away.
with Image.open(IMG_PATH) as raw_img:
    img = raw_img.convert("RGB")
input_tensor = transform(img).unsqueeze(0).to(DEVICE)

# Compute the L2-normalised embedding as a 1-D NumPy array
with torch.no_grad():
    emb = backbone(input_tensor)
    emb = normalize(emb, dim=1).squeeze(0).cpu().numpy()

# Save the embedding in NumPy's .npy format; read it back with np.load
OUTPUT_PATH = "../models/reference_embedding.npy"
np.save(OUTPUT_PATH, emb)

print(f"✅ Embedding guardado en {OUTPUT_PATH}")


In [None]:
import cv2
import torch
import numpy as np
import mediapipe as mp
from torchvision import transforms
from torch.nn.functional import normalize
from torchvision import models

# Configuration
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load ResNet50 with the 512-D embedding head. map_location lets a checkpoint
# saved from a GPU run load on a CPU-only machine.
backbone = models.resnet50(pretrained=False)
backbone.fc = torch.nn.Linear(backbone.fc.in_features, 512)
backbone.load_state_dict(torch.load("../models/arcface_backbone.pth", map_location=DEVICE))
backbone = backbone.to(DEVICE)
backbone.eval()

# Same preprocessing as in training; ToPILImage is needed because the input is
# a NumPy frame crop rather than a PIL image.
transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((112,112)),
    transforms.ToTensor(),
    transforms.Normalize([0.5]*3, [0.5]*3)
])

# Load the single reference embedding. The file is written with np.save, so it
# has to be read with np.load (torch.load cannot parse .npy). recognize_face
# iterates over .items() and calls .numpy() on each value, so the array is
# wrapped as a one-entry name -> tensor mapping.
# NOTE(review): the label is inferred from the reference image name
# "davidtest.jpeg"; change it if the reference shows someone else.
reference_db = {"David": torch.from_numpy(np.load("../models/reference_embedding.npy"))}

# Mediapipe Face Detection
mp_face_detection = mp.solutions.face_detection
face_detector = mp_face_detection.FaceDetection(model_selection=0, min_detection_confidence=0.5)

# Turn a face crop into an embedding vector
def get_embedding(face_img):
    """Embed a single face crop.

    Applies the module-level ``transform``, runs ``backbone`` on ``DEVICE``
    with gradients disabled, L2-normalises the output along dim 1 and returns
    it as a 1-D NumPy array.
    """
    preprocessed = transform(face_img)
    model_input = preprocessed.unsqueeze(0).to(DEVICE)
    with torch.no_grad():
        raw_output = backbone(model_input)
        unit_output = normalize(raw_output, dim=1)
    flat = unit_output.squeeze(0)
    return flat.cpu().numpy()

# Compare an embedding with the stored references
def recognize_face(embedding, threshold=0.91):
    """Return ``(label, score)`` for the closest reference embedding.

    The score is the dot product between ``embedding`` and each tensor stored
    in the module-level ``reference_db``. The label is the best-scoring key
    when the score reaches ``threshold`` and ``"Unknown"`` otherwise. With no
    references the result is ``("Unknown", -1)``.
    """
    best_match, best_score = None, -1
    for name in reference_db:
        score = np.dot(embedding, reference_db[name].numpy())
        if score > best_score:
            best_match, best_score = name, score
    return (best_match, best_score) if best_score >= threshold else ("Unknown", best_score)

# ------------------------------
# Change the video source here:
#   0            -> live webcam
#   "video.mp4"  -> video file
# ------------------------------
VIDEO_SOURCE = 0

cap = cv2.VideoCapture(VIDEO_SOURCE)
if not cap.isOpened():
    cap.release()
    raise RuntimeError(f"Could not open video source: {VIDEO_SOURCE!r}")

print("[INFO] Starting video stream...")

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        results = face_detector.process(rgb_frame)
        ih, iw, _ = frame.shape

        for det in results.detections or []:
            bbox = det.location_data.relative_bounding_box
            left = int(bbox.xmin * iw)
            top = int(bbox.ymin * ih)
            # Clamp the box to the frame: a negative start index would wrap
            # around in NumPy slicing and select the wrong region.
            x1, y1 = max(left, 0), max(top, 0)
            x2 = min(left + int(bbox.width * iw), iw)
            y2 = min(top + int(bbox.height * ih), ih)
            if x2 <= x1 or y2 <= y1:
                continue

            # Crop from the RGB frame: the transform's ToPILImage reads the
            # array as RGB, while `frame` itself is BGR.
            face_img = rgb_frame[y1:y2, x1:x2]

            emb = get_embedding(face_img)
            name, score = recognize_face(emb)

            # Draw on the BGR frame that is displayed
            cv2.rectangle(frame, (x1, y1), (x2, y2), (0,255,0), 2)
            cv2.putText(frame, f"{name} ({score:.2f})", (x1, y1-10),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255,255,255), 2)

        cv2.imshow("ArcFace Recognition", frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the camera and close the windows even if a frame fails.
    cap.release()
    cv2.destroyAllWindows()
