In [74]:
import pandas as pd
import numpy as np
import ast
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import DataLoader, TensorDataset

In [75]:
# Cargar la base de datos
ruta_archivo_csv = 'C:\\Users\\luisf\\Documents\\GitHub\\keypoints-transformer\\csvFinal.csv'
df = pd.read_csv(ruta_archivo_csv)

In [76]:
ruta_archivo_pickle = 'C:\\Users\\luisf\\Documents\\GitHub\\keypoints-transformer\\datasetMascaras.pkl'
df = pd.read_pickle(ruta_archivo_pickle)

In [77]:
# Convertir las cadenas de texto de las columnas en numpy arrays
columns = ['nose', 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow', 
           'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee', 
           'right_knee', 'left_ankle', 'right_ankle', 'mid_shoulder', 'mid_hip']

for column in columns:
    df[column] = df[column].apply(lambda x: np.array(ast.literal_eval(x)))


In [78]:
# Normalizar los keypoints
scaler = StandardScaler()
for column in columns:
    keypoints = np.stack(df[column].values)  # Convertir la columna en un array 2D
    keypoints_normalized = scaler.fit_transform(keypoints)  # Normalizar
    df[column] = [arr for arr in keypoints_normalized]  # Convertir de nuevo en una lista de arrays

In [79]:

# Eliminar la coordenada z de las primeras 13 columnas que tienen (x, y, z)
for column in columns[:13]:
    df[column] = df[column].apply(lambda x: x[:2])

In [80]:
# Verificar que todas las filas tienen el mismo número de dimensiones
for col in columns:
    lengths = df[col].apply(len).unique()
    if len(lengths) != 1:
        df = df[df[col].apply(len) == lengths[0]]

In [81]:
# Obtener los keypoints como un array numpy de forma (n_samples, num_keypoints, num_dimensions)
X_keypoints = np.array([np.concatenate(df[columns].iloc[i].values).ravel() for i in range(len(df))])


# Obtener las etiquetas
etiquetas = df['etiqueta'].values

# Codificar las etiquetas a valores numéricos
label_encoder = LabelEncoder()
etiquetas_codificadas = label_encoder.fit_transform(etiquetas)

# Convertir las etiquetas codificadas a float32
etiquetas_float = etiquetas_codificadas.astype(np.float32)

In [82]:
X_train, X_test, y_train, y_test = train_test_split(X_keypoints, etiquetas_codificadas, test_size=0.2, random_state=42)


In [83]:
# Separar datos de entrenamiento y prueba

X_train = torch.tensor(X_train, dtype=torch.float32)

In [84]:

# Convertir a tensores de PyTorch
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)  # Usar torch.long para las etiquetas
y_test = torch.tensor(y_test, dtype=torch.long)


  X_train = torch.tensor(X_train, dtype=torch.float32)


In [85]:
# Definir la arquitectura del modelo Transformer
class TransformerModel(nn.Module):
    def __init__(self, num_keypoints, num_dimensions, embed_dim, num_heads, ff_dim, num_classes, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.num_keypoints = num_keypoints
        self.num_dimensions = num_dimensions
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.num_classes = num_classes

        # Verificar si embed_dim es divisible por num_heads
        assert embed_dim % num_heads == 0, "embed_dim debe ser divisible por num_heads"

        # Capa de embeddings para los keypoints
        self.keypoints_embedding = nn.Linear(num_keypoints * num_dimensions, embed_dim)

        # Codificador Transformer
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=embed_dim, nhead=num_heads, dim_feedforward=ff_dim, dropout=dropout),
            num_layers=1
        )

        # Capa de salida
        self.output_layer = nn.Linear(embed_dim, num_classes)

    def forward(self, x):
        # Aplanar los keypoints
        x = x.view(-1, self.num_keypoints * self.num_dimensions)

        # Capa de embeddings lineales
        x = self.keypoints_embedding(x)

        # Ajustar la forma para el Transformer (seq_len, batch, input_size)
        x = x.unsqueeze(1).transpose(0, 1)

        # Pasar por el codificador Transformer
        x = self.transformer_encoder(x)

        # Aplanar la salida y pasar por la capa de salida
        x = x.squeeze(0)
        x = self.output_layer(x)

        return x

In [86]:
# Función para entrenar el modelo
def train_model(model, criterion, optimizer, X_train, y_train, num_epochs=10, batch_size=32):
    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        for i in range(0, len(X_train), batch_size):
            inputs = X_train[i:i+batch_size]
            labels = y_train[i:i+batch_size]

            optimizer.zero_grad()

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss / len(X_train)}')


In [87]:

# Parámetros del modelo Transformer
num_heads = 4  # Número de cabezas de atención en la capa MultiHeadAttention
embed_dim = 32  # Dimensión de los embeddings, debe ser múltiplo de num_heads
ff_dim = 64    # Dimensión de las capas feedforward
num_classes = len(label_encoder.classes_)  # Número de clases (etiquetas únicas)

In [88]:
# Inicializar el modelo
model = TransformerModel(num_keypoints=len(columns),
                         num_dimensions=2,
                         embed_dim=embed_dim,
                         num_heads=num_heads,
                         ff_dim=ff_dim,
                         num_classes=num_classes)




In [89]:

# Imprimir la arquitectura del modelo
print(model)


TransformerModel(
  (keypoints_embedding): Linear(in_features=30, out_features=32, bias=True)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=32, out_features=32, bias=True)
        )
        (linear1): Linear(in_features=32, out_features=64, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=64, out_features=32, bias=True)
        (norm1): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((32,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (output_layer): Linear(in_features=32, out_features=8, bias=True)
)


In [90]:
# Función de pérdida y optimizador
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [91]:

# Entrenar el modelo
train_model(model, criterion, optimizer, X_train, y_train, num_epochs=10)

Epoch [1/10], Loss: 0.05989601238664374
Epoch [2/10], Loss: 0.04827745597806913
Epoch [3/10], Loss: 0.03970953199054061
Epoch [4/10], Loss: 0.03501662419756451
Epoch [5/10], Loss: 0.030917209315012736
Epoch [6/10], Loss: 0.02679814173345613
Epoch [7/10], Loss: 0.023472693480134095
Epoch [8/10], Loss: 0.02119650839069564
Epoch [9/10], Loss: 0.019759342011029968
Epoch [10/10], Loss: 0.017474373040851508


In [92]:

# Evaluar el modelo
model.eval()
with torch.no_grad():
    outputs = model(X_test)
    _, predicted = torch.max(outputs, 1)
    accuracy = (predicted == y_test).sum().item() / len(y_test)
    print(f'Accuracy en el conjunto de prueba: {accuracy * 100:.2f}%')

# Guardar el modelo
torch.save(model.state_dict(), 'transformer_model.pth')


Accuracy en el conjunto de prueba: 85.55%
