In [37]:
from SEMPIDataLoader import InterPersenSEMPIDataset, DataSetLoader, DataLoader
from SEMPIDataLoader import create_dataloaders
from SEMPIDataLoader import DATA_PATH

import os
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

In [11]:
# load the dataset and dataloaders with pickle
with open(os.path.join(DATA_PATH, 'dataset.pkl'), 'rb') as f:
    dataset: InterPersenSEMPIDataset = pickle.load(f)

train_loader, val_loader = create_dataloaders(dataset, batch_size=32)
print(len(train_loader), len(val_loader))
print(len(dataset))
print("Data loaded successfully!")
print(f"Train size: {len(train_loader.dataset)}")
print(f"Val size: {len(val_loader.dataset)}")

for i, data in enumerate(train_loader):
    print(f"Batch {i}")
    if i == 2:
        break
    print(data['features'].shape)
    print(data['pids'])
    print(data['score'])


382 96
15256
Data loaded successfully!
Train size: 12204
Val size: 3052
Batch 0
torch.Size([32, 2, 329, 64])
tensor([[7, 2],
        [4, 5],
        [4, 1],
        [8, 4],
        [8, 6],
        [3, 4],
        [4, 2],
        [4, 5],
        [2, 4],
        [4, 6],
        [2, 1],
        [6, 4],
        [2, 5],
        [1, 2],
        [5, 7],
        [1, 5],
        [3, 1],
        [6, 2],
        [5, 7],
        [1, 4],
        [4, 3],
        [1, 4],
        [4, 5],
        [4, 1],
        [1, 8],
        [1, 5],
        [8, 7],
        [5, 1],
        [3, 6],
        [8, 4],
        [1, 5],
        [5, 4]], dtype=torch.int32)
tensor([ 1.6667e-02, -1.6667e-02,  1.5000e-01,  5.0000e-02, -1.3333e-01,
         8.3333e-02, -1.3878e-17,  1.8333e-01, -1.8333e-01, -3.6667e-01,
         1.8333e-01,  1.8333e-01, -1.0000e-01,  1.6667e-01, -2.7756e-17,
         1.6667e-02, -5.0000e-02,  5.0000e-02, -6.6667e-02, -1.6667e-02,
         5.0000e-02, -6.6667e-02, -6.6667e-02,  1.6667e-01, -2.7756

In [35]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CrossAttention(nn.Module):
    def __init__(self, feature_dim, num_heads):
        super(CrossAttention, self).__init__()
        self.attention = nn.MultiheadAttention(embed_dim=feature_dim, num_heads=num_heads, batch_first=True)
    
    def forward(self, main_input, attending_input):
        """
        main_input: (batch_size, num_features, num_frames)
        attending_input: (batch_size, num_features, num_frames)
        """
        main_input = main_input.permute(0, 2, 1)  # (batch_size, num_frames, num_features)
        attending_input = attending_input.permute(0, 2, 1)  # (batch_size, num_frames, num_features)
        attended_output, attention_weights = self.attention(main_input, attending_input, attending_input)
        return attended_output.permute(0, 2, 1)  # (batch_size, num_features, num_frames)


class EngagementPredictor(nn.Module):
    def __init__(self, num_features, num_frames, hidden_dim=128, num_heads=4):
        super(EngagementPredictor, self).__init__()
        self.cross_attention = CrossAttention(feature_dim=num_features, num_heads=num_heads)
        self.mlp = nn.Sequential(
            nn.Linear(num_features * num_frames, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            nn.Tanh()
        )
    
    def forward(self, x):
        main_input, attending_input = x[:, 0, :, :], x[:, 1, :, :]
        attended_output = self.cross_attention(main_input, attending_input)
        flattened = attended_output.contiguous().reshape(attended_output.size(0), -1) # Flatten (batch_size, num_features * num_frames)
        return self.mlp(flattened).squeeze(-1)  # Output a single value per sample


def train_model(model, train_loader, val_loader, num_epochs=10, lr=1e-4, device='cuda'):
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    
    for epoch in tqdm(range(num_epochs)):
        model.train()
        total_loss = 0
        for batch in train_loader:
            feat = batch['features'].to(device)
            optimizer.zero_grad()
            predictions = model(feat)
            loss = criterion(predictions, batch['score'].to(device))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch+1}: Training Loss = {total_loss / len(train_loader)}")
        
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                feat = batch['features'].to(device)
                predictions = model(feat)
                loss = criterion(predictions, batch['score'].to(device))
                val_loss += loss.item()
        
        print(f"Validation Loss = {val_loss / len(val_loader)}")
    
    return model

In [38]:
num_features = 329
num_frames = 64

model = train_model(EngagementPredictor(num_features, num_frames, num_heads=7), # it must be divisor of 329
            train_loader, val_loader, num_epochs=10, lr=1e-4, device='cpu')
print("Training completed successfully")

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1: Training Loss = 1.0101529877847402


 10%|█         | 1/10 [00:10<01:34, 10.50s/it]

Validation Loss = 1.0103347015877564
Epoch 2: Training Loss = 1.0099812193690794


 20%|██        | 2/10 [00:20<01:20, 10.11s/it]

Validation Loss = 1.0103347015877564
Epoch 3: Training Loss = 1.0098998749443373


 30%|███       | 3/10 [00:30<01:09,  9.99s/it]

Validation Loss = 1.0103347015877564
Epoch 4: Training Loss = 1.010009327328018


 40%|████      | 4/10 [00:40<00:59,  9.98s/it]

Validation Loss = 1.0103347015877564
Epoch 5: Training Loss = 1.0101095759431729


 50%|█████     | 5/10 [00:49<00:49,  9.91s/it]

Validation Loss = 1.0103347015877564
Epoch 6: Training Loss = 1.009867258408931


 60%|██████    | 6/10 [00:59<00:39,  9.94s/it]

Validation Loss = 1.0103347015877564
Epoch 7: Training Loss = 1.010077150860382


 70%|███████   | 7/10 [01:09<00:29,  9.88s/it]

Validation Loss = 1.0103347015877564
Epoch 8: Training Loss = 1.0099845151002493


 80%|████████  | 8/10 [01:19<00:19,  9.83s/it]

Validation Loss = 1.0103347015877564
Epoch 9: Training Loss = 1.0099701670764005


 90%|█████████ | 9/10 [01:29<00:09,  9.89s/it]

Validation Loss = 1.0103347015877564
Epoch 10: Training Loss = 1.0101879359227826


100%|██████████| 10/10 [01:39<00:00,  9.94s/it]

Validation Loss = 1.0103347015877564
Training completed successfully





In [39]:
# see models's prediction on a batch
for i, data in enumerate(val_loader):
    print(f"Batch {i}")
    if i == 2:
        break
    print(data['features'].shape)
    print(data['pids'])
    print(data['score'])
    print(model(data['features']).detach().numpy())
    print("")

Batch 0
torch.Size([32, 2, 329, 64])
tensor([[1, 4],
        [3, 5],
        [1, 5],
        [8, 1],
        [4, 1],
        [1, 4],
        [5, 2],
        [4, 3],
        [7, 2],
        [2, 4],
        [2, 6],
        [5, 6],
        [6, 5],
        [6, 1],
        [5, 1],
        [5, 3],
        [2, 4],
        [4, 1],
        [6, 7],
        [3, 6],
        [6, 5],
        [3, 5],
        [4, 1],
        [4, 3],
        [4, 2],
        [4, 1],
        [2, 4],
        [2, 1],
        [2, 1],
        [3, 4],
        [5, 3],
        [6, 4]], dtype=torch.int32)
tensor([ 8.3333e-02, -2.6667e-01,  3.3333e-02,  5.0000e-02, -3.6667e-01,
        -1.0000e-01,  3.3333e-02,  8.3333e-02, -8.3333e-02, -2.7756e-17,
        -3.8333e-01, -1.3333e-01,  1.6667e-01,  2.0000e-01,  6.6667e-02,
         3.3333e-02, -2.6667e-01,  1.1667e-01,  1.6667e-01, -2.7756e-17,
         2.0000e-01,  1.8333e-01, -1.6667e-01, -3.3333e-02,  5.0000e-02,
         5.0000e-02, -2.8333e-01,  5.0000e-02, -2.7756e-17, -1.000

In [40]:
# Save the model
torch.save(model.state_dict(), 'engagement_predictor_xatn.pth')
print("Model saved successfully")

Model saved successfully


In [41]:
# Load the model
model = EngagementPredictor(num_features, num_frames, num_heads=7)
model.load_state_dict(torch.load('engagement_predictor_xatn.pth'))
model.eval()
print("Model loaded successfully")

Model loaded successfully


In [42]:
model

EngagementPredictor(
  (cross_attention): CrossAttention(
    (attention): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=329, out_features=329, bias=True)
    )
  )
  (mlp): Sequential(
    (0): Linear(in_features=21056, out_features=128, bias=True)
    (1): ReLU()
    (2): Linear(in_features=128, out_features=1, bias=True)
    (3): Tanh()
  )
)