### Step 1: Install Dependencies and Import Libraries

In [1]:
# Install the notebook's dependencies into the *kernel's* environment.
# `%pip` (unlike `!pip`) always targets the interpreter this kernel runs on,
# so the packages are importable by the cells below.
# NOTE(review): versions are unpinned; pin them (e.g. `torch==X.Y.Z`) once a
# known-good combination has been established, for reproducibility.
%pip install jupyter jupyterhub
%pip install torch
%pip install torchvision
%pip install opencv-python
%pip install numpy

Collecting jupyterhub
  Downloading jupyterhub-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting alembic>=1.4 (from jupyterhub)
  Downloading alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting async-generator>=1.9 (from jupyterhub)
  Downloading async_generator-1.10-py3-none-any.whl.metadata (4.9 kB)
Collecting certipy>=0.1.2 (from jupyterhub)
  Downloading certipy-0.1.3-py3-none-any.whl.metadata (3.7 kB)
Collecting jupyter-telemetry>=0.1.0 (from jupyterhub)
  Downloading jupyter_telemetry-0.1.0-py3-none-any.whl.metadata (4.0 kB)
Collecting oauthlib>=3.0 (from jupyterhub)
  Downloading oauthlib-3.2.2-py3-none-any.whl.metadata (7.5 kB)
Collecting SQLAlchemy>=1.4 (from jupyterhub)
  Downloading SQLAlchemy-2.0.28-cp312-cp312-win_amd64.whl.metadata (9.8 kB)
Collecting Mako (from alembic>=1.4->jupyterhub)
  Downloading Mako-1.3.2-py3-none-any.whl.metadata (2.9 kB)
Collecting typing-extensions>=4 (from alembic>=1.4->jupyterhub)
  Downloading typing_extensions-4.10.0-py3-none-any.w

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
import cv2
import numpy as np

### Step 2: Define the Dataset

Assuming you have a dataset of videos with labeled gaze directions, you'll need to create a custom `Dataset` class that can handle loading these videos and preparing the data for your model.


In [ ]:
class GazeDataset(Dataset):
    """Map-style dataset pairing whole videos with gaze-direction labels.

    Each item is every frame of one video, decoded with OpenCV and stacked
    into a single tensor, together with that video's label.

    Args:
        video_paths: Indexable collection of video paths; ``video_paths[idx]``
            is handed to ``cv2.VideoCapture``.
        labels: Indexable collection aligned with ``video_paths``;
            ``labels[idx]`` is converted to a ``float32`` tensor.
        transform: Optional callable applied to each decoded frame. If its
            result (or the raw frame, when no transform is given) is not
            already a tensor it is converted with ``torch.as_tensor`` so the
            frames can be stacked.

    NOTE: OpenCV decodes colour frames in BGR channel order. No channel
    reordering is done here; convert inside ``transform`` if the downstream
    model expects RGB.
    """

    def __init__(self, video_paths, labels, transform=None):
        self.video_paths = video_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        """Return the number of videos in the dataset."""
        return len(self.video_paths)

    def __getitem__(self, idx):
        """Decode video ``idx`` and return ``(frames, label)``.

        Returns:
            frames: Tensor of all frames stacked along a new leading
                (time) dimension.
            label: ``float32`` tensor built from ``labels[idx]``.

        Raises:
            RuntimeError: If no frame could be read from the video (missing
                file, unsupported codec, empty video, ...).
        """
        path = self.video_paths[idx]
        cap = cv2.VideoCapture(path)
        frames = []
        try:
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                if self.transform is not None:
                    frame = self.transform(frame)
                # torch.stack only accepts tensors; raw OpenCV frames are
                # NumPy arrays, so convert anything that is not a tensor yet.
                if not torch.is_tensor(frame):
                    frame = torch.as_tensor(frame)
                frames.append(frame)
        finally:
            # Always free the decoder/file handle, even if a transform raised.
            cap.release()

        if not frames:
            raise RuntimeError(f"No frames could be read from video: {path!r}")

        frames = torch.stack(frames)
        label = torch.tensor(self.labels[idx], dtype=torch.float32)
        return frames, label

# class GazeNet(nn.Module):
#     

### Step 3: Model Definition

This example model uses a pre-trained ResNet for feature extraction and an LSTM for capturing temporal dynamics.


In [ ]:
class GazeDirectionModel(nn.Module):
    """ResNet-18 frame encoder followed by an LSTM and a linear head.

    Every frame of a clip is encoded independently by ResNet-18 (with its
    final fully connected layer removed), the per-frame features are fed
    through an LSTM, and the LSTM output at the last time step is projected
    to the prediction.

    Args:
        hidden_size: Size of the LSTM hidden state (default 256).
        output_dim: Number of predicted values (default 2, a 2D gaze
            direction).
        pretrained: If True (default) load the ImageNet ``IMAGENET1K_V1``
            ResNet-18 weights; if False the backbone is randomly initialised
            and nothing is downloaded.
    """

    def __init__(self, hidden_size=256, output_dim=2, pretrained=True):
        super().__init__()
        # `pretrained=True` is deprecated in torchvision; the `weights` enum
        # below selects the same checkpoint the legacy flag mapped to.
        weights = models.ResNet18_Weights.IMAGENET1K_V1 if pretrained else None
        resnet = models.resnet18(weights=weights)
        # Keep everything except the final fully connected layer.
        self.feature_extractor = nn.Sequential(*list(resnet.children())[:-1])
        # Read the feature width off the removed layer instead of hard-coding it.
        feature_dim = resnet.fc.in_features
        self.lstm = nn.LSTM(feature_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_dim)

    def forward(self, x):
        """Predict one output vector per clip.

        Args:
            x: Tensor of shape ``(batch_size, time_steps, C, H, W)``.

        Returns:
            Tensor of shape ``(batch_size, output_dim)``.

        Raises:
            ValueError: If ``x`` is not 5-dimensional.
        """
        if x.dim() != 5:
            raise ValueError(
                "expected input of shape (batch_size, time_steps, C, H, W), "
                f"got {tuple(x.shape)}"
            )
        batch_size, time_steps, C, H, W = x.shape
        # Fold time into the batch dimension so the 2D CNN sees single frames.
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x = x.reshape(batch_size * time_steps, C, H, W)
        x = self.feature_extractor(x)
        # Restore the time dimension and flatten each frame's feature map.
        x = x.reshape(batch_size, time_steps, -1)
        # LSTM was built with batch_first=True: input is (batch, seq, feature).
        lstm_out, _ = self.lstm(x)
        # Use only the output at the last time step.
        return self.fc(lstm_out[:, -1, :])



### Step 4: Training Loop

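Here's a simplified version of the training loop. It assumes that `train_video_paths` (a list of video file paths) and `train_labels` (the matching gaze-direction labels) have already been defined.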


In [ ]:
# Initialize the model, loss function, and optimizer
model = GazeDirectionModel()
criterion = nn.MSELoss()  # gaze direction is treated as a regression target
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Number of full passes over the training data.
# NOTE(review): no other cell in this notebook defines it; 10 is a placeholder
# to be tuned.
num_epochs = 10

# DataLoader for your dataset
# NOTE: `train_video_paths` and `train_labels` must be defined before this
# cell is run; nothing in this notebook creates them.
# NOTE: the default collate function stacks the items of a batch, so with
# batch_size=4 every video in a batch must yield the same number of frames at
# the same resolution (otherwise use batch_size=1 or a custom collate_fn).
train_loader = DataLoader(
    GazeDataset(
        train_video_paths,
        train_labels,
        transform=transforms.ToTensor(),
    ),
    batch_size=4,
    shuffle=True,
)

# Fail early with a clear message instead of a NameError after the first epoch.
if len(train_loader) == 0:
    raise ValueError("train_loader is empty; check train_video_paths / train_labels")

# Training loop
model.train()  # make sure layers such as BatchNorm are in training mode
for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0
    for frames, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(frames)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_batches += 1
    # Report the mean loss over the epoch rather than only the last batch's.
    print(f'Epoch {epoch+1}, Loss: {running_loss / num_batches}')

# To persist the trained weights, uncomment and pick a path:
# torch.save(model.state_dict(), "gaze_direction_model.pt")

