## Start Code

In [12]:
# Import necessary libraries
import json
from pathlib import Path

import cv2
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

In [8]:
# Choose the compute backend in priority order: CUDA, then MPS (Apple Silicon
# GPUs), and finally the CPU. The conditional expression short-circuits, so
# the MPS probe only runs when CUDA is unavailable.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)
# Show which backend was picked
print(device)



## Create Torch Dataset

In [13]:
# Define a custom Dataset class for keypoints
class KeypointsDataset(Dataset):
    """Image / keypoint-regression dataset driven by a JSON annotation file.

    Every annotation entry is expected to provide:

    * ``"id"``  - stem of the PNG file inside ``img_dir`` (``<id>.png``).
    * ``"kps"`` - keypoint coordinates in original-image pixels. After
      flattening, even positions are treated as x and odd positions as y.

    Args:
        img_dir: Directory that contains the ``<id>.png`` images.
        data_file: Path to the JSON file holding the list of annotations.
        img_size: Side length (pixels) of the square image produced by the
            transform; keypoints are rescaled into the same frame.
            Defaults to 224.
    """

    def __init__(self, img_dir, data_file, img_size=224):
        self.img_dir = img_dir
        self.img_size = img_size

        # Load the list of annotation entries from the JSON file
        with open(data_file, 'r') as f:
            self.data = json.load(f)

        # Fixed preprocessing pipeline applied to every image
        self.transform = transforms.Compose([
            transforms.ToPILImage(),
            transforms.Resize((img_size, img_size)),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406], # Mean and standard deviation for ImageNet
                                 std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of annotation entries."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the transformed image and its rescaled keypoints.

        Args:
            idx: Index into the annotation list.

        Returns:
            Tuple ``(img, kps)`` where ``img`` is the output of the transform
            pipeline and ``kps`` is a flat ``float32`` NumPy array of
            ``[x0, y0, x1, y1, ...]`` scaled to the resized image.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be
                decoded by OpenCV.
        """
        item = self.data[idx]
        img_path = f'''{self.img_dir}/{item["id"]}.png'''
        img = cv2.imread(img_path)
        # cv2.imread signals failure by returning None instead of raising;
        # fail early with the offending path rather than an AttributeError.
        if img is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        h, w = img.shape[:2] # Original height and width, needed to rescale keypoints

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) # OpenCV loads BGR; convert to RGB
        img = self.transform(img) # Apply the transformations

        kps = np.array(item["kps"], dtype=np.float32).flatten()  # Ensure float dtype
        kps[::2] *= self.img_size / w # adjust x-coordinates to match the resized image
        kps[1::2] *= self.img_size / h # adjust y-coordinates to match the resized image

        return img, kps # Return the image and keypoints

In [14]:
# Root folder of the tennis keypoints dataset; the four paths below are built
# from it. Training and validation share the same image folder and differ only
# in their annotation files.
_keypoints_data_root = "/Applications/saggydev/projects_learning/video-analytics-project/tennis_analysis_reproduce/data/tennis_keypoints_data"

train_images_path = _keypoints_data_root + "/images/"
train_details_path = _keypoints_data_root + "/data_train.json"
valid_images_path = _keypoints_data_root + "/images"
valid_details_path = _keypoints_data_root + "/data_val.json"

In [15]:
# Build one dataset per split from its image folder and annotation file
train_dataset = KeypointsDataset(train_images_path, train_details_path)
valid_dataset = KeypointsDataset(valid_images_path, valid_details_path)

# Shuffle only the training split so each epoch sees a different batch order;
# keep the validation order fixed so evaluation runs are reproducible.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

## Create Model

In [16]:
# The regression head emits one (x, y) pair per keypoint
NUM_KEYPOINTS = 14

# `pretrained=True` is deprecated since torchvision 0.13; IMAGENET1K_V1 is the
# checkpoint that flag used to load, so the starting weights are unchanged.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
# Swap the ImageNet classifier for a linear head sized to the keypoint vector
model.fc = torch.nn.Linear(model.fc.in_features, NUM_KEYPOINTS * 2)
model = model.to(device)







## Train the model

In [17]:
# Adam over every model parameter with a learning rate of 1e-3
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)
# Mean-squared error between predicted and target keypoint coordinates
criterion = torch.nn.MSELoss()

In [18]:
epochs = 20
for epoch in range(epochs):
    for i, batch in enumerate(train_loader):
        # Move both tensors of the batch onto the selected device
        imgs, kps = (tensor.to(device) for tensor in batch)

        # Forward pass and loss against the target keypoints
        outputs = model(imgs)
        loss = criterion(outputs, kps)

        # Clear stale gradients, backpropagate, then update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report progress on every 10th batch (batch 0 included)
        if not i % 10:
            print(f"Epoch {epoch}, Batch {i}, Loss: {loss.item()}")



## Save the model

In [20]:
# Persist only the learned weights (state_dict). Create the target folder
# first so a missing "../models" directory cannot fail the save after training.
model_path = Path("../models/keypoints_model.pth")
model_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), model_path)