In [42]:
import torchvision.models as models
from torchinfo import summary

# Resolve the pretrained-weights enum member once so that the model constructor and
# the preprocessing lookup below are guaranteed to refer to the same checkpoint.
keypoint_rcnn_weights = models.detection.KeypointRCNN_ResNet50_FPN_Weights.COCO_V1
model = models.detection.keypointrcnn_resnet50_fpn(weights=keypoint_rcnn_weights)

# summary(model)
# Last expression of the cell: displays the preprocessing transform bundled with the weights.
keypoint_rcnn_weights.transforms()

In [45]:
import numpy as np
import os
from pathlib import Path

# All dataset paths are resolved relative to the directory the kernel was started in.
current_working_dir = os.getcwd()
data_path = Path(current_working_dir, 'data')

# Sanity check: report how many entries the training folder contains.
train_dir = data_path / 'yogaposes-aii22-challenge' / 'Train'
train_entries = os.listdir(train_dir)
print(len(train_entries))

12690


In [39]:
from PIL import Image
from IPython.display import display

# Load one sample image from the training set for the single-image experiments below.
image_path = data_path / 'yogaposes-aii22-challenge' / 'Train' / '00033.jpg'
image = Image.open(image_path)
# Force a 3-channel RGB array. `image_np` is later passed to MediaPipe's
# `pose.process`, which expects RGB input, whereas an image file may decode as
# grayscale, palette, CMYK or RGBA depending on how it was saved.
# `image` itself is left untouched so `image.format` / `image.mode` still describe the file.
image_np = np.array(image.convert('RGB'))
# print("Image size:", image.size)
# print("Image format:", image.format)
# print("Image mode:", image.mode)
# display(image)

In [None]:
# Landmark names in index order; the position in this tuple is the landmark index.
_LANDMARK_NAMES = (
    "nose",
    "left eye (inner)", "left eye", "left eye (outer)",
    "right eye (inner)", "right eye", "right eye (outer)",
    "left ear", "right ear",
    "mouth (left)", "mouth (right)",
    "left shoulder", "right shoulder",
    "left elbow", "right elbow",
    "left wrist", "right wrist",
    "left pinky", "right pinky",
    "left index", "right index",
    "left thumb", "right thumb",
    "left hip", "right hip",
    "left knee", "right knee",
    "left ankle", "right ankle",
    "left heel", "right heel",
    "left foot index", "right foot index",
)

# Human-readable labels of the form "<index> - <name>", one per landmark (33 in total).
keypoints = [f"{index} - {name}" for index, name in enumerate(_LANDMARK_NAMES)]

In [40]:
import mediapipe as mp
# import numpy as np
import cv2

# Initialize MediaPipe Pose model.
# static_image_mode=True runs full person detection on every call; the default
# (False) treats successive inputs as video frames and tracks landmarks between
# them, which is not what we want for an isolated still image.
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(
    static_image_mode=True,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5,
)

# Detect poses in an input image.
# `image_np` comes from PIL and is therefore already in RGB channel order, so no
# cv2.cvtColor(..., cv2.COLOR_BGR2RGB) step is needed here.
results = pose.process(image_np)

# `pose_landmarks` is None when no person is detected; fail with a clear message
# instead of an AttributeError on `None.landmark`.
if results.pose_landmarks is None:
    raise ValueError("MediaPipe Pose did not detect a person in the input image")

# Extract normalized keypoints: a sequence of landmarks exposing .x, .y and .z.
# NOTE(review): this rebinds `keypoints`, which an earlier cell uses for the list of
# landmark *names*; the name is kept here so cells relying on it keep working.
keypoints = results.pose_landmarks.landmark

# Example: embedding as a flattened feature vector laid out as
# [x_0 .. x_n, y_0 .. y_n, z_0 .. z_n].
feature_vector = np.concatenate([
    [kp.x for kp in keypoints],
    [kp.y for kp in keypoints],
    [kp.z for kp in keypoints],
])

# Normalize feature vector if needed
# ...

# Input embedded features into a classifier model
# classifier_model.predict(feature_vector)


In [47]:
# Number of entries currently bound to `keypoints` (the recorded output is 33).
# NOTE(review): `keypoints` is assigned in two earlier cells (the list of landmark
# names and the MediaPipe landmark container), so which object is measured here
# depends on cell execution order — confirm which one is intended.
len(keypoints)

33

In [34]:
import os

import cv2
import matplotlib.pyplot as plt
import mediapipe as mp
import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset
from torchvision import transforms


class MediapipeCroppedDataset(Dataset):
    """Dataset that replaces each image with a cropped stick-figure drawing of its pose.

    Every file in ``data_dir`` is read with OpenCV, run through MediaPipe Pose, and
    rendered as a black skeleton on a white canvas. The canvas is cropped to the
    skeleton's bounding box (plus a small pixel margin), resized to ``OUTPUT_SIZE``
    and converted to a tensor by ``transforms.ToTensor``.

    Args:
        data_dir: Directory whose entries are treated as image files.
        save_dir: Directory into which a preview of every preprocessed sample is
            written by ``__getitem__`` (created on demand). Pass ``None`` to
            disable writing previews. Defaults to ``"NewTrain"``.
    """

    # (width, height) of the image handed to ToTensor.
    OUTPUT_SIZE = (224, 224)
    # Padding, in pixels, added around the skeleton's bounding box before cropping.
    CROP_MARGIN_PX = 5

    def __init__(self, data_dir, save_dir="NewTrain"):
        self.data_dir = data_dir
        self.save_dir = save_dir
        self.image_files = os.listdir(data_dir)
        self.transform = transforms.Compose([
            self.custom_transform,
            transforms.ToTensor(),
        ])
        # One pose estimator shared by all samples. static_image_mode=True makes it
        # run detection on every image instead of tracking landmarks from the
        # previously processed one, which matters because samples are unrelated.
        self.transform_crop_proposal = mp.solutions.pose.Pose(
            static_image_mode=True,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5,
        )

    def custom_transform(self, image):
        """Render the detected pose of ``image`` as a cropped skeleton drawing.

        Args:
            image: RGB image as an ``(H, W, 3)`` uint8 array.

        Returns:
            A uint8 array of shape ``(height, width, 3)`` given by ``OUTPUT_SIZE``:
            the black-on-white skeleton cropped to its bounding box, or an
            all-zero image when no pose is detected or the bounding box lies
            entirely outside the image.
        """
        mp_pose = mp.solutions.pose
        mp_drawing = mp.solutions.drawing_utils
        out_width, out_height = self.OUTPUT_SIZE
        # Same channel count as the normal output so samples can be batched together.
        blank = np.zeros((out_height, out_width, 3), dtype=np.uint8)

        result = self.transform_crop_proposal.process(image)
        if not result.pose_landmarks:
            return blank

        # White canvas of the same size as the input, with the skeleton drawn in black.
        canvas = np.full_like(image, 255)
        mp_drawing.draw_landmarks(
            canvas, result.pose_landmarks, mp_pose.POSE_CONNECTIONS,
            landmark_drawing_spec=mp_drawing.DrawingSpec(color=(0, 0, 0), thickness=4, circle_radius=4),
            connection_drawing_spec=mp_drawing.DrawingSpec(color=(0, 0, 0), thickness=2)
        )

        # Landmark coordinates are normalized to the image size; convert them to
        # pixels *before* applying the margin so the margin really is in pixels.
        height, width = canvas.shape[:2]
        coords = np.array([(lm.x, lm.y) for lm in result.pose_landmarks.landmark])
        xs = coords[:, 0] * width
        ys = coords[:, 1] * height

        # Clamp to the canvas: landmarks may be predicted outside the visible frame.
        margin = self.CROP_MARGIN_PX
        left = max(int(xs.min()) - margin, 0)
        top = max(int(ys.min()) - margin, 0)
        right = min(int(xs.max()) + margin, width)
        bottom = min(int(ys.max()) + margin, height)
        if right <= left or bottom <= top:
            # Nothing of the skeleton is inside the image; cv2.resize would fail on an empty crop.
            return blank

        cropped = canvas[top:bottom, left:right]
        return cv2.resize(cropped, self.OUTPUT_SIZE)

    def get_label(self, img_name):
        """Return the integer label encoded as the first character of ``img_name``.

        Raises:
            ValueError: If the first character is not a digit.
        """
        # NOTE(review): assumes file names start with a single-digit class id — confirm
        # against the dataset's naming scheme.
        return int(img_name[0])

    def __len__(self):
        """Return the number of entries found in ``data_dir``."""
        return len(self.image_files)

    def __getitem__(self, idx):
        """Load, preprocess and (optionally) save a preview of sample ``idx``.

        Returns:
            ``(tensor_image, label)`` where ``tensor_image`` is the float tensor
            produced by ``self.transform`` and ``label`` comes from ``get_label``.

        Raises:
            ValueError: If the file cannot be decoded as an image.
        """
        file_name = self.image_files[idx]
        img_path = os.path.join(self.data_dir, file_name)

        image = cv2.imread(img_path)  # OpenCV decodes to BGR and returns None on failure
        if image is None:
            raise ValueError(f"Could not read image file: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        tensor_image = self.transform(image)
        label = self.get_label(file_name)

        if self.save_dir is not None:
            self._save_preview(tensor_image, file_name)

        return tensor_image, label

    def _save_preview(self, tensor_image, file_name):
        """Write ``tensor_image`` (CHW, float in [0, 1]) to ``save_dir`` as an 8-bit image."""
        os.makedirs(self.save_dir, exist_ok=True)

        # ToTensor scales pixels to [0, 1]; scale back to 0-255 before writing,
        # otherwise the saved file is almost entirely black.
        pixels = tensor_image.detach().cpu().numpy().transpose(1, 2, 0)
        pixels = np.clip(pixels * 255.0, 0, 255).round().astype(np.uint8)
        if pixels.shape[2] == 1:
            # Single-channel image: replicate to three channels before saving.
            pixels = np.repeat(pixels, 3, axis=2)

        save_path = os.path.join(self.save_dir, f"preprocessed_{file_name}")
        # transpose() yields a non-contiguous view; OpenCV wants contiguous memory.
        cv2.imwrite(save_path, cv2.cvtColor(np.ascontiguousarray(pixels), cv2.COLOR_RGB2BGR))

# Example usage:
data_directory = "Train"
# The dataset class defined in this notebook is MediapipeCroppedDataset
# (the previous `CustomDataset` name is not defined anywhere and raised NameError).
custom_dataset = MediapipeCroppedDataset(data_directory)

# Example loading one image
sample_index = 2
sample_image, label = custom_dataset[sample_index]
# Take the file name from the dataset itself so it always matches the sample loaded
# above; a second os.listdir() call is not guaranteed to return the same order.
file = custom_dataset.image_files
print(file[sample_index])
print(sample_image.shape)  # (channels, 224, 224); channels is 1 or 3
print("Label:", label)

# Tensor methods are used instead of torch.min / torch.max so this cell does not
# depend on a module-level `torch` import.
min_value = sample_image.min()
max_value = sample_image.max()

print("Minimum value:", min_value.item())
print("Maximum value:", max_value.item())

# Convert the tensor to a NumPy array
numpy_image = sample_image.squeeze().numpy()
if numpy_image.ndim == 3:
    # Multi-channel tensors are channel-first (C, H, W); imshow needs (H, W, C).
    numpy_image = numpy_image.transpose(1, 2, 0)

# Display the image
plt.imshow(numpy_image, cmap='gray')  # cmap only applies to single-channel images
plt.title("Image Title")
plt.show()