## Local Inference

In [95]:
%matplotlib inline
import os, json, cv2, numpy as np, matplotlib.pyplot as plt
import copy

import torch
from torch.utils.data import Dataset, DataLoader

import torchvision
from torchvision.models.detection.rpn import AnchorGenerator
from torchvision.transforms import functional as F

import time

%matplotlib inline

In [96]:
def get_model(num_keypoints, weights_path=None):
    
    anchor_generator = AnchorGenerator(sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0))
    model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=False,
                                                                   pretrained_backbone=True,
                                                                   num_keypoints=num_keypoints,
                                                                   num_classes = 2, # Background is the first class, object is the second class
                                                                   rpn_anchor_generator=anchor_generator)

    if weights_path:
        state_dict = torch.load(weights_path)
        
        # update state_dict keys
        # for key in list(state_dict.keys()):
        #     state_dict[key.replace('.0.0', '.0').replace('.1.0', '.1').replace('.2.0', '.2').replace('.3.0', '.3').replace('rpn.head.conv.0.0.', 'rpn.head.conv.').replace('rpn.head.conv.0.', 'rpn.head.conv.')] = state_dict.pop(key)

        model.load_state_dict(state_dict)        
        
    return model

In [3]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = get_model(num_keypoints = 4, weights_path='./assets/keypoint_model/weights/keypointsrcnn_weights 001.pth')
model.to(device)



KeypointRCNN(
  (transform): GeneralizedRCNNTransform(
      Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
      Resize(min_size=(640, 672, 704, 736, 768, 800), max_size=1333, mode='bilinear')
  )
  (backbone): BackboneWithFPN(
    (body): IntermediateLayerGetter(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): FrozenBatchNorm2d(64, eps=1e-05)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): Bottleneck(
          (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn1): FrozenBatchNorm2d(64, eps=1e-05)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): FrozenBatchNorm2d(64, eps=1e-05)
          (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (bn3): FrozenBatchNorm2d(256, 

## Run model on Live Camera or Video Recording

In [97]:
# frame capture rate in seconds
capture_rate = 5

# keypoint, bbox and frame lists 
keypoints_main = []
bboxes_main = []
frame_list = []
keypoint_list = []
bbox_list = []

### Test on video recording

In [98]:
import cv2

# Open the video file
video = cv2.VideoCapture("./assets/pfd_video_dataset/20221014_124610.mp4")

# Check if video is opened successfully
if not video.isOpened():
    print("Error opening video file")

# Read until video is completed
while(video.isOpened()):
    # Capture frame-by-frame
    ret, frame = video.read()
    
    if ret == True:
        if int(time.time()) % capture_rate == 0:
            frame_np = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)            
            frame_tensor = F.to_tensor(frame_np)
            frame_tensor = frame_tensor.to(device)
            
            images = [frame_tensor]

            with torch.no_grad():
                model.to(device)
                model.eval()
                output = model(images)
            
            frame_tensor = (images[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
            scores = output[0]['scores'].detach().cpu().numpy()
            
            high_scores_idxs = np.where(scores > 0.8)[0].tolist() # Indexes of boxes with scores > 0.7
            post_nms_idxs = torchvision.ops.nms(output[0]['boxes'][high_scores_idxs], output[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

            keypoints = []
            for kps in output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
                keypoints.append([list(map(int, kp[:2])) for kp in kps])
                
            bboxes = []
            for bbox in output[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
                bboxes.append(list(map(int, bbox.tolist())))

            if(len(keypoints) != 0):
                keypoints_main = keypoints
            
            if(len(bboxes) != 0):
                bboxes_main = bboxes

            # add frame to to frame_list
            frame_list.append(frame)
            keypoint_list.append(keypoints_main)
            bbox_list.append(bboxes_main)

        if(len(keypoints_main) != 0):
            points = np.array(keypoints_main, np.int32)
            points = points.reshape((-1, 1, 2))

            # Draw a keypoints on the frame
            # cv2.polylines(frame, [points], isClosed=True, color=(0, 0, 0), thickness=8)
            for point in points:
                center = (point[0][0], point[0][1])
                radius = 15

                # Draw the keypoints using the circle() function
                cv2.circle(frame, center, radius, (0, 0, 255), -1)

        if(len(bboxes_main) != 0):
            # draw a bbox on the frame
            cv2.rectangle(frame, (bboxes_main[0][0], bboxes_main[0][1]),  (bboxes_main[0][2], bboxes_main[0][3]), color=(0, 255, 0), thickness=8)

        # Display the resulting frame
        frame = cv2.resize(frame, (600, 800), interpolation=cv2.INTER_AREA)
        cv2.imshow('Frame', frame)
        
        # Press Q on keyboard to exit
        if cv2.waitKey(25) & 0xFF == ord('q'):
            break

    # Break the loop
    else:
        break

# Release the video
video.release()

# Close all windows
cv2.destroyAllWindows()

### Test on live camera

In [4]:
import cv2

# Create a VideoCapture object
cap = cv2.VideoCapture(0) # 0 means the default camera

while True:
    # Read a frame from the video
    ret, frame = cap.read()

    if int(time.time()) % capture_rate == 0:
        frame_np = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame_tensor = F.to_tensor(frame_np)
        frame_tensor = frame_tensor.to(device)
        images = [frame_tensor]


        with torch.no_grad():
            model.to(device)
            model.eval()
            output = model(images)
        
        frame_tensor = (images[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
        scores = output[0]['scores'].detach().cpu().numpy()
        
        high_scores_idxs = np.where(scores > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
        post_nms_idxs = torchvision.ops.nms(output[0]['boxes'][high_scores_idxs], output[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)

        keypoints = []
        for kps in output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
            keypoints.append([list(map(int, kp[:2])) for kp in kps])
            
        bboxes = []
        for bbox in output[0]['boxes'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
            bboxes.append(list(map(int, bbox.tolist())))
    
        if(len(keypoints) != 0):
            print(keypoints)
            keypoints_main = keypoints
        
        if(len(bboxes) != 0):
            print(bboxes)
            bboxes_main = bboxes

        # add frame to to frame_list
        frame_list.append(frame)
        keypoint_list.append(keypoints_main)
        bbox_list.append(bboxes_main)

    if(len(keypoints_main) != 0):
        points = np.array(keypoints_main, np.int32)
        points = points.reshape((-1, 1, 2))

        # Draw a keypoints on the frame
        # cv2.polylines(frame, [points], isClosed=True, color=(0, 0, 0), thickness=8)
        for point in points:
            center = (point[0][0], point[0][1])
            radius = 15

            # Draw the keypoints using the circle() function
            cv2.circle(frame, center, radius, (255, 0, 0), -1)

    if(len(bboxes_main) != 0):
        # draw a bbox on the frame
        cv2.rectangle(frame, (bboxes_main[0][0], bboxes_main[0][1]),  (bboxes_main[0][2], bboxes_main[0][3]), color=(0, 255, 0), thickness=8)
        
    # Display the frame with the bounding box
    cv2.imshow("Live Video with Bounding Box", frame)

    # `Exit the loop if the 'q' key is pressed
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the video capture object
cap.release()

# Close all windows
cv2.destroyAllWindows()

## Perspective Transformation and Timelapse

In [117]:
filtered_keypoints = []
filtered_frames = []

# remove coordinates if its x or y is lower than the average + threshold
for count, kp in enumerate(keypoint_list):
    threshold = 5
    coordinates = kp[0]

    # Sort the coordinates based on x and y
    coordinates = sorted(coordinates)

    # Get the left and right side coordinates
    left_side_coordinates = coordinates[:2]
    right_side_coordinates = coordinates[2:]

    # Get the top and bottom side coordinates
    if left_side_coordinates[0][1] < left_side_coordinates[1][1]:
        top_side_coordinates = left_side_coordinates
        bottom_side_coordinates = right_side_coordinates
    else:
        top_side_coordinates = right_side_coordinates
        bottom_side_coordinates = left_side_coordinates

    # Extract the slope angles
    left_side_slope = (top_side_coordinates[0][1] - top_side_coordinates[1][1]) / (top_side_coordinates[0][0] - top_side_coordinates[1][0])
    right_side_slope = (bottom_side_coordinates[0][1] - bottom_side_coordinates[1][1]) / (bottom_side_coordinates[0][0] - bottom_side_coordinates[1][0])
    top_side_slope = (top_side_coordinates[1][1] - top_side_coordinates[0][1]) / (top_side_coordinates[1][0] - top_side_coordinates[0][0])
    bottom_side_slope = (bottom_side_coordinates[1][1] - bottom_side_coordinates[0][1]) / (bottom_side_coordinates[1][0] - bottom_side_coordinates[0][0])

    diff_top_bottom = abs(top_side_slope - bottom_side_slope)
    diff_left_right = abs(left_side_slope - right_side_slope)

    if diff_top_bottom < threshold and diff_left_right < threshold:
        filtered_keypoints.append(coordinates)
        filtered_frames.append(frame_list[count])
    
removed = len(keypoint_list) - len(filtered_frames)
print('Length of frames removed (threshold = {}): {}'.format(threshold, removed))

Length of frames removed (threshold = 5): 65


In [93]:
filtered_keypoints = []
filtered_frames = []

# get an average for each keypoint
avg_keypoint = np.mean(keypoint_list, axis = 0)[0]

# filter threshold
threshold = 10

# remove coordinates if its x or y is lower than the average + threshold
for count, kp in enumerate(keypoint_list):
    keypoint = kp[0]
    
    if (keypoint[0][0] < (avg_keypoint[0][0] + threshold) and keypoint[0][1] < (avg_keypoint[0][1] + threshold)) \
        and (keypoint[0][0] > (avg_keypoint[0][0] - threshold) and keypoint[0][1] > (avg_keypoint[0][1] - threshold)):
        filtered_keypoints.append(keypoint)
        filtered_frames.append(frame_list[count])

removed = len(keypoint_list) - len(filtered_frames)
print('Length of frames removed (threshold = {}): {}'.format(threshold, removed))

Length of frames removed (threshold = 10): 54


In [118]:
# frame_list is defined in either live camera test or video recording test

# Calculate the frame rate
num_frames = len(filtered_frames)
timelapse_length = 20
frame_rate = int(num_frames / timelapse_length)

# Define the codec and create a video writer
fourcc = cv2.VideoWriter_fourcc(*"MP4V")
out = cv2.VideoWriter("timelapse.mp4", fourcc, frame_rate, (400, 500))

# Define the transformation matrix
for count, frame in enumerate(filtered_frames):
    bbox = bbox_list[count][0]
    keypoint = filtered_keypoints[count]

    # pt1 = np.float32([keypoint[0],keypoint[3],keypoint[1],keypoint[2]])
    pt1 = np.float32([keypoint[1],keypoint[3],keypoint[0],keypoint[2]])
    pt2 = np.float32([[0,0],[400,0],[0,500],[400,500]])
    M = cv2.getPerspectiveTransform(pt1,pt2)

    # Perform the transformation
    warped_image = cv2.warpPerspective(frame, M, (400, 500))
    # warped_image = cv2.cvtColor(warped_image,cv2.COLOR_BGR2RGB)

    out.write(warped_image)

# Release the video writer
out.release()