# Installing YOLOv7

In [1]:
# ! git clone https://github.com/WongKinYiu/yolov7.git

Cloning into 'yolov7'...


In [1]:
%cd 

/Users/werther


In [2]:
%cd yolov7
!ls

/Users/werther/yolov7
4124287451827527828img_5023-1_keypoint.mp4
4124287451827527828img_5023-2_keypoint.mp4
4124287451827527828img_5023-3_keypoint.mp4
4124287451827527828img_5023-4_keypoint.mp4
4124287451827527828img_5023-5_keypoint.mp4
4124287451827527828img_5023-6_keypoint.mp4
4124287451827527828img_5023-7_keypoint.mp4
4124287451827527828img_5023-8_keypoint.mp4
4124287451827527828img_5023-9_keypoint.mp4
4124287451827527828img_5023_keypoint.mp4
LICENSE.md
README.md
[34mcfg[m[m
dasol_pose_side_v_keypoint.mp4
[34mdata[m[m
[34mdeploy[m[m
detect.py
export.py
[34mfigure[m[m
hubconf.py
[34minference[m[m
[34mmodels[m[m
[34mpaper[m[m
requirements.txt
[34mscripts[m[m
test.py
[34mtools[m[m
train.py
train_aux.py
[34mutils[m[m
yolov7-w6-pose.pt


In [3]:
! curl -L https://github.com/WongKinYiu/yolov7/releases/download/v0.1/yolov7-w6-pose.pt -o yolov7-w6-pose.pt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  153M  100  153M    0     0  19.5M      0  0:00:07  0:00:07 --:--:-- 22.1M 0  0:00:13  0:00:01  0:00:12 18.7M0:00:05  0:00:03 19.8M


# Loading the YOLOv7 Pose Estimation Model

torch and torchvision are straightforward enough - YOLOv7 is implemented with PyTorch. The utils.datasets, utils.general and utils.plots modules come from the YOLOv7 project, and provide us with methods that help with preprocessing and preparing input for the model to run inference on. Amongst those are letterbox() to pad the image, non_max_supression_keypoint() to run the Non-Max Supression algorithm on the initial output of the model and to produce a clean output for our interpretation, as well as the output_to_keypoint() and plot_skeleton_kpts() methods to actually add keypoints to a given image, once they're predicted.

We can load the model from the weight file with torch.load(). Let's create a function to check if a GPU is available, load the model, put it in inference mode and move it to the GPU if available:

In [4]:
# !install python3-tk

In [14]:
import torch
from torchvision import transforms

from utils.datasets import letterbox
from utils.general import non_max_suppression_kpt
from utils.plots import output_to_keypoint

import pandas as pd
import cv2
import numpy as np

In [6]:
%matplotlib inline
import matplotlib.pyplot as plt

We can load the model from the weight file with torch.load(). Let's create a function to check if a GPU is available, load the model, put it in inference mode and move it to the GPU if available:

In [7]:
def load_model():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = torch.load('/Users/werther/yolov7/yolov7-w6-pose.pt', map_location=device)['model']
    # Put in inference mode
    model.float().eval()

    if torch.cuda.is_available():
        # half() turns predictions into float16 tensors
        # which significantly lowers inference time
        model.half().to(device)
    return model

model = load_model()

With the model loaded, let's create a run_inference() method that accepts a string pointing to a file on our system. The method will read the image using OpenCV (cv2), pad it with letterbox(), apply transforms to it, and turn it into a batch (the model is trained on and expects batches, as usual):

In [8]:
def run_inference(url):
    image = cv2.imread(url) # shape: (480, 640, 3)
    # Resize and pad image
    image = letterbox(image, 960, stride=64, auto=True)[0] # shape: (768, 960, 3)
    # Apply transforms
    image = transforms.ToTensor()(image) # torch.Size([3, 768, 960])
    # Turn image into batch
    image = image.unsqueeze(0) # torch.Size([1, 3, 768, 960])
    output, _ = model(image) # torch.Size([1, 45900, 57])
    return output, image

Here, we've returned the transformed image (because we'll want to extract the original and plot on it) and the outputs of the model. These outputs contain 45900 keypoint predictions, most of which overlap. We'll want to apply Non-Max Supression to these raw predictions, just as with Object Detection predictions (where many bounding boxes are predicted and then they're "collapsed" given some confidence and IoU threshold). After supression, we can plot each keypoint on the original image and display it:

Now, for some input image, such as karate.jpg in the main working directory, we can run inference, perform Non-Max Supression and plot the results with:

In [9]:
dict_coordinate = {}
reps_num = 1 # 랩스 넘버링(초기값 1)
frame_num = 1 # 이미지(=프레임) 넘버링

In [10]:
# 이미지에 스켈레톤을 그리는 함수2 => 리스트에 저장
def plot_skeleton_kpts(im, kpts, steps, orig_shape=None):
    # (추가) 좌표값들을 저장할 리스트, 데이터프레임으로 만들 딕셔너리
    lst_coordinate = []
    
    #Plot the skeleton and keypointsfor coco datatset
    palette = np.array([[255, 128, 0], [255, 153, 51], [255, 178, 102],
                        [230, 230, 0], [255, 153, 255], [153, 204, 255],
                        [255, 102, 255], [255, 51, 255], [102, 178, 255],
                        [51, 153, 255], [255, 153, 153], [255, 102, 102],
                        [255, 51, 51], [153, 255, 153], [102, 255, 102],
                        [51, 255, 51], [0, 255, 0], [0, 0, 255], [255, 0, 0],
                        [255, 255, 255]])

    skeleton = [[16, 14], [14, 12], [17, 15], [15, 13], [12, 13], [6, 12],
                [7, 13], [6, 7], [6, 8], [7, 9], [8, 10], [9, 11], [2, 3],
                [1, 2], [1, 3], [2, 4], [3, 5], [4, 6], [5, 7]]

    pose_limb_color = palette[[9, 9, 9, 9, 7, 7, 7, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16]]
    pose_kpt_color = palette[[16, 16, 16, 16, 16, 0, 0, 0, 0, 0, 0, 9, 9, 9, 9, 9, 9]]
    radius = 5
    num_kpts = len(kpts) // steps

    for kid in range(num_kpts):
        r, g, b = pose_kpt_color[kid]
        x_coord, y_coord = kpts[steps * kid], kpts[steps * kid + 1]
        if not (x_coord % 640 == 0 or y_coord % 640 == 0):
            if steps == 3:
                conf = kpts[steps * kid + 2]
                if conf < 0.5:
                    continue
#             cv2.circle(im, (int(x_coord), int(y_coord)), radius, (int(r), int(g), int(b)), -1)
            
#             # (추가) 각 포인트마다 좌표를 이미지에 출력한다.
#             font = cv2.FONT_HERSHEY_SIMPLEX
#             text = f'({int(x_coord)}, {int(y_coord)})'
#             cv2.putText(im, text, (int(x_coord), int(y_coord)), font, 0.5, (int(r), int(g), int(b)), 2)
            
            # (추가) 포인트 좌표를 리스트에 추가, 
            lst_coordinate.append(int(x_coord))
            lst_coordinate.append(int(y_coord))
            
    
    # (추가) 포인트의 개수가 17개보다 적을 경우(얼굴이 잘려 있는 영상의 경우, 포인트의 개수가 적다) : 결측치로 채우기
    if len(lst_coordinate) < 34:
        lst_nan = []
        for i in range(34 - len(lst_coordinate)):
            lst_nan.append(np.nan)
        lst_coordinate = lst_nan + lst_coordinate
        
    # (추가) 좌표 리스트를 딕셔너리에 추가
    dict_coordinate[f'img_{reps_num}_{frame_num}'] = lst_coordinate

In [11]:
# 2
def visualize_output(output):
    output = non_max_suppression_kpt(output,
                                     0.25, # Confidence Threshold
                                     0.65, # IoU Threshold
                                     nc=model.yaml['nc'], # Number of Classes
                                     nkpt=model.yaml['nkpt'], # Number of Keypoints
                                     kpt_label=True)
    with torch.no_grad():
        output = output_to_keypoint(output)
    lst_coordinate = []
    for idx in range(output.shape[0]):
        steps = 3
        kpts = output[idx, 7:].T
        num_kpts = len(kpts) // steps
        for kid in range(num_kpts):
            x_coord, y_coord = kpts[steps * kid], kpts[steps * kid + 1]
            if not (x_coord % 640 == 0 or y_coord % 640 == 0):
                if steps == 3:
                    conf = kpts[steps * kid + 2]
                    if conf < 0.5:
                        continue
                # (추가) 포인트 좌표를 리스트에 추가,
                lst_coordinate.append(int(x_coord))
                lst_coordinate.append(int(y_coord))
        # (추가) 포인트의 개수가 17개보다 적을 경우(얼굴이 잘려 있는 영상의 경우, 포인트의 개수가 적다) : 결측치로 채우기
        if len(lst_coordinate) < 34:
            lst_nan = []
            for i in range(34 - len(lst_coordinate)):
                lst_nan.append(np.nan)
            lst_coordinate = lst_nan + lst_coordinate
    return lst_coordinate

In [13]:
# 1
def visualize_output(output):
    output = non_max_suppression_kpt(output,
                                     0.25, # Confidence Threshold
                                     0.65, # IoU Threshold
                                     nc=model.yaml['nc'], # Number of Classes
                                     nkpt=model.yaml['nkpt'], # Number of Keypoints
                                     kpt_label=True)
    with torch.no_grad():
        output = output_to_keypoint(output)
    lst_coordinate = []
    for idx in range(output.shape[0]):
        steps = 3
        kpts = output[idx, 7:].T
        num_kpts = len(kpts) // steps
        for kid in range(num_kpts):
            x_coord, y_coord = kpts[steps * kid], kpts[steps * kid + 1]
            if not (x_coord % 640 == 0 or y_coord % 640 == 0):
                if steps == 3:
                    conf = kpts[steps * kid + 2]
                    if conf < 0.5:
                        continue
                # (추가) 포인트 좌표를 리스트에 추가,
                lst_coordinate.append(int(x_coord))
                lst_coordinate.append(int(y_coord))
        # (추가) 포인트의 개수가 17개보다 적을 경우(얼굴이 잘려 있는 영상의 경우, 포인트의 개수가 적다) : 결측치로 채우기
        if len(lst_coordinate) < 34:
            lst_nan = []
            for i in range(34 - len(lst_coordinate)):
                lst_nan.append(np.nan)
            lst_coordinate = lst_nan + lst_coordinate
        # (추가) 좌표 리스트를 딕셔너리에 추가
        dict_coordinate[f'img_{reps_num}_{frame_num}'] = lst_coordinate
        print(f'{reps_num}')

In [10]:
def visualize_output(output, image):
    output = non_max_suppression_kpt(output, 
                                     0.25, # Confidence Threshold
                                     0.65, # IoU Threshold
                                     nc=model.yaml['nc'], # Number of Classes
                                     nkpt=model.yaml['nkpt'], # Number of Keypoints
                                     kpt_label=True)
    with torch.no_grad():
        output = output_to_keypoint(output)
    nimg = image[0].permute(1, 2, 0) * 255
    nimg = nimg.cpu().numpy().astype(np.uint8)
    nimg = cv2.cvtColor(nimg, cv2.COLOR_RGB2BGR)
    lst_coordinate = []
    for idx in range(output.shape[0]):
        steps = 3
        kpts = output[idx, 7:].T
        num_kpts = len(kpts) // steps

        for kid in range(num_kpts):
            x_coord, y_coord = kpts[steps * kid], kpts[steps * kid + 1]
            if not (x_coord % 640 == 0 or y_coord % 640 == 0):
                if steps == 3:
                    conf = kpts[steps * kid + 2]
                    if conf < 0.5:
                        continue

                # (추가) 포인트 좌표를 리스트에 추가, 
                lst_coordinate.append(int(x_coord))
                lst_coordinate.append(int(y_coord))

        # (추가) 포인트의 개수가 17개보다 적을 경우(얼굴이 잘려 있는 영상의 경우, 포인트의 개수가 적다) : 결측치로 채우기
        if len(lst_coordinate) < 34:
            lst_nan = []
            for i in range(34 - len(lst_coordinate)):
                lst_nan.append(np.nan)
            lst_coordinate = lst_nan + lst_coordinate

        # (추가) 좌표 리스트를 딕셔너리에 추가
        dict_coordinate[f'img_{reps_num}_{frame_num}'] = lst_coordinate
        print(f'{reps_num}')
    
    
    
    
        
        
#     cv2.imshow('lena color', nimg)   

# 이미지 출력문
#     plt.figure(figsize=(12, 12))
#     plt.axis('off')
#     plt.imshow(nimg)
#     plt.show()

In [10]:
# import os

# path = f'C:/Users/Playdata/Desktop/playdata/python/project/dataset/eunseong_squat_img/no{reps_num}'
# os.listdir(path)

<br><br>

## 이미지의 joint points -> csv 과정

1. n번 랩스 이미지들의 포인트 좌표 추출하기
2. 추출한 좌표는 딕셔너리 dict_coordinate에 저장됨
3. dict_coordinate를 데이터프레임 df으로 만든다
4. df.to_csv()로 저장한다.

In [17]:
# 이미지에서 포인트 좌표와 포인트 출력 => 다 나올 경우 총 17개의 포인트가 나온다.
output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_1.jpg') # Bryan Reyes on Unsplash

visualize_output(output, image)


0


1. reps_num번 랩스 이미지들의 포인트 좌표 추출하기

In [12]:
dict_coordinate = {}
frame_num = 1 # 이미지(=프레임) 넘버링

In [None]:
for reps_num in range(1,11):
    for i in range(1, 31):
        dict_coordinate = {}
        output, _ = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
                # (추가) 좌표 리스트를 딕셔너리에 추가
        dict_coordinate[f'img_{reps_num}_{frame_num}'] = visualize_output(output)
        df = pd.DataFrame(dict_coordinate)
        df2 = df.T
        # 4. df.to_csv()로 저장한다.
        df2.to_csv(f'/Users/werther/dasol_squat_img/csv/no{reps_num}_images.csv')
        print(f'{reps_num}')        
        
        

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2


In [None]:
for reps_num in range(1,11):
    for i in range(1, 31):
        output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
        # (추가) 좌표 리스트를 딕셔너리에 추가
        dict_coordinate[f'img_{reps_num}_{frame_num}'] = visualize_output(output)
        print(f'{reps_num}')

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
3
3
3
3
3
3
3
3
3
3
3
3
3
3
3


In [14]:
reps_num = 1 # 랩스 넘버링(초기값 1)

for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [15]:
# dict_coordinate = {}
reps_num = 2 # 랩스 넘버링(초기값 1)


for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2
2


In [None]:
# dict_coordinate = {}
reps_num = 3 # 랩스 넘버링(초기값 1)


for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

3
3
3
3
3
3
3
3
3
3
3
3
3
3


In [None]:
# dict_coordinate = {}
reps_num = 4 # 랩스 넘버링(초기값 1)


for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

In [None]:
# dict_coordinate = {}
reps_num = 5 # 랩스 넘버링(초기값 1)


for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

In [None]:
# dict_coordinate = {}
reps_num = 6 # 랩스 넘버링(초기값 1)


for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

In [None]:
# dict_coordinate = {}
reps_num = 7 # 랩스 넘버링(초기값 1)


for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

In [None]:
# dict_coordinate = {}
reps_num = 8 # 랩스 넘버링(초기값 1)


for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

In [None]:
# dict_coordinate = {}
reps_num = 9 # 랩스 넘버링(초기값 1)


for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

In [None]:
# dict_coordinate = {}
reps_num = 10 # 랩스 넘버링(초기값 1)


for i in range(1, 31):
    output, image = run_inference(f'/Users/werther/dasol_squat_img/dasol_squat_img/no{reps_num}/img_{reps_num}_{i}.jpg')
    visualize_output(output, image)

In [24]:
# 커널이 죽는 문제 발생으로 인해, 1랩스씩 돌려보기

dict_coordinate = {}

In [1]:
# 2. 추출한 좌표는 딕셔너리 dict_coordinate에 저장됨
dict_coordinate

NameError: name 'dict_coordinate' is not defined

In [37]:
# 3. dict_coordinate를 데이터프레임 df으로 만든다

import pandas as pd

df = pd.DataFrame(dict_coordinate)
df2 = df.T
df2

Unnamed: 0,0,1,2,3,4,...,29,30,31,32,33
img_1_1,,,,,,...,374.0,498.0,509.0,357.0,507.0
img_1_2,,,,,,...,373.0,499.0,509.0,357.0,506.0
img_1_3,,,,,,...,371.0,500.0,510.0,356.0,506.0
img_1_4,,,,,,...,380.0,502.0,510.0,354.0,507.0
img_1_5,432.0,39.0,445.0,27.0,420.0,...,380.0,506.0,513.0,352.0,507.0
img_1_6,431.0,67.0,444.0,55.0,418.0,...,387.0,506.0,516.0,351.0,509.0
img_1_7,430.0,100.0,443.0,88.0,418.0,...,395.0,504.0,518.0,352.0,509.0
img_1_8,430.0,133.0,443.0,120.0,417.0,...,401.0,504.0,518.0,352.0,509.0
img_1_9,430.0,165.0,443.0,152.0,417.0,...,404.0,505.0,520.0,353.0,511.0
img_1_10,429.0,190.0,442.0,177.0,415.0,...,405.0,504.0,519.0,354.0,511.0


In [15]:
reps_num

5

In [16]:
# 4. df.to_csv()로 저장한다.
df2.to_csv(f'C:/Users/Playdata/Desktop/playdata/python/project/dataset/csv/eunseong_squat_45_csv/no{reps_num}_images.csv')

<br><br>