In [1]:
import numpy as np
import torch
from torch.utils import data
from torchvision import models, transforms
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from matplotlib import pyplot as plt
from matplotlib import image
from matplotlib import patches
import zipfile
from pycocotools.coco import COCO
import pandas as pd
import cv2
import random as rn
from shutil import copyfile
%matplotlib inline

In [None]:
# Download Dataset
!wget http://images.cocodataset.org/zips/train2017.zip
!wget http://images.cocodataset.org/zips/val2017.zip
!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip

!mkdir data
!mv *.zip data/

# # Unzip data
with zipfile.ZipFile('data/train2017.zip', 'r') as zip_ref: zip_ref.extractall('data/')
with zipfile.ZipFile('data/val2017.zip', 'r') as zip_ref: zip_ref.extractall('data/')
with zipfile.ZipFile('data/annotations_trainval2017.zip', 'r') as zip_ref: zip_ref.extractall('data/')

In [2]:
# Load the person-keypoint annotations for both splits.
train_annot_path, val_annot_path = (
    'data/annotations/person_keypoints_train2017.json',
    'data/annotations/person_keypoints_val2017.json',
)
# One COCO index per split: training first, then validation.
train_coco = COCO(train_annot_path)
val_coco = COCO(val_annot_path)

loading annotations into memory...
Done (t=17.91s)
creating index...
index created!
loading annotations into memory...
Done (t=0.37s)
creating index...
index created!


In [3]:
# Load meta data on images: https://towardsdatascience.com/how-to-analyze-the-coco-dataset-for-pose-estimation-7296e2ffb12e
def get_meta(coco):
    """Yield one metadata record per image known to ``coco``.

    Each record is the list ``[image id, file name, width, height, annotations]``,
    where the annotations are whatever ``coco.loadAnns`` returns for the ids
    reported by ``coco.getAnnIds(imgIds=...)`` for that image.
    """
    # Snapshot the keys first so iteration does not depend on the live dict.
    for image_id in list(coco.imgs.keys()):
        info = coco.imgs[image_id]
        annotation_ids = coco.getAnnIds(imgIds=image_id)
        # basic parameters of the image
        file_name, width, height = info['file_name'], info['width'], info['height']
        # annotations of every person in the current image
        yield [image_id, file_name, width, height, coco.loadAnns(annotation_ids)]
        
# Convert Coco object to a pandas dataframe
def convert_to_df(coco):
    """Flatten a COCO object into an images frame and a persons frame.

    Returns:
        (images_df, persons_df), both indexed by ``image_id``.
        ``images_df`` has columns ``path``, ``width``, ``height`` (one row per
        image); ``persons_df`` has columns ``is_crowd``, ``bbox``, ``area``,
        ``num_keypoints``, ``keypoints`` (one row per annotation).
    """
    image_columns = ['image_id', 'path', 'width', 'height']
    person_columns = ['image_id', 'is_crowd', 'bbox', 'area', 'num_keypoints', 'keypoints']

    images_data = []
    persons_data = []
    # iterate over all images
    for img_id, img_fname, w, h, meta in get_meta(coco):
        images_data.append({
            'image_id': int(img_id),
            'path': img_fname,
            'width': int(w),
            'height': int(h)
        })
        # iterate over all annotations of this image
        for m in meta:
            persons_data.append({
                'image_id': m['image_id'],
                'is_crowd': m['iscrowd'],
                'bbox': m['bbox'],
                'area': m['area'],
                'num_keypoints': m['num_keypoints'],
                'keypoints': m['keypoints'],
            })

    # Passing the columns explicitly keeps 'image_id' present even when a list
    # is empty; without it set_index raises KeyError for an empty input.
    images_df = pd.DataFrame(images_data, columns=image_columns).set_index('image_id')
    persons_df = pd.DataFrame(persons_data, columns=person_columns).set_index('image_id')
    return images_df, persons_df

# Build one merged frame per split: image info joined to person annotations
# on the shared image_id index.
images_df, persons_df = convert_to_df(train_coco)
train_coco_df = images_df.merge(persons_df, left_index=True, right_index=True)

images_df, persons_df = convert_to_df(val_coco)
val_coco_df = images_df.merge(persons_df, left_index=True, right_index=True)

In [4]:
# Keep only usable training targets: rows that are not crowd annotations
# and that have at least one labelled keypoint.
train_coco_df = train_coco_df[
    (train_coco_df['is_crowd'] == 0) & (train_coco_df['num_keypoints'] > 0)
]

val_coco_df = val_coco_df[
    (val_coco_df['is_crowd'] == 0) & (val_coco_df['num_keypoints'] > 0)
]

149813
6352


In [5]:
# Normalizing transform defined in the DeepPose paper
# This moves pixel coordinates into a relative scale of -0.5 to 0.5
def N(y, b):
    """Return ``y`` shifted by ``b[0:2]`` and divided by ``b[2:4]``, per axis.

    The result is the two-element list
    ``[(y[0] - b[0]) / b[2], (y[1] - b[1]) / b[3]]``.
    NOTE(review): presumably ``b`` is (center_x, center_y, width, height), which
    is what makes the output span -0.5..0.5 - confirm against callers.
    """
    return [(y[axis] - b[axis]) * 1 / b[axis + 2] for axis in range(2)]

In [7]:
# Moves keypoints to the correct location after: cropping, scaling, and normalizing photos
def convertKeyPoints(df, dimensions, source, index):
    """Map one row's keypoints into the normalized frame of its resized bbox crop.

    Args:
        df: frame whose row ``index`` (positional) has a ``bbox`` entry
            ``[x, y, w, h]`` and a flat ``keypoints`` entry of ``x, y, v`` triples.
        dimensions: side length of the square crop the bbox is rescaled to.
        source: unused; kept so existing callers keep working.
        index: positional row index into ``df``.

    Returns:
        Flat float array of ``x, y, v`` triples. ``x`` and ``y`` are relative to
        the crop centre and divided by ``dimensions`` (so -0.5..0.5 inside the
        crop); ``v`` is 1 for keypoints that were marked ``v >= 2`` and fall
        inside the crop, otherwise the whole triple is 0.
    """
    row = df.iloc[index]
    # Truncated to integers, the same way the image crop truncates the bbox.
    bbox = np.array(row["bbox"]).astype(np.int64)

    bbox_tl_x = bbox[0]
    bbox_tl_y = bbox[1]
    w_original = bbox[2]
    h_original = bbox[3]

    # After resizing, the crop is a dimensions x dimensions square centred at
    # (dimensions/2, dimensions/2).
    bbox_rescale = [dimensions/2, dimensions/2, dimensions, dimensions]

    keypoints = np.array(row["keypoints"]).reshape(-1, 3)
    x = keypoints[:, 0]
    y = keypoints[:, 1]
    v = keypoints[:, 2]

    # Scale and move keypoints into crop pixel coordinates
    x = (x - bbox_tl_x) * dimensions / w_original
    y = (y - bbox_tl_y) * dimensions / h_original

    # Do normalization
    x = (x - bbox_rescale[0]) * 1 / bbox_rescale[2]
    y = (y - bbox_rescale[1]) * 1 / bbox_rescale[3]

    # Set visible keypoint v to 1 and nonvisible or not in image to 0
    not_visible = v < 2
    v = np.ones_like(v)

    # A keypoint outside the crop on EITHER axis is dropped entirely. The
    # previous code zeroed x twice and never zeroed y for keypoints that were
    # only outside horizontally, leaving a stale y next to v == 0.
    outside_bbox = (x < -0.5) | (x > 0.5) | (y < -0.5) | (y > 0.5)

    dropped = not_visible | outside_bbox
    x[dropped] = 0
    y[dropped] = 0
    v[dropped] = 0

    keypoints = np.column_stack((x, y, v)).reshape(-1)

    return keypoints

In [8]:
# Scales and crops images
def convertImg(df, dimensions, source, index):
    """Load one row's source image, crop it to the bbox and resize it to a square.

    Args:
        df: frame whose row ``index`` (positional) has a ``path`` and a ``bbox``
            entry ``[x, y, w, h]``.
        dimensions: side length of the square output image.
        source: split name used to build the folder ``data/{source}2017/``.
        index: positional row index into ``df``.

    Returns:
        ``uint8`` image array of size ``dimensions`` x ``dimensions``.
    """
    row = df.iloc[index]
    # Duplicate crops of the same photo are named by prefixing 'c' to the file
    # name, so only the LEADING 'c' characters are removed to recover the
    # source file. (replace("c", "") would also eat any 'c' inside the name.)
    original_name = row["path"].lstrip("c")
    img = image.imread(f'data/{source}2017/{original_name}')
    bbox = np.array(row["bbox"]).astype(np.int64)

    # Account for potential gray images by adding channels
    if len(img.shape) == 2 or img.shape[2] == 1:
        if (len(img.shape) == 2): img = np.expand_dims(img,-1)
        img = cv2.merge([img,img,img])

    # Crop image to bbox (rows are y, columns are x)
    img = img[bbox[1]:bbox[1]+bbox[3],bbox[0]:bbox[0]+bbox[2]]

    # Scale image
    img = cv2.resize(img, dsize=(dimensions, dimensions), interpolation=cv2.INTER_CUBIC)

    img = img.astype('uint8') # enforce int

    return img

In [9]:
# Start both final dataframes empty, with the same column layout.
train_df, val_df = (
    pd.DataFrame(columns = ['path', 'width', 'height', 'keypoints', 'num_keypoints', 'bbox'])
    for _ in range(2)
)

In [None]:
!mkdir -p train2017
!mkdir -p val2017

# Compute new keypoints for train_df
for row in range(len(train_coco_df)):
    df_row = train_coco_df.iloc[row]
    kps = convertKeyPoints(train_coco_df, 224, 'train', row)
    
    # Rename from the same path if paths matching
    same_path = train_df.loc[train_df['path'].str.contains(df_row['path'])]
    
    path = df_row['path']
    if len(same_path) > 0:
        path = 'c' + same_path.iloc[-1]['path']
        
    train_df = train_df.append(
        {
            'path' : path,
            'width' : 224,
            'height' : 224,
            'keypoints' : kps,
            'num_keypoints' : sum(1 for _ in filter(lambda score: score > -1, kps)),
            'bbox' : df_row['bbox']
        }
        , ignore_index = True)
    if row % 1000 == 0: print(f'row {row}')
    
# Compute new keypoints for val_df
for row in range(len(val_coco_df)):
    df_row = val_coco_df.iloc[row]
    kps = convertKeyPoints(val_coco_df, 224, 'val', row)
    
    # Rename from the same path if paths matching
    same_path = val_df.loc[val_df['path'].str.contains(df_row['path'])]
    
    path = df_row['path']
    if len(same_path) > 0:
        path = 'c' + same_path.iloc[-1]['path']
    
    val_df = val_df.append(
        {
            'path' : path,
            'width' : 224,
            'height' : 224,
            'keypoints' : kps,
            'num_keypoints' : sum(1 for _ in filter(lambda score: score > -1, kps)),
            'bbox' : df_row['bbox']
        }
        , ignore_index = True)
    if row % 1000 == 0: print(f'row {row}')
    
# Create new cropped files in data-2
for row in range(len(train_df)):
    df_row = train_df.iloc[row]
    img = convertImg(train_df, 224, 'train', row)
    cv2.imwrite(f'train2017/{train_df.iloc[row]["path"]}', cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    if row % 1000 == 0: print(f'row {row}')

for row in range(len(val_df)):
    df_row = val_df.iloc[row]
    img = convertImg(val_df, 224, 'val', row)
    cv2.imwrite(f'val2017/{val_df.iloc[row]["path"]}', cv2.cvtColor(img, cv2.COLOR_RGB2BGR))
    if row % 1000 == 0: print(f'row {row}')

In [None]:
# Remove bbox columns
# errors='ignore' makes this cell safe to re-run: `del df['bbox']` raises
# KeyError once the column is already gone.
train_df = train_df.drop(columns=['bbox'], errors='ignore')
val_df = val_df.drop(columns=['bbox'], errors='ignore')

In [None]:
# Print Statistics: row count of each split, then a preview of the training frame
for frame_size in (len(train_df), len(val_df)):
    print(frame_size)
print(train_df.head())

In [4]:
# Save pickle files and zip data
train_df.to_pickle('train_df.pkl')
val_df.to_pickle('val_df.pkl')

!mkdir data-2
!mv train2017 data-2
!mv val2017 data-2
!(cd data-2 && zip -r -q ../data.zip .)