# Preprocessing - Let's pick up images for pose detection

### Input
* COCO Dataset keypoint annotation json

### Output
* CSV for picked images

### Criteria
* Image has a single person (exactly one annotation)
* The person has more than 10 keypoints
* The person's annotation area is larger than 30000

In [12]:
%matplotlib inline
from pycocotools.coco import COCO
import skimage.io as io
import matplotlib.pyplot as plt
import numpy as np
import pylab
import os.path
import pandas as pd
# Default matplotlib figure size (width, height) for plots rendered in this notebook
pylab.rcParams['figure.figsize'] = (8.0, 10.0)


# If you get an error like "No module named pycocotools._mask", build pycocotools by running `make`.
# The cython package must be installed before building.

#### Set annotation file path

In [9]:
# Location of the COCO keypoint annotation json, relative to this notebook
annFolder = "../annotations"
annFile = "person_keypoints_val2017.json"
annPath = f"{annFolder}/{annFile}"
print(annPath)
# Last expression of the cell: displays True when the annotation file exists
os.path.isfile(annPath)

../annotations/person_keypoints_val2017.json


True

#### Initialize COCO class for annotation

In [11]:
# Load the annotation json and build the COCO lookup index
coco = COCO(annotation_file=annPath)

loading annotations into memory...
Done (t=0.20s)
creating index...
index created!


#### Filter step 1 : Get all image ids of person

In [13]:
# Category id used for "person" (predefined id)
personID = 1
# Ids of every image that contains the person category, then their image records
personImageIds = coco.getImgIds(catIds=[personID])
personImages = coco.loadImgs(ids=personImageIds)
# Number of person images
len(personImages)

2693

#### Filter step 2 : Get keypoints of person images

In [14]:
# Annotation ids attached to the person images, then the annotation records themselves
keypointsIds = coco.getAnnIds(imgIds=personImageIds)
keypoints = coco.loadAnns(ids=keypointsIds)
# Number of annotations
len(keypoints)

11004

#### Filter step 3 : Get keypoints by my criteria
There are more keypoint annotations than images.  
That is, some images have multiple keypoint annotations (more than one person).  
We don't need images like that.     
Let's filter them out.  

In [19]:
# The keypoint annotations are filtered with a pandas DataFrame below.
# Only three scalar (non-collection) fields are needed for that:
#   image_id / num_keypoints / area
# Every other annotation property is dropped here.
# Maps DataFrame column name -> annotation field name (order defines column order).
columnToField = {"imageID": "image_id", "numKeypoints": "num_keypoints", "area": "area"}
temp = [
    {column: kp[field] for column, field in columnToField.items()}
    for kp in keypoints
]

In [68]:
# One row per annotation; 'freq' is the number of annotations that share the row's image
df = pd.DataFrame(temp).assign(
    freq=lambda frame: frame.groupby('imageID')['imageID'].transform('count')
)
df

Unnamed: 0,imageID,numKeypoints,area,freq
0,532481,12,2188.08650,2
1,532481,0,153.10855,2
2,458755,13,117599.51490,4
3,458755,2,2721.80025,4
4,458755,1,1222.94590,4
...,...,...,...,...
10999,98287,0,75.49305,14
11000,98287,0,163.00000,14
11001,417779,0,452.51520,1
11002,24567,11,151026.62905,1


In [69]:
# Keep rows whose image has exactly one annotation, i.e. only one person in the image
isOnlyPerson = df['freq'].eq(1)
df = df[isOnlyPerson]
df

Unnamed: 0,imageID,numKeypoints,area,freq
6,385029,1,18704.69050,1
7,311303,1,14862.27685,1
8,393226,15,4010.80445,1
9,532493,16,7817.26120,1
16,188439,0,260.12070,1
...,...,...,...,...
10973,147415,6,94001.96655,1
10974,270297,0,333.61380,1
11001,417779,0,452.51520,1
11002,24567,11,151026.62905,1


In [70]:
# Thresholds for the remaining criteria (both are strict lower bounds)
minArea = 30000      # person size is big enough
minKeypoints = 10    # person has enough keypoints

isBigEnough = df['area'] > minArea
hasEnoughKeypoints = df['numKeypoints'] > minKeypoints
df = df[isBigEnough & hasEnoughKeypoints]
df

Unnamed: 0,imageID,numKeypoints,area,freq
21,401446,17,41409.94485,1
22,213033,14,58468.43390,1
119,32861,15,61470.33910,1
125,311394,12,204684.81615,1
146,270474,15,30447.81350,1
...,...,...,...,...
10939,327601,17,35009.52795,1
10958,327617,17,65808.03670,1
10959,442306,16,44533.69385,1
11002,24567,11,151026.62905,1


In [71]:
# Drop the temporary columns that were only needed for filtering.
# errors='ignore' keeps this cell re-runnable: once the columns are gone, executing
# the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=['area', 'freq', 'numKeypoints'], errors='ignore')
df

Unnamed: 0,imageID
21,401446
22,213033
119,32861
125,311394
146,270474
...,...
10939,327601
10958,327617
10959,442306
11002,24567


In [72]:
# Filtering is done. Add the filename column.
# The filename is the image id zero-padded to 12 digits, followed by ".jpg"
# (16 characters in total), e.g. 000000401446.jpg
df['filename'] = df['imageID'].map(lambda imageId: f"{imageId:012d}.jpg")
df

Unnamed: 0,imageID,filename
21,401446,000000401446.jpg
22,213033,000000213033.jpg
119,32861,000000032861.jpg
125,311394,000000311394.jpg
146,270474,000000270474.jpg
...,...,...
10939,327601,000000327601.jpg
10958,327617,000000327617.jpg
10959,442306,000000442306.jpg
11002,24567,000000024567.jpg


#### To CSV file

In [73]:
# Write the picked images to CSV, without the DataFrame index column
outputCsvPath = './inputImages.csv'
df.to_csv(path_or_buf=outputCsvPath, index=False)