In [1]:
# imports.py

import glob
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
import PIL
## https://github.com/keras-team/keras/issues/5475
from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True
## end of import for image load error fix
from matplotlib.pyplot import imshow
import random
import shutil
import cv2
%matplotlib inline

In [13]:
def prep_category_dataset(dataset_dir, dest_dir):
    category_dict = {}
    for image_f in glob.glob("%s/*" % dataset_dir):
        category = "_".join(os.path.basename(image_f).split("_")[:-1])
        if category not in category_dict:
            category_dict[category] = 1
            os.makedirs(os.path.join(dest_dir, category), exist_ok=True)
        else:
            category_dict[category] += 1
        shutil.move(image_f, os.path.join(dest_dir, category, os.path.basename(image_f)))
    return category_dict

In [14]:
data_dir = "../datasets/Stanford40_actions/JPEGImages/"
dest_dir = "../datasets/Stanford40_actions/images/"
len(glob.glob("%s/*" % data_dir))

9532

In [15]:
glob.glob("%s/*" % data_dir)[:20]

['../datasets/Stanford40_actions/JPEGImages/jumping_245.jpg',
 '../datasets/Stanford40_actions/JPEGImages/cooking_257.jpg',
 '../datasets/Stanford40_actions/JPEGImages/blowing_bubbles_199.jpg',
 '../datasets/Stanford40_actions/JPEGImages/watching_TV_130.jpg',
 '../datasets/Stanford40_actions/JPEGImages/blowing_bubbles_092.jpg',
 '../datasets/Stanford40_actions/JPEGImages/phoning_210.jpg',
 '../datasets/Stanford40_actions/JPEGImages/texting_message_139.jpg',
 '../datasets/Stanford40_actions/JPEGImages/waving_hands_066.jpg',
 '../datasets/Stanford40_actions/JPEGImages/taking_photos_043.jpg',
 '../datasets/Stanford40_actions/JPEGImages/fishing_093.jpg',
 '../datasets/Stanford40_actions/JPEGImages/playing_guitar_168.jpg',
 '../datasets/Stanford40_actions/JPEGImages/smoking_116.jpg',
 '../datasets/Stanford40_actions/JPEGImages/fishing_056.jpg',
 '../datasets/Stanford40_actions/JPEGImages/walking_the_dog_240.jpg',
 '../datasets/Stanford40_actions/JPEGImages/fixing_a_car_144.jpg',
 '../datase

In [16]:
"_".join(os.path.basename('../datasets/Stanford40_actions/JPEGImages/holding_an_umbrella_248.jpg').split("_")[:-1])

'holding_an_umbrella'

In [17]:
cats = prep_category_dataset(dataset_dir=data_dir, dest_dir=dest_dir)

In [18]:
cats

{'jumping': 295,
 'cooking': 288,
 'blowing_bubbles': 259,
 'watching_TV': 223,
 'phoning': 259,
 'texting_message': 193,
 'waving_hands': 210,
 'taking_photos': 197,
 'fishing': 273,
 'playing_guitar': 289,
 'smoking': 241,
 'walking_the_dog': 293,
 'fixing_a_car': 251,
 'fixing_a_bike': 228,
 'holding_an_umbrella': 292,
 'cutting_vegetables': 189,
 'throwing_frisby': 202,
 'looking_through_a_telescope': 203,
 'rowing_a_boat': 185,
 'washing_dishes': 182,
 'reading': 245,
 'writing_on_a_book': 246,
 'pouring_liquid': 200,
 'writing_on_a_board': 183,
 'brushing_teeth': 200,
 'gardening': 199,
 'playing_violin': 260,
 'using_a_computer': 230,
 'running': 251,
 'applauding': 284,
 'pushing_a_cart': 235,
 'riding_a_bike': 293,
 'cutting_trees': 203,
 'cleaning_the_floor': 212,
 'drinking': 256,
 'feeding_a_horse': 287,
 'riding_a_horse': 296,
 'climbing': 295,
 'looking_through_a_microscope': 191,
 'shooting_an_arrow': 214}

In [19]:
def create_category_dataframes(dataset_dir):
    """
        create dataframes with relative image url, action tag per row
        relative directories will be created as follows [this is needed for custom image classifier]
        
        images/whiteboard/0068.JPG receipt
        images/whiteboard/0069.JPG receipt
        images/whiteboard/0070.JPG receipt
        images/receipt/IMG_2118.JPG whiteboard
        images/receipt/IMG_2119.JPG whiteboard
        images/other/IMG_005.JPG other
        images/other/IMG_006.JPG other
        
        ....
    """
    category_list = []
    for cat_dir in glob.glob("%s/*" % dataset_dir):
        category = os.path.basename(cat_dir)
        rel_dirname = "images"
        for image in glob.glob("%s/%s/*" % (dataset_dir, category)):
            im_name = os.path.basename(image)
            category_list.append(["%s/%s/%s" % (rel_dirname, category, im_name), category])
    df_cat = pd.DataFrame(category_list, columns=['image_rel_path', 'action'])
    df_cat.to_csv("stanford40_dataset.csv", sep='\t', encoding='utf-8', index=False)
    return df_cat

In [20]:
_df_cat = create_category_dataframes(dest_dir)

In [22]:
_df_cat['action'].value_counts()

riding_a_horse                  296
climbing                        295
jumping                         295
walking_the_dog                 293
riding_a_bike                   293
holding_an_umbrella             292
playing_guitar                  289
cooking                         288
feeding_a_horse                 287
applauding                      284
fishing                         273
playing_violin                  260
phoning                         259
blowing_bubbles                 259
drinking                        256
fixing_a_car                    251
running                         251
writing_on_a_book               246
reading                         245
smoking                         241
pushing_a_cart                  235
using_a_computer                230
fixing_a_bike                   228
watching_TV                     223
shooting_an_arrow               214
cleaning_the_floor              212
waving_hands                    210
looking_through_a_telescope 

In [23]:
# sample across categories with min samples (=2) per category
_df_cat.groupby('action', group_keys=False).apply(lambda x: x.sample(min(len(x), 1)))

Unnamed: 0,image_rel_path,action
7143,images/applauding/applauding_116.jpg,applauding
157,images/blowing_bubbles/blowing_bubbles_175.jpg,blowing_bubbles
7936,images/brushing_teeth/brushing_teeth_143.jpg,brushing_teeth
1533,images/cleaning_the_floor/cleaning_the_floor_0...,cleaning_the_floor
1956,images/climbing/climbing_107.jpg,climbing
6246,images/cooking/cooking_261.jpg,cooking
5118,images/cutting_trees/cutting_trees_013.jpg,cutting_trees
3562,images/cutting_vegetables/cutting_vegetables_1...,cutting_vegetables
7624,images/drinking/drinking_163.jpg,drinking
1108,images/feeding_a_horse/feeding_a_horse_230.jpg,feeding_a_horse


In [41]:
# dir naming convention is n_{images per category}_k_{images per category threshold in entire dataset}
train_dir = "../datasets/Stanford40_actions/fastai/train/"
eval_dir = "../datasets/Stanford40_actions/fastai/valid/"
test_dir = "../datasets/Stanford40_actions/fastai/test/"
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)
os.makedirs(eval_dir, exist_ok=True)

In [25]:
# pick 50 samples per category for training
train_min_sample_per_cat = 50
train_50_df = _df_cat.groupby('action', group_keys=False).apply(lambda x: x.sample(min(len(x), train_min_sample_per_cat)))

# remove already sampled ones
res_df = _df_cat[~_df_cat.isin(train_50_df)].dropna()

# pick 50 samples per category for validation
val_min_sample_per_cat = 50
val_50_df = res_df.groupby('action', group_keys=False).apply(lambda x: x.sample(min(len(x), val_min_sample_per_cat)))

# remove already sampled ones
res_df = res_df[~res_df.isin(val_50_df)].dropna()

# res_df becomes your holdout set
res_df.count()

image_rel_path    5532
action            5532
dtype: int64

In [26]:
train_50_df.count()

image_rel_path    2000
action            2000
dtype: int64

In [28]:
train_50_df['action'].value_counts()

reading                         50
pouring_liquid                  50
shooting_an_arrow               50
fixing_a_car                    50
writing_on_a_board              50
jumping                         50
using_a_computer                50
fixing_a_bike                   50
climbing                        50
cutting_vegetables              50
watching_TV                     50
riding_a_horse                  50
brushing_teeth                  50
phoning                         50
looking_through_a_microscope    50
playing_violin                  50
smoking                         50
riding_a_bike                   50
throwing_frisby                 50
taking_photos                   50
playing_guitar                  50
writing_on_a_book               50
blowing_bubbles                 50
gardening                       50
cutting_trees                   50
drinking                        50
waving_hands                    50
fishing                         50
looking_through_a_te

In [31]:
val_50_df.count()

image_rel_path    2000
action            2000
dtype: int64

In [32]:
val_50_df['action'].value_counts()

reading                         50
pouring_liquid                  50
shooting_an_arrow               50
fixing_a_car                    50
writing_on_a_board              50
jumping                         50
using_a_computer                50
fixing_a_bike                   50
climbing                        50
cutting_vegetables              50
watching_TV                     50
riding_a_horse                  50
brushing_teeth                  50
phoning                         50
looking_through_a_microscope    50
playing_violin                  50
smoking                         50
riding_a_bike                   50
throwing_frisby                 50
taking_photos                   50
playing_guitar                  50
writing_on_a_book               50
blowing_bubbles                 50
gardening                       50
cutting_trees                   50
drinking                        50
waving_hands                    50
fishing                         50
looking_through_a_te

In [34]:
res_df['action'].value_counts()

riding_a_horse                  196
climbing                        195
jumping                         195
walking_the_dog                 193
riding_a_bike                   193
holding_an_umbrella             192
playing_guitar                  189
cooking                         188
feeding_a_horse                 187
applauding                      184
fishing                         173
playing_violin                  160
phoning                         159
blowing_bubbles                 159
drinking                        156
fixing_a_car                    151
running                         151
writing_on_a_book               146
reading                         145
smoking                         141
pushing_a_cart                  135
using_a_computer                130
fixing_a_bike                   128
watching_TV                     123
shooting_an_arrow               114
cleaning_the_floor              112
waving_hands                    110
looking_through_a_telescope 

In [36]:
# save train dataset
train_50_df.to_csv("../datasets/Stanford40_actions/train50.txt", sep='\t', index=False, header=None)


In [37]:
# save validation dataset
val_50_df.to_csv("../datasets/Stanford40_actions/val50.txt", sep='\t', index=False, header=None)


In [38]:
# save test dataset
res_df.to_csv("../datasets/Stanford40_actions/test50.txt", sep='\t', index=False, header=None)


In [44]:
for index, row in train_50_df.iterrows():
    rel = row['image_rel_path']
    cat = os.path.dirname(rel).split("/")[1]
    im_name = os.path.basename(rel)
    os.makedirs(os.path.join(train_dir, cat), exist_ok=True)
    shutil.copyfile(os.path.join(dest_dir, cat, im_name), os.path.join(train_dir, cat, im_name))

In [45]:
# train image set
print(len(glob.glob("%s/*/*" % train_dir)))

# train image categories
print(len(glob.glob("%s/*" % train_dir)))

2000
40


In [46]:
for index, row in val_50_df.iterrows():
    rel = row['image_rel_path']
    cat = os.path.dirname(rel).split("/")[1]
    im_name = os.path.basename(rel)
    os.makedirs(os.path.join(eval_dir, cat), exist_ok=True)
    shutil.copyfile(os.path.join(dest_dir, cat, im_name), os.path.join(eval_dir, cat, im_name))

In [47]:
# eval image set
print(len(glob.glob("%s/*/*" % eval_dir)))

# eval image categories
print(len(glob.glob("%s/*" % eval_dir)))

2000
40


In [48]:
# finally create test dir from res_df
for index, row in res_df.iterrows():
    rel = row['image_rel_path']
    cat = os.path.dirname(rel).split("/")[1]
    im_name = os.path.basename(rel)
    os.makedirs(os.path.join(test_dir, cat), exist_ok=True)
    shutil.copyfile(os.path.join(dest_dir, cat, im_name), os.path.join(test_dir, cat, im_name))

In [49]:
# test image set
print(len(glob.glob("%s/*/*" % test_dir)))

# test image categories
print(len(glob.glob("%s/*" % test_dir)))

5532
40


In [50]:
# sanity checks on number of categories
assert len(glob.glob("%s/*" % train_dir)) == len(glob.glob("%s/*" % eval_dir))
assert len(glob.glob("%s/*" % train_dir)) == len(glob.glob("%s/*" % test_dir))