# SPNet: Proposed Splits

## COCO-Stuff

In [1]:
import numpy as np
import pickle
import torch
import torch.nn.functional as F
import os.path as osp

# --- COCO-Stuff configuration -------------------------------------------------
# Root folder of the dataset files and the number of trailing entries of
# train_images that are held out as the validation subset.
datadir = 'data/datasets/cocostuff'
val_size = 2000

# Class names: column 1 of the tab-separated labels file.
all_labels = np.genfromtxt(datadir+'/labels_2.txt', delimiter='\t', usecols=1, dtype='str')

# Class-index splits. train_classes is seen + seenval, cast to int so it can
# be used directly as an index into all_labels / class_emb.
seen_classes = np.load(datadir+'/split/seen_cls.npy')
seenval_classes = np.load(datadir+'/split/seenval_cls.npy')
train_classes = np.asarray(np.concatenate([seen_classes, seenval_classes]),dtype=int)
novel_classes = np.load(datadir+'/split/novel_cls.npy')

# Per-class word vectors from the two pickles, joined along the feature axis
# and then L2-normalised per row.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from
# a trusted source. The files are opened via `with` so the handles are closed
# deterministically instead of being left to the garbage collector.
with open(datadir+'/word_vectors/fasttext.pkl', "rb") as fh:
    fasttext_emb = pickle.load(fh)
with open(datadir+'/word_vectors/word2vec.pkl', "rb") as fh:
    word2vec_emb = pickle.load(fh)
class_emb = np.concatenate([fasttext_emb, word2vec_emb], axis = 1)
class_emb = F.normalize(torch.tensor(class_emb), p=2, dim=1)#.cuda()

# Image lists; the validation subset is the last val_size training entries.
train_images = np.load(datadir+'/split/train_list.npy')
val_images = train_images[-val_size:]
test_images = np.load(datadir+'/split/test_list.npy')

# Indexed by class id below; each entry holds positions into train_images.
with open(datadir+'/split/inverse_dict_train.pkl', 'rb') as fh:
    inverse_dict = pickle.load(fh)

In [2]:
# Split summary for COCO-Stuff: class counts per subset and the width of the
# class embedding. The trailing "\n" on the last entry leaves one blank line
# after the report.
split_summary = [
    "Total classes: {}".format(train_classes.size+novel_classes.size),
    "Training classes: {}+{}".format(seen_classes.size,seenval_classes.size),
    "Test classes: {}".format(novel_classes.size),
    "Embedding dimension: {}\n".format(class_emb.shape[1]),
]
for summary_line in split_summary:
    print(summary_line)

Total classes: 182
Training classes: 155+12
Test classes: 15
Embedding dimension: 600



In [3]:
# Training-side statistics for COCO-Stuff. The training list is cut at
# val_size entries from the end: everything before the cut vs. the tail.
train_emb_shape = class_emb[train_classes].numpy().shape
threshold_train_part = train_images[:-val_size]
threshold_val_part = train_images[-val_size:]
train_class_names = ", ".join(all_labels[train_classes])

print("Train embedding matrix dimension : {}".format(train_emb_shape))
print("# images for training: {}".format(train_images.size))
print("# train/val images for finding threshold: {}/{}".format(threshold_train_part.size, threshold_val_part.size))
print("Train class list : {}\n".format(train_class_names))

Train embedding matrix dimension : (167, 600)
# images for training: 118287
# train/val images for finding threshold: 116287/2000
Train class list : person, bicycle, car, airplane, bus, train, truck, boat, traffic light, fire hydrant, street sign, stop sign, parking meter, bench, bird, cat, dog, horse, sheep, bear, zebra, hat, backpack, umbrella, shoe, eye glasses, handbag, tie, skis, snowboard, sports ball, kite, baseball glove, surfboard, tennis racket, bottle, plate, cup, fork, knife, spoon, bowl, apple, sandwich, orange, broccoli, hot dog, pizza, donut, cake, couch, potted plant, bed, mirror, dining table, window, desk, toilet, door, tv, laptop, mouse, remote, cell phone, microwave, oven, toaster, sink, blender, book, clock, vase, teddy bear, hair drier, toothbrush, hair brush, banner, blanket, bridge, building-other, bush, cabinet, cage, carpet, ceiling-other, ceiling-tile, cloth, clothes, counter, cupboard, curtain, desk-stuff, dirt, door-stuff, fence, floor-marble, floor-other, 

In [4]:
# Test-side statistics for COCO-Stuff: embedding rows selected by the novel
# class indices, number of test images, and the novel class names.
novel_emb_shape = class_emb[novel_classes].numpy().shape
novel_class_names = ", ".join(all_labels[novel_classes])

print("Test embedding matrix dimension : {}".format(novel_emb_shape))
print("# images for testing: {}".format(test_images.size))
print("Test/novel class list : {}\n".format(novel_class_names))

Test embedding matrix dimension : (15, 600)
# images for testing: 5000
Test/novel class list : cow, giraffe, suitcase, frisbee, skateboard, carrot, scissors, cardboard, clouds, grass, playingfield, river, road, tree, wall-concrete



In [5]:
# List, for every novel class, the first `nshot` training images recorded for
# it in inverse_dict.
nshot = 2 # change this number to get the 1, 2, 5, 10 or 20-shot training set
# The header is built from nshot so it stays correct when the value is changed
# (it used to be hard-coded to "2-shot").
print("For {}-shot Few-label training we use these images:".format(nshot))
for key in novel_classes:
    # Classes with an empty entry in inverse_dict are skipped.
    if(inverse_dict[key].size >0):
        file_index = inverse_dict[key][0:nshot]
        class_name = all_labels[key]
        # Swap the last three characters of the file name for 'jpg'
        # (presumably a 3-letter extension -- confirm against train_list.npy).
        example_file = " ".join([osp.basename(train_images[fi])[:-3]+'jpg' for fi in file_index])
        # Class name is left-aligned and padded to 14 characters.
        print("{0: <14} : {1}".format(class_name, example_file))

For 2-shot Few-label training we use these images:
cow            : 000000438154.jpg 000000368205.jpg
giraffe        : 000000384230.jpg 000000073075.jpg
suitcase       : 000000136433.jpg 000000406245.jpg
frisbee        : 000000109503.jpg 000000489834.jpg
skateboard     : 000000458677.jpg 000000476827.jpg
carrot         : 000000298189.jpg 000000111549.jpg
scissors       : 000000089738.jpg 000000078478.jpg
cardboard      : 000000105564.jpg 000000107596.jpg
clouds         : 000000010222.jpg 000000294228.jpg
grass          : 000000378491.jpg 000000144904.jpg
playingfield   : 000000109971.jpg 000000233042.jpg
river          : 000000454610.jpg 000000318415.jpg
road           : 000000144904.jpg 000000161738.jpg
tree           : 000000378491.jpg 000000384230.jpg
wall-concrete  : 000000481298.jpg 000000141228.jpg


## PASCAL-VOC

In [6]:
# Imports are repeated so the PASCAL-VOC section can be run on its own.
# os.path is included because the few-shot listing cell of this section
# calls osp.basename.
import os.path as osp
import pickle

import numpy as np
import torch
import torch.nn.functional as F

# --- PASCAL-VOC configuration -------------------------------------------------
# Root folder of the dataset files and the number of trailing entries of
# train_images that are held out as the validation subset.
datadir = 'data/datasets/voc12'
val_size = 500

# Class names: column 1 of the tab-separated labels file.
all_labels = np.genfromtxt(datadir+'/labels_2.txt', delimiter='\t', usecols=1, dtype='str')

# Class-index splits. train_classes is seen + seenval, cast to int so it can
# be used directly as an index into all_labels / class_emb.
seen_classes = np.load(datadir+'/split/seen_cls.npy')
seenval_classes = np.load(datadir+'/split/seenval_cls.npy')
train_classes = np.asarray(np.concatenate([seen_classes, seenval_classes]),dtype=int)
novel_classes = np.load(datadir+'/split/novel_cls.npy')

# Per-class word vectors from the two pickles, joined along the feature axis
# and then L2-normalised per row.
# NOTE(review): pickle.load can execute arbitrary code -- only load files from
# a trusted source. The files are opened via `with` so the handles are closed
# deterministically instead of being left to the garbage collector.
with open(datadir+'/word_vectors/fasttext.pkl', "rb") as fh:
    fasttext_emb = pickle.load(fh)
with open(datadir+'/word_vectors/word2vec.pkl', "rb") as fh:
    word2vec_emb = pickle.load(fh)
class_emb = np.concatenate([fasttext_emb, word2vec_emb], axis = 1)
class_emb = F.normalize(torch.tensor(class_emb), p=2, dim=1)#.cuda()

# Image lists; the validation subset is the last val_size training entries.
train_images = np.load(datadir+'/split/train_list.npy')
val_images = train_images[-val_size:]
test_images = np.load(datadir+'/split/test_list.npy')

# Indexed by class id below; each entry holds positions into train_images.
with open(datadir+'/split/inverse_dict_train.pkl', 'rb') as fh:
    inverse_dict = pickle.load(fh)

In [7]:
# Split summary for PASCAL-VOC: class counts per subset and the width of the
# class embedding. The trailing "\n" on the last entry leaves one blank line
# after the report.
split_summary = [
    "Total classes: {}".format(seen_classes.size+seenval_classes.size+novel_classes.size),
    "Training classes: {}+{}".format(seen_classes.size,seenval_classes.size),
    "Test classes: {}".format(novel_classes.size),
    "Embedding dimension: {}\n".format(class_emb.shape[1]),
]
for summary_line in split_summary:
    print(summary_line)

Total classes: 20
Training classes: 12+3
Test classes: 5
Embedding dimension: 600



In [8]:
# Training-side statistics for PASCAL-VOC. The training list is cut at
# val_size entries from the end: everything before the cut vs. the tail.
train_emb_shape = class_emb[train_classes].numpy().shape
threshold_train_part = train_images[:-val_size]
threshold_val_part = train_images[-val_size:]
train_class_names = ", ".join(all_labels[train_classes])

print("Train embedding matrix dimension : {}".format(train_emb_shape))
print("# images for training: {}".format(train_images.size))
print("# train/val images for finding threshold: {}/{}".format(threshold_train_part.size, threshold_val_part.size))
print("Train class list : {}\n".format(train_class_names))

Train embedding matrix dimension : (15, 600)
# images for training: 11685
# train/val images for finding threshold: 11185/500
Train class list : aeroplane, bicycle, bird, boat, bottle, bus, car, cat, chair, cow, diningtable, dog, horse, motorbike, person



In [9]:
# Test-side statistics for PASCAL-VOC: embedding rows selected by the novel
# class indices, number of test images, and the novel class names.
novel_emb_shape = class_emb[novel_classes].numpy().shape
novel_class_names = ", ".join(all_labels[novel_classes])

print("Test embedding matrix dimension : {}".format(novel_emb_shape))
print("# images for testing: {}".format(test_images.size))
print("Test/novel class list : {}\n".format(novel_class_names))

Test embedding matrix dimension : (5, 600)
# images for testing: 1449
Test/novel class list : potted_plant, sheep, sofa, train, tv



In [10]:
# List, for every novel class, the first `nshot` training images recorded for
# it in inverse_dict.
# NOTE: relies on `osp` (os.path) having been imported earlier in the notebook.
nshot = 2 # change this number to get the 1, 2, 5, 10 or 20-shot training set
# The header is built from nshot so it stays correct when the value is changed
# (it used to be hard-coded to "2-shot").
print("For {}-shot Few-label training we use these images:".format(nshot))
for key in novel_classes:
    # Classes with an empty entry in inverse_dict are skipped.
    if(inverse_dict[key].size >0):
        file_index = inverse_dict[key][0:nshot]
        class_name = all_labels[key]
        # Swap the last three characters of the file name for 'jpg'
        # (presumably a 3-letter extension -- confirm against train_list.npy).
        example_file = " ".join([osp.basename(train_images[fi])[:-3]+'jpg' for fi in file_index])
        # Class name is left-aligned and padded to 14 characters.
        print("{0: <14} : {1}".format(class_name, example_file))

For 2-shot Few-label training we use these images:
potted_plant   : 2008_007973.jpg 2010_003302.jpg
sheep          : 2010_003806.jpg 2008_005635.jpg
sofa           : 2011_002184.jpg 2009_001605.jpg
train          : 2010_002930.jpg 2008_001105.jpg
tv             : 2008_005254.jpg 2011_001357.jpg
