In [1]:
import pandas as pd
import numpy as np
import os

## First, let's create a few different randomly generated datasets

In [2]:
def generate_dataset(nb_categories, nb_exemplars, nb_textures, nb_colors):
    """
    Generate a random dataset of objects described by shape, color and texture.

    Each category contributes `nb_exemplars` rows. All exemplars of a category
    share the same shape value (the category index) and the same label
    ('obj<category>'). Color and texture are drawn independently at random,
    with replacement, from numpy's global random state.

    Parameters
    ----------
    nb_categories : int
        Number of categories; shape values are 0 .. nb_categories-1.
    nb_exemplars : int
        Number of rows generated per category.
    nb_textures : int
        Texture values are drawn from range(nb_textures).
    nb_colors : int
        Color values are drawn from range(nb_colors).

    Returns
    -------
    (pd.DataFrame, pd.Series)
        A frame with int16 columns ['shape', 'color', 'texture'] and a series
        holding one label string per row. The per-category chunks are
        concatenated as-is, so the frame's index restarts at 0 for every
        category, while the label series has a plain 0..N-1 index.
    """
    columns = ['shape', 'color', 'texture']
    # keep a running list of the dataframe chunks
    df_chunks = []
    # initialize labels
    labels = []
    for cat in range(nb_categories):
        # create placeholder array for this segment of the data
        arr = np.zeros((nb_exemplars, len(columns)), dtype=np.int16)
        # shape is always correlated with the category
        arr[:, 0] = cat
        # color and texture are generated at random; the array column order
        # must match `columns` (1 -> color, 2 -> texture)
        arr[:, 1] = np.random.choice(range(nb_colors), nb_exemplars, replace=True)
        arr[:, 2] = np.random.choice(range(nb_textures), nb_exemplars, replace=True)
        # create a dataframe chunk with arr, append it to the list of chunks
        df_chunks.append(pd.DataFrame(arr, columns=columns, dtype=np.int16))
        # every exemplar of this category gets the same label
        labels.extend(['obj%i' % cat] * nb_exemplars)
    # concatenate df chunks, turn labels list into series, return
    return pd.concat(df_chunks), pd.Series(labels)

In [3]:
# First dataset: 100 categories x 5 exemplars each, with color and texture
# each drawn from 200 possible values.
df, labels = generate_dataset(nb_categories=100, nb_exemplars=5, nb_textures=200, nb_colors=200)

In [4]:
# Show the generated feature table (shape / color / texture columns).
df

Unnamed: 0,shape,color,texture
0,0,42,180
1,0,33,190
2,0,186,55
3,0,189,165
4,0,56,152
0,1,13,3
1,1,24,113
2,1,175,83
3,1,126,182
4,1,140,147


In [7]:
# Scratch 4x3 integer block used below to check scalar broadcasting
# (presumably a prototype of the template built in synthesize_new_data).
a = np.array([
    [100, 200, 200],
    [100, 201, 201],
    [101, 200, 202],
    [102, 202, 200],
])

In [8]:
# Inspect the scratch array.
a

array([[100, 200, 200],
       [100, 201, 201],
       [101, 200, 202],
       [102, 202, 200]])

In [9]:
# Adding a scalar broadcasts over every element, shifting the whole block by 3.
a+3

array([[103, 203, 203],
       [103, 204, 204],
       [104, 203, 205],
       [105, 205, 203]])

In [23]:
def synthesize_new_data(nb_categories, nb_textures, nb_colors):
    """
    Build a deterministic dataset of `nb_categories` groups of 4 rows each.

    A 4x3 template of (shape, color, texture) values is built from the
    arguments: its first row is (nb_categories, nb_colors, nb_textures) and
    the other three rows offset individual features by 1 or 2. Group k
    is the template with 3*k added to every entry, and all 4 rows of group k
    receive the integer label nb_categories + 3*k.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        A frame with columns ['shape', 'color', 'texture'] whose index restarts
        at 0 for every group (groups are concatenated as-is), and a series
        with one integer label per row.
    """
    column_names = ['shape', 'color', 'texture']
    template = np.asarray([
        [nb_categories,     nb_colors,     nb_textures],
        [nb_categories,     nb_colors + 1, nb_textures + 1],
        [nb_categories + 1, nb_colors,     nb_textures + 2],
        [nb_categories + 2, nb_colors + 2, nb_textures],
    ])
    rows_per_group = 4
    # one frame per group: the template shifted by 3 per group index
    frames = [
        pd.DataFrame(template + 3 * group, columns=column_names)
        for group in range(nb_categories)
    ]
    # every row of a group shares that group's label
    group_labels = []
    for group in range(nb_categories):
        group_labels.extend([nb_categories + 3 * group] * rows_per_group)
    return pd.concat(frames), pd.Series(group_labels)

In [24]:
# Positional arguments are (nb_categories, nb_textures, nb_colors).
df_new, labels_new = synthesize_new_data(100, 200, 200)

In [25]:
# Show the synthesized feature table.
df_new

Unnamed: 0,shape,color,texture
0,100,200,200
1,100,201,201
2,101,200,202
3,102,202,200
0,103,203,203
1,103,204,204
2,104,203,205
3,105,205,203
0,106,206,206
1,106,207,207


In [26]:
# Show the synthesized labels (one integer label per row, 4 rows per group).
labels_new

0      100
1      100
2      100
3      100
4      103
5      103
6      103
7      103
8      106
9      106
10     106
11     106
12     109
13     109
14     109
15     109
16     112
17     112
18     112
19     112
20     115
21     115
22     115
23     115
24     118
25     118
26     118
27     118
28     121
29     121
      ... 
370    376
371    376
372    379
373    379
374    379
375    379
376    382
377    382
378    382
379    382
380    385
381    385
382    385
383    385
384    388
385    388
386    388
387    388
388    391
389    391
390    391
391    391
392    394
393    394
394    394
395    394
396    397
397    397
398    397
399    397
Length: 400, dtype: int64

In [18]:
# Save dataset 1. The relative paths are resolved against the current working
# directory; index=False leaves the row index out of the CSV files.
df_path = os.path.realpath('../data/objects1.csv')
labels_path = os.path.realpath('../data/labels1.csv')
df.to_csv(df_path, index=False)
labels.to_csv(labels_path, index=False)

## Let's repeat a few more times with different dataset parameters

In [20]:
# Dataset 2: same number of categories and exemplars as dataset 1, but only
# 20 possible textures and 20 possible colors. Note: rebinds `df` and `labels`.
df, labels = generate_dataset(nb_categories=100, nb_exemplars=5, nb_textures=20, nb_colors=20)
df_path = os.path.realpath('../data/objects2.csv')
labels_path = os.path.realpath('../data/labels2.csv')
# index=False leaves the row index out of the CSV files
df.to_csv(df_path, index=False)
labels.to_csv(labels_path, index=False)

In [21]:
# Dataset 3: 1000 categories x 10 exemplars each, with 20 possible textures
# and 20 possible colors. Note: rebinds `df` and `labels`.
df, labels = generate_dataset(nb_categories=1000, nb_exemplars=10, nb_textures=20, nb_colors=20)
df_path = os.path.realpath('../data/objects3.csv')
labels_path = os.path.realpath('../data/labels3.csv')
# index=False leaves the row index out of the CSV files
df.to_csv(df_path, index=False)
labels.to_csv(labels_path, index=False)

## Finally, let's create the exact dataset from Smith et al.

In [3]:
# Toggle: reuse the same 4 objects every week, or give each week its own
# 4 objects (unclear which of the two the paper does).
same_objects_each_week = True
names, shapes, colors, textures = [], [], [], []
for week in range(7):  # 7 weeks of training
    for obj in range(4):  # each week there are 4 objects
        # both instances of an object share one name and one shape id
        if same_objects_each_week:
            obj_name, obj_shape = 'obj%i' % obj, obj
        else:
            obj_name, obj_shape = 'week%i_obj%i' % (week, obj), week*4+obj
        names += [obj_name, obj_name]
        shapes += [obj_shape, obj_shape]
        # assuming 10 colors: draw 2 without replacement so the two
        # instances of the object are guaranteed to differ in color
        colors += list(np.random.choice(range(10), 2, replace=False))
        # same scheme for textures (drawn after the colors)
        textures += list(np.random.choice(range(10), 2, replace=False))

# assemble one row per object instance
df = pd.DataFrame(
    {'name': names, 'shape': shapes, 'color': colors, 'textures': textures},
    columns=['name', 'shape', 'color', 'textures'],
)

In [4]:
# Show the Smith et al. style dataset (name / shape / color / textures columns).
df

Unnamed: 0,name,shape,color,textures
0,obj0,0,6,0
1,obj0,0,2,1
2,obj1,1,1,6
3,obj1,1,7,4
4,obj2,2,5,9
5,obj2,2,4,2
6,obj3,3,1,9
7,obj3,3,3,0
8,obj0,0,8,4
9,obj0,0,0,3


In [8]:
# Save the Smith et al. dataset. `os` is already imported in the notebook's
# top import cell, so it is not re-imported here. The relative path is
# resolved against the current working directory; index=False leaves the row
# index out of the CSV.
store_path = os.path.realpath('../data/smith_objects.csv')
df.to_csv(store_path, index=False)