In [8]:
import pandas as pd
import numpy as np
import os

## first, let's create a few different randomly-generated datasets

In [14]:
def generate_dataset(nb_categories, nb_exemplars, nb_textures, nb_colors):
    """
    """
    # keep a running list of the dataframe chunks
    df_chunks = []
    # intialize labels
    labels = []
    for cat in range(nb_categories):
        # create placeholder array for this segment of the data
        arr = np.zeros((nb_exemplars, 3), dtype=np.int16)
        # shape is always correlated with the category
        arr[:,0] = cat
        # color and texture are generated at random
        arr[:,1] = np.random.choice(range(nb_textures), nb_exemplars, replace=True)
        arr[:,2] = np.random.choice(range(nb_colors), nb_exemplars, replace=True)
        # create a dataframe chunk with arr, append it to the list of chunks
        df_chunks.append(pd.DataFrame(arr, columns=['shape', 'color', 'texture'], dtype=np.int16))
        # add to labels
        labels.extend(['obj%i' % cat for i in range(nb_exemplars)])
    # concatenate df chunks, turn labels list into series, return
    return pd.concat(df_chunks), pd.Series(labels)

In [15]:
df, labels = generate_dataset(nb_categories=10, nb_exemplars=2, nb_textures=10, nb_colors=10)

In [16]:
df

Unnamed: 0,shape,color,texture
0,0,2,7
1,0,0,5
0,1,0,7
1,1,3,1
0,2,7,4
1,2,6,0
0,3,1,3
1,3,5,7
0,4,8,5
1,4,0,1


In [17]:
labels

0     obj0
1     obj0
2     obj1
3     obj1
4     obj2
5     obj2
6     obj3
7     obj3
8     obj4
9     obj4
10    obj5
11    obj5
12    obj6
13    obj6
14    obj7
15    obj7
16    obj8
17    obj8
18    obj9
19    obj9
dtype: object

In [18]:
df_path = os.path.realpath('../data/objects1.csv')
labels_path = os.path.realpath('../data/labels1.csv')
df.to_csv(df_path, index=False)
labels.to_csv(labels_path, index=False)

## Let's repeat a few more times with different dataset parameters

In [20]:
df, labels = generate_dataset(nb_categories=100, nb_exemplars=5, nb_textures=20, nb_colors=20)
df_path = os.path.realpath('../data/objects2.csv')
labels_path = os.path.realpath('../data/labels2.csv')
df.to_csv(df_path, index=False)
labels.to_csv(labels_path, index=False)

In [21]:
df, labels = generate_dataset(nb_categories=1000, nb_exemplars=10, nb_textures=20, nb_colors=20)
df_path = os.path.realpath('../data/objects3.csv')
labels_path = os.path.realpath('../data/labels3.csv')
df.to_csv(df_path, index=False)
labels.to_csv(labels_path, index=False)

## Finally, let's create the exact dataset from Smith et al.

In [3]:
same_objects_each_week = True # whether or not to use the same 4 objects each week (unclear if paper does)
names = []
shapes = []
colors = []
textures = []
# 7 weeks of training
for week in range(7):
    # each week there are 4 objects
    for obj in range(4):
        # both instances of this object have the same name 
        # and same shape
        for i in range(2):
            if same_objects_each_week:
                names.append('obj%i' % obj)
                shapes.append(obj)
            else:
                names.append('week%i_obj%i' % (week, obj))
                shapes.append(week*4+obj)
        # assuming we have 10 colors, lets sample 2 random
        # colors w/out replacement to ensure that they are
        # different accross instances of the object
        c = list(np.random.choice(range(10), 2, replace=False))
        colors.extend(c)
        # same for textures
        t = list(np.random.choice(range(10), 2, replace=False))
        textures.extend(t)

df = pd.DataFrame()
df['name'] = names
df['shape'] = shapes
df['color'] = colors
df['textures'] = textures

In [4]:
df

Unnamed: 0,name,shape,color,textures
0,obj0,0,6,0
1,obj0,0,2,1
2,obj1,1,1,6
3,obj1,1,7,4
4,obj2,2,5,9
5,obj2,2,4,2
6,obj3,3,1,9
7,obj3,3,3,0
8,obj0,0,8,4
9,obj0,0,0,3


In [8]:
import os
store_path = os.path.realpath('../data/smith_objects.csv')
df.to_csv(store_path, index=False)