In [None]:
import os
from collections import defaultdict
import numpy as np
import pandas as pd
# Dataset locations. Each can be overridden with an environment variable so the
# notebook is not tied to a single machine; the defaults are the original paths.
# os.path.join(path, "") guarantees a trailing separator, so string
# concatenation such as `DATADIR + catdir` keeps producing valid paths.
DATADIR = os.path.join(
    os.environ.get(
        "EITZ2012_DATADIR",
        "/Users/robincheong/Documents/Stanford/CS231N/Project/data/eitz2012/",
    ),
    "",
)
# By default the CSV files are written into the data folder itself, i.e. next to
# the per-category sub-folders.
OUTPUTDIR = os.path.join(os.environ.get("EITZ2012_OUTPUTDIR", DATADIR), "")

In [None]:
def save_split_fps_txt(outputdir, split, data):
    ''' 
        Saves the train / val / test split filepaths into a .txt file for later use 
        Args:
            outputdir: path to the data folder in which the .txt file will be stored
            split: the split (train / val / test); the file is named "<split>.txt"
            data: iterable of items to store, written one per line
    '''
    # Build the target path from the outputdir argument (the previous version
    # referenced an undefined PREFIX name and ignored outputdir entirely).
    out_path = os.path.join(outputdir, f"{split}.txt")
    with open(out_path, 'w') as fp:
        fp.writelines(f"{item}\n" for item in data)
            
            
def get_categories(data_fps, category_map):
    '''
        Look up the mapped category value for every file path in a list
        Args:
            data_fps: list of file paths; the text before the first '/' is the category name
            category_map: a map from the category string to its int representation
        Returns:
            a list with one mapped value per entry of data_fps, in the same order
    '''
    # The category name is the leading path component of each file path.
    return [category_map[fp.split('/')[0]] for fp in data_fps]


def save_csv(data_fps, labels, outputdir, name):
    '''
        Saves the data + labels into outputdir as "<name>.csv"
        Args:
            data_fps: list of file paths, written to the "filepath" column
            labels: list of labels aligned with data_fps, written to the "label" column
            outputdir: folder in which the .csv file is created
            name: file name without the ".csv" extension
    '''
    df = pd.DataFrame(data={"filepath": data_fps, "label": labels})
    # os.path.join avoids a doubled separator when outputdir already ends with
    # one, and keeps an empty outputdir relative instead of pointing at "/".
    csv_path = os.path.join(outputdir, f"{name}.csv")
    df.to_csv(csv_path, sep=',', index=False)
    

In [None]:
## Create test set
N_TEST_PER_CATEGORY = 10  # sketches drawn from every category folder

test_set_fps = []
np.random.seed(42)
# os.listdir returns entries in arbitrary order; sorting makes the seeded draw
# reproducible across machines and runs.
for catdir in sorted(os.listdir(DATADIR)):
    catpath = os.path.join(DATADIR, catdir)
    # Only category folders hold sketches; skip plain files that live next to
    # them (e.g. CSVs written by an earlier run when OUTPUTDIR is DATADIR).
    if not os.path.isdir(catpath):
        continue
    sketches = sorted(os.listdir(catpath))
    chosen = np.random.choice(sketches, size=N_TEST_PER_CATEGORY, replace=False)
    chosen = [catdir + "/" + x for x in chosen]
    test_set_fps += chosen

# Build the category -> id map here, before its first use, so this cell runs on
# a fresh kernel. Sorting keeps the ids identical from one session to the next.
labels = sorted({fp.split('/')[0] for fp in test_set_fps})
labels_map = {label: val for val, label in enumerate(labels)}

test_set_labels = get_categories(test_set_fps, labels_map)
save_csv(test_set_fps, test_set_labels, OUTPUTDIR, "test")
print(len(test_set_labels))

In [None]:
## Construct validation set
N_VAL_PER_CATEGORY = 10  # sketches drawn from every category folder

val_set_fps = []
np.random.seed(42)
# Set lookup instead of scanning the whole test list for every sketch.
test_fps_lookup = set(test_set_fps)
# os.listdir returns entries in arbitrary order; sorting makes the seeded draw
# reproducible across machines and runs.
for catdir in sorted(os.listdir(DATADIR)):
    catpath = os.path.join(DATADIR, catdir)
    # Only category folders hold sketches; skip plain files that live next to
    # them (e.g. a test.csv written into the same folder).
    if not os.path.isdir(catpath):
        continue
    # Candidates are the sketches of this category not already in the test set.
    sketches = [
        x for x in sorted(os.listdir(catpath))
        if catdir + "/" + x not in test_fps_lookup
    ]
    chosen = np.random.choice(sketches, size=N_VAL_PER_CATEGORY, replace=False)
    chosen = [catdir + "/" + x for x in chosen]
    val_set_fps += chosen

print(len(val_set_fps))

In [None]:
# Sanity check: print the file paths that appear in both the validation and the
# test split (an empty set means the two splits do not overlap).
val_test_overlap = set(val_set_fps).intersection(test_set_fps)
print(val_test_overlap)

In [None]:
# Map every validation file path to its category id and write val.csv.
# `labels_map` has to be defined before this cell is run.
val_set_labels = get_categories(data_fps=val_set_fps, category_map=labels_map)
save_csv(
    data_fps=val_set_fps,
    labels=val_set_labels,
    outputdir=OUTPUTDIR,
    name="val",
)

In [None]:
## Construct train set
train_set_fps = []
# No sampling happens in this cell; the seed call is kept so the global RNG
# state after the cell is the same as before this rewrite.
np.random.seed(42)
# Set lookup instead of scanning both held-out lists for every sketch.
held_out_fps = set(test_set_fps) | set(val_set_fps)
# Sorted listings give the training rows a deterministic order.
for catdir in sorted(os.listdir(DATADIR)):
    catpath = os.path.join(DATADIR, catdir)
    # Only category folders hold sketches; skip plain files that live next to
    # them (e.g. test.csv / val.csv written into the same folder).
    if not os.path.isdir(catpath):
        continue
    # Everything that is in neither the test nor the validation split is training data.
    chosen = [
        catdir + "/" + x
        for x in sorted(os.listdir(catpath))
        if catdir + "/" + x not in held_out_fps
    ]
    train_set_fps += chosen

print(len(train_set_fps))

In [None]:
# Sanity check: the three splits must be pairwise disjoint. A three-way
# intersection is empty as soon as any single pair is disjoint, so it would hide
# e.g. a train/val leak; print the union of the pairwise overlaps instead
# (an empty set means no file path is shared between any two splits).
val_fps, test_fps, train_fps = set(val_set_fps), set(test_set_fps), set(train_set_fps)
print((val_fps & test_fps) | (val_fps & train_fps) | (test_fps & train_fps))

In [None]:
# Map every training file path to its category id and write train.csv.
# `labels_map` has to be defined before this cell is run.
train_set_labels = get_categories(data_fps=train_set_fps, category_map=labels_map)
save_csv(
    data_fps=train_set_fps,
    labels=train_set_labels,
    outputdir=OUTPUTDIR,
    name="train",
)

In [None]:
# Unique category names taken from the test-set file paths. Sorted so that the
# order (and therefore the integer id given to each category) is the same in
# every kernel session; the iteration order of a set of strings is not.
labels = sorted({fp.split('/')[0] for fp in test_set_fps})

In [None]:
# Category name -> integer id, where the id is the position of the name in `labels`.
labels_map = dict(zip(labels, range(len(labels))))

In [None]:
# Show the category name -> integer id mapping.
print(labels_map)