In [1]:
'''
PURPOSE: 
split data set to obtain train, validation and test sets with non-overlapping images
'''
import os
import numpy as np
from random import randint
from scipy.misc import imread
import matplotlib.pyplot as plt

In [5]:
# read the complete data set from disk: x is indexed per image on its third
# axis by the split functions, y holds the matching actor labels
x, y = (np.load(npy_file) for npy_file in ("x.npy", "y.npy"))

In [3]:
def split_dataset(x,y,train_size,val_size,test_size,seed=None):
    """Randomly split every actor's images into disjoint train/val/test sets.

    For each distinct label in ``y`` (visited in the sorted order returned by
    ``np.unique``), ``train_size + val_size + test_size`` images are drawn
    without replacement, so no image ends up in more than one set.

    Parameters
    ----------
    x : array-like
        Image stack indexed as ``x[:, :, i]`` for image ``i``.
    y : array-like
        1-D labels; ``y[i]`` is the actor shown in ``x[:, :, i]``.
    train_size, val_size, test_size : int
        Number of images to draw *per actor* for each set (may be 0).
    seed : int, optional
        Seed for the random draw. ``None`` (default) gives a different split
        on every call; pass an int to make the split reproducible.

    Returns
    -------
    x_train, y_train, x_val, y_val, x_test, y_test : tuple of ndarray
        Image arrays are always 3-D with the images stacked on the third axis
        (an empty set has size 0 on that axis); label arrays are 1-D.

    Raises
    ------
    ValueError
        If a size is negative or an actor has fewer images than requested.
    """
    images = np.asarray(x)
    labels = np.asarray(y)

    sizes = (train_size, val_size, test_size)
    if min(sizes) < 0:
        raise ValueError("train_size, val_size and test_size must be non-negative")
    per_actor = sum(sizes)
    # positions at which one actor's shuffled indices are cut into train | val | test
    cuts = np.cumsum(sizes)[:-1]
    rng = np.random.RandomState(seed)

    chosen = ([], [], [])  # index arrays collected for train, val, test
    for actor in np.unique(labels):
        # Select by label match instead of a start:stop slice: the slice
        # silently excluded each actor's last image and only worked when an
        # actor's images were stored contiguously.
        candidates = np.flatnonzero(labels == actor)
        if candidates.size < per_actor:
            raise ValueError(
                "actor {} has {} images but {} were requested".format(
                    actor, candidates.size, per_actor))
        # One shuffle per actor; disjoint slices of it cannot overlap.
        drawn = rng.permutation(candidates)[:per_actor]
        for bucket, part in zip(chosen, np.split(drawn, cuts)):
            bucket.append(part)

    result = []
    for bucket in chosen:
        index = np.concatenate(bucket) if bucket else np.empty(0, dtype=int)
        # A single fancy-index per set replaces the repeated dstack/append.
        result.append(images[:, :, index])
        result.append(labels[index])
    return tuple(result)

In [9]:
# per actor: 70 training, 10 validation and 10 test images
x_train, y_train, x_val, y_val, x_test, y_test = split_dataset(
    x, y, train_size=70, val_size=10, test_size=10
)

In [None]:
# write each array of the split to its own .npy file
for out_path, out_array in (
    ("x_train.npy", x_train),
    ("y_train.npy", y_train),
    ("x_val.npy", x_val),
    ("y_val.npy", y_val),
    ("x_test.npy", x_test),
    ("y_test.npy", y_test),
):
    np.save(out_path, out_array)
# but we must pretend not to know y_val and y_test

In [6]:
#modified previous definition to create training, validation and test sets for specific actors
def split_dataset_for_actors(x,y,train_size,val_size,test_size,acts,seed=None):
    """Randomly split the images of selected actors into disjoint train/val/test sets.

    Only the actors listed in ``acts`` are used. They are de-duplicated and
    visited in the sorted order returned by ``np.unique``. For each of them
    ``train_size + val_size + test_size`` images are drawn without
    replacement, so no image ends up in more than one set.

    Parameters
    ----------
    x : array-like
        Image stack indexed as ``x[:, :, i]`` for image ``i``.
    y : array-like
        1-D labels; ``y[i]`` is the actor shown in ``x[:, :, i]``.
    train_size, val_size, test_size : int
        Number of images to draw *per actor* for each set (may be 0).
    acts : sequence
        Labels of the actors to include.
    seed : int, optional
        Seed for the random draw. ``None`` (default) gives a different split
        on every call; pass an int to make the split reproducible.

    Returns
    -------
    x_train, y_train, x_val, y_val, x_test, y_test : tuple of ndarray
        Image arrays are always 3-D with the images stacked on the third axis
        (an empty set has size 0 on that axis); label arrays are 1-D.

    Raises
    ------
    ValueError
        If a size is negative, or a requested actor has fewer images than
        requested (this includes an actor that does not occur in ``y``).
    """
    images = np.asarray(x)
    labels = np.asarray(y)

    sizes = (train_size, val_size, test_size)
    if min(sizes) < 0:
        raise ValueError("train_size, val_size and test_size must be non-negative")
    per_actor = sum(sizes)
    # positions at which one actor's shuffled indices are cut into train | val | test
    cuts = np.cumsum(sizes)[:-1]
    rng = np.random.RandomState(seed)

    chosen = ([], [], [])  # index arrays collected for train, val, test
    for actor in np.unique(acts):
        # Select by label match instead of a start:stop slice: the slice
        # silently excluded each actor's last image and only worked when an
        # actor's images were stored contiguously.
        candidates = np.flatnonzero(labels == actor)
        if candidates.size < per_actor:
            raise ValueError(
                "actor {} has {} images but {} were requested".format(
                    actor, candidates.size, per_actor))
        # One shuffle per actor; disjoint slices of it cannot overlap.
        drawn = rng.permutation(candidates)[:per_actor]
        for bucket, part in zip(chosen, np.split(drawn, cuts)):
            bucket.append(part)

    result = []
    for bucket in chosen:
        index = np.concatenate(bucket) if bucket else np.empty(0, dtype=int)
        # A single fancy-index per set replaces the repeated dstack/append.
        result.append(images[:, :, index])
        result.append(labels[index])
    return tuple(result)

In [10]:
# two-actor split: 70 training, 10 validation and 10 test images per actor
x_train0, y_train0, x_val0, y_val0, x_test0, y_test0 = split_dataset_for_actors(
    x, y,
    train_size=70, val_size=10, test_size=10,
    acts=["Steve Carell", "Alec Baldwin"],
)



In [13]:
# sanity check: show the image/label array shapes of every set in the two-actor split
for set_images, set_labels in (
    (x_train0, y_train0),
    (x_val0, y_val0),
    (x_test0, y_test0),
):
    print(set_images.shape , set_labels.shape)

((32, 32, 140), (140,))
((32, 32, 20), (20,))
((32, 32, 20), (20,))


In [314]:
# write each array of the two-actor split to its own .npy file
for out_path, out_array in (
    ("x_train0.npy", x_train0),
    ("y_train0.npy", y_train0),
    ("x_val0.npy", x_val0),
    ("y_val0.npy", y_val0),
    ("x_test0.npy", x_test0),
    ("y_test0.npy", y_test0),
):
    np.save(out_path, out_array)

In [17]:
# same two actors, but only 2 training images each; x1/y1 and x2/y2 receive
# the validation and test sets of this split
x_train1, y_train1, x1, y1, x2, y2 = split_dataset_for_actors(
    x, y,
    train_size=2, val_size=10, test_size=10,
    acts=["Steve Carell", "Alec Baldwin"],
)

In [18]:
# sanity check: show the shapes of the small training set (2 images for each
# of the 2 actors, so 4 images and 4 labels are expected)
print(x_train1.shape , y_train1.shape)

((32, 32, 4), (4,))


In [19]:
# only the small training set is written to disk
for out_path, out_array in (("x_train1.npy", x_train1), ("y_train1.npy", y_train1)):
    np.save(out_path, out_array)
# the validation and test sets of this split do not need to be saved