In [1]:
'''
PURPOSE:
Split the data set into train, validation and test sets with non-overlapping images.
This notebook creates the data sets used in parts 2, 3, 4 and 5.
'''
import os
import numpy as np
from random import randint, seed
#from scipy.misc import imread
import matplotlib.pyplot as plt

In [4]:
# Read the image stack (x) and its labels (y) from the .npy files in the working directory.
x = np.load(file="x.npy")
y = np.load(file="y.npy")

In [5]:
def split_dataset(x, y, train_size, val_size, test_size, rng_seed=5000):
    """Split an image stack into train, validation and test sets per actor.

    For every distinct label in ``y``, ``train_size + val_size + test_size``
    images carrying that label are drawn without replacement, so no image
    ends up in more than one of the returned sets.

    Parameters
    ----------
    x : array
        Image stack in which image ``i`` is ``x[:, :, i]``.
    y : array-like
        Labels; ``y[i]`` is the label (actor) of image ``i``.
    train_size, val_size, test_size : int
        Number of images to draw per actor for each set (may be 0).
    rng_seed : int or None, optional
        Seed applied once per call to Python's global ``random`` generator so
        that the split is reproducible. ``None`` leaves the generator state
        untouched.

    Returns
    -------
    x_train, y_train, x_val, y_val, x_test, y_test
        Image stacks (images along the third axis) and matching label arrays.
        Within each set the images are grouped actor by actor, in the sorted
        order produced by ``np.unique(y)``.

    Raises
    ------
    ValueError
        If a size is negative or an actor has fewer images than requested.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    sizes = (train_size, val_size, test_size)
    if min(sizes) < 0:
        raise ValueError("train_size, val_size and test_size must be non-negative")
    needed = sum(sizes)

    # Seed once per call. Re-seeding before every draw (as was done before)
    # makes each draw a fixed function of the pool size, which mostly selects
    # a run of consecutive images instead of a random sample.
    if rng_seed is not None:
        seed(rng_seed)

    picked = ([], [], [])  # image indices for train / val / test
    for actor in np.unique(y):
        # Select by mask rather than a start:stop slice: this keeps the
        # actor's last image and does not require the labels to be contiguous.
        pool = np.flatnonzero(y == actor).tolist()
        if len(pool) < needed:
            raise ValueError(
                "actor %r has %d images but %d are required"
                % (actor, len(pool), needed)
            )
        for bucket, size in zip(picked, sizes):
            for _ in range(size):
                # pop() removes the image from the pool, so it cannot be
                # drawn again for this or any later set.
                bucket.append(pool.pop(randint(0, len(pool) - 1)))

    # Index once at the end instead of growing the arrays image by image.
    train_idx, val_idx, test_idx = (
        np.asarray(bucket, dtype=np.intp) for bucket in picked
    )
    return (
        x[:, :, train_idx], y[train_idx],
        x[:, :, val_idx], y[val_idx],
        x[:, :, test_idx], y[test_idx],
    )

In [6]:
# Variant of split_dataset that creates training, validation and test sets for specific actors only
def split_dataset_for_actors(x, y, train_size, val_size, test_size, acts, rng_seed=None):
    """Split the images of selected actors into train, validation and test sets.

    For every distinct actor in ``acts``, ``train_size + val_size + test_size``
    images labelled with that actor are drawn without replacement, so no image
    ends up in more than one of the returned sets.

    Parameters
    ----------
    x : array
        Image stack in which image ``i`` is ``x[:, :, i]``.
    y : array-like
        Labels; ``y[i]`` is the label (actor) of image ``i``.
    train_size, val_size, test_size : int
        Number of images to draw per actor for each set (may be 0).
    acts : sequence
        Actors to include. Duplicates are ignored and the actors are processed
        in the sorted order produced by ``np.unique(acts)``.
    rng_seed : int or None, optional
        When given, Python's global ``random`` generator is seeded once with
        this value so the split is reproducible. The default ``None`` leaves
        the generator state untouched.

    Returns
    -------
    x_train, y_train, x_val, y_val, x_test, y_test
        Image stacks (images along the third axis) and matching label arrays,
        grouped actor by actor within each set.

    Raises
    ------
    ValueError
        If a size is negative, or a requested actor has fewer images in ``y``
        than requested (including an actor that does not occur at all).
    """
    x = np.asarray(x)
    y = np.asarray(y)

    sizes = (train_size, val_size, test_size)
    if min(sizes) < 0:
        raise ValueError("train_size, val_size and test_size must be non-negative")
    needed = sum(sizes)

    if rng_seed is not None:
        seed(rng_seed)

    picked = ([], [], [])  # image indices for train / val / test
    for actor in np.unique(acts):
        # Select by mask rather than a start:stop slice: this keeps the
        # actor's last image and does not require the labels to be contiguous.
        pool = np.flatnonzero(y == actor).tolist()
        if len(pool) < needed:
            raise ValueError(
                "actor %r has %d images but %d are required"
                % (actor, len(pool), needed)
            )
        for bucket, size in zip(picked, sizes):
            for _ in range(size):
                # pop() removes the image from the pool, so it cannot be
                # drawn again for this or any later set.
                bucket.append(pool.pop(randint(0, len(pool) - 1)))

    # Index once at the end instead of growing the arrays image by image.
    train_idx, val_idx, test_idx = (
        np.asarray(bucket, dtype=np.intp) for bucket in picked
    )
    return (
        x[:, :, train_idx], y[train_idx],
        x[:, :, val_idx], y[val_idx],
        x[:, :, test_idx], y[test_idx],
    )

In [None]:
'''
For Part 2
'''

# Split every actor's images: 70 for training, 10 for validation, 10 for testing.
x_train, y_train, x_val, y_val, x_test, y_test = split_dataset(
    x, y, train_size=70, val_size=10, test_size=10
)

# Persist each set to the working directory.
np.save(file="x_train.npy", arr=x_train)
np.save(file="y_train.npy", arr=y_train)
np.save(file="x_val.npy", arr=x_val)
np.save(file="y_val.npy", arr=y_val)
np.save(file="x_test.npy", arr=x_test)
np.save(file="y_test.npy", arr=y_test)
# NOTE: y_val and y_test are saved as well, but must be treated as unknown downstream.

In [10]:
'''
For Part 3
train, val and test sets containing only images of Alec Baldwin and Steve Carell
'''
x_train0, y_train0, x_val0, y_val0, x_test0, y_test0 = split_dataset_for_actors(
    x, y,
    train_size=70, val_size=10, test_size=10,
    acts=["Steve Carell", "Alec Baldwin"],
)

# Show the shape of every image stack next to the shape of its label array.
print(x_train0.shape, y_train0.shape)
print(x_val0.shape, y_val0.shape)
print(x_test0.shape, y_test0.shape)

# Persist each set to the working directory.
np.save(file="x_train0.npy", arr=x_train0)
np.save(file="y_train0.npy", arr=y_train0)
np.save(file="x_val0.npy", arr=x_val0)
np.save(file="y_val0.npy", arr=y_val0)
np.save(file="x_test0.npy", arr=x_test0)
np.save(file="y_test0.npy", arr=y_test0)

In [17]:
'''
For Part 4
training set of only 2 images per actor. Only actors Steve Carell and Alec Baldwin
'''
x_train1, y_train1, x1, y1, x2, y2 = split_dataset_for_actors(
    x, y,
    train_size=2, val_size=10, test_size=10,
    acts=["Steve Carell", "Alec Baldwin"],
)

print(x_train1.shape, y_train1.shape)

# Only the training set is written out; the validation and test sets
# returned by this call (x1, y1, x2, y2) are not saved.
np.save(file="x_train1.npy", arr=x_train1)
np.save(file="y_train1.npy", arr=y_train1)

In [5]:
'''
For Part 5
for classifying male and female
'''

# Images and labels of the other actors, read from the working directory.
x_other_actors = np.load(file="x_other_actors.npy")
y_other_actors = np.load(file="y_other_actors.npy")

# Per-actor split: 70 training, 10 validation and 10 test images.
(
    x_train_other_actors,
    y_train_other_actors,
    x_val_other_actors,
    y_val_other_actors,
    x_test_other_actors,
    y_test_other_actors,
) = split_dataset(
    x_other_actors, y_other_actors, train_size=70, val_size=10, test_size=10
)

In [13]:
# Persist the train / validation / test sets of the other actors.
np.save(file="x_train_other_actors.npy", arr=x_train_other_actors)
np.save(file="y_train_other_actors.npy", arr=y_train_other_actors)
np.save(file="x_val_other_actors.npy", arr=x_val_other_actors)
np.save(file="y_val_other_actors.npy", arr=y_val_other_actors)
np.save(file="x_test_other_actors.npy", arr=x_test_other_actors)
np.save(file="y_test_other_actors.npy", arr=y_test_other_actors)

In [11]:
# Training set of 2 images per actor (validation and test sets keep 10 each).
x_train2, y_train2, x_val2, y_val2, x_test2, y_test2 = split_dataset(
    x, y, train_size=2, val_size=10, test_size=10
)

np.save(file="x_train2.npy", arr=x_train2)
np.save(file="y_train2.npy", arr=y_train2)
np.save(file="x_val2.npy", arr=x_val2)
np.save(file="y_val2.npy", arr=y_val2)
np.save(file="x_test2.npy", arr=x_test2)  # not really needed
np.save(file="y_test2.npy", arr=y_test2)  # not really needed
# NOTE: y_val and y_test are saved as well, but must be treated as unknown downstream.


In [8]:
# Training set of 20 images per actor (validation and test sets keep 10 each).
x_train3, y_train3, x_val3, y_val3, x_test3, y_test3 = split_dataset(
    x, y, train_size=20, val_size=10, test_size=10
)

np.save(file="x_train3.npy", arr=x_train3)
np.save(file="y_train3.npy", arr=y_train3)
np.save(file="x_val3.npy", arr=x_val3)
np.save(file="y_val3.npy", arr=y_val3)
np.save(file="x_test3.npy", arr=x_test3)  # not really needed
np.save(file="y_test3.npy", arr=y_test3)  # not really needed
# NOTE: y_val and y_test are saved as well, but must be treated as unknown downstream.

In [13]:
# Training set of 50 images per actor (validation and test sets keep 10 each).
x_train4, y_train4, x_val4, y_val4, x_test4, y_test4 = split_dataset(
    x, y, train_size=50, val_size=10, test_size=10
)

np.save(file="x_train4.npy", arr=x_train4)
np.save(file="y_train4.npy", arr=y_train4)
np.save(file="x_val4.npy", arr=x_val4)
np.save(file="y_val4.npy", arr=y_val4)
np.save(file="x_test4.npy", arr=x_test4)  # not really needed
np.save(file="y_test4.npy", arr=y_test4)  # not really needed
# NOTE: y_val and y_test are saved as well, but must be treated as unknown downstream.

In [22]:
# Training set of 99 images per actor (validation and test sets keep 10 each).
x_train5, y_train5, x_val5, y_val5, x_test5, y_test5 = split_dataset(
    x, y, train_size=99, val_size=10, test_size=10
)

np.save(file="x_train5.npy", arr=x_train5)
np.save(file="y_train5.npy", arr=y_train5)
np.save(file="x_val5.npy", arr=x_val5)
np.save(file="y_val5.npy", arr=y_val5)
np.save(file="x_test5.npy", arr=x_test5)  # not really needed
np.save(file="y_test5.npy", arr=y_test5)  # not really needed
# NOTE: y_val and y_test are saved as well, but must be treated as unknown downstream.