In [None]:
'''
PURPOSE: 
Download images into folder
    images/[actor]/[actor]_unprocessed
Process and save downloaded images to folder
    images/[actor]/[actor]_processed
Create dataset: x, y
'''

In [None]:
import urllib
import os
import numpy as np
import pandas as pd
from random import randrange
import sys
from scipy.misc import imread, imshow, imresize, imsave
import matplotlib.pyplot as plt

In [None]:
def timeout(func, args=(), kwargs=None, timeout_duration=1, default=None):
    '''Run func(*args, **kwargs) in a worker thread and wait a bounded time.

    Adapted from:
    http://code.activestate.com/recipes/473878-timeout-function-using-threading/

    Arguments:
    func -- callable to execute
    args -- positional arguments passed to func
    kwargs -- keyword arguments passed to func (None means no keyword
    arguments)
    timeout_duration -- maximum number of seconds to wait for func
    default -- value returned when func raises an exception

    Returns:
    The value returned by func, `default` if func raised an exception, or
    False if func was still running when the wait expired. The worker is
    not stopped on expiry; it keeps running in the background.
    '''
    import threading

    # Avoid sharing one mutable dict between calls.
    if kwargs is None:
        kwargs = {}

    class InterruptableThread(threading.Thread):
        def __init__(self):
            threading.Thread.__init__(self)
            # Daemon thread: a call that never returns must not keep the
            # interpreter alive after the caller has given up on it.
            self.daemon = True
            self.result = None

        def run(self):
            try:
                self.result = func(*args, **kwargs)
            except Exception:
                self.result = default

    it = InterruptableThread()
    it.start()
    it.join(timeout_duration)
    # is_alive() is available on Python 2.6+ and 3.x; isAlive() was removed
    # in Python 3.9.
    if it.is_alive():
        return False
    return it.result

In [None]:
def rgb2gray(rgb):
    '''Convert an RGB image to grayscale.

    Arguments:
    rgb -- an RGB image as a numpy array of size n x m x 3 with values in
    the range 0..255

    Returns:
    A 2D numpy array holding the weighted sum of the three channels,
    divided by 255 so that the values lie in the range 0..1
    '''
    # Pull out the three colour planes along the last axis.
    red, green, blue = (rgb[:, :, channel] for channel in range(3))

    weighted_sum = 0.2989 * red + 0.5870 * green + 0.1140 * blue
    return weighted_sum / 255.

In [None]:
def process_img(image, coords):
    '''Crop a region out of an image, convert it to grayscale and resize it
    to 32 x 32.

    Arguments:
    image -- image as a 3D numpy array (rows x columns x channels)
    coords -- indexable sequence of four integers; coords[0] and coords[2]
    bound the columns of the crop, coords[1] and coords[3] bound the rows

    Returns:
    The result of imresize on the cropped grayscale image with target
    size (32, 32)
    '''
    col_start, row_start = coords[0], coords[1]
    col_stop, row_stop = coords[2], coords[3]

    cropped = image[row_start:row_stop, col_start:col_stop, :]
    return imresize(rgb2gray(cropped), (32,32))

In [None]:
def to_list(coord_string):
    '''Parse a comma-separated string of integers into a list of ints.

    Arguments:
    coord_string -- text such as "83,94,218,229"

    Returns:
    A list of ints, one per comma-separated field. A real list is built
    (rather than returning map(...)) so the result can be indexed on
    Python 3 as well as Python 2.

    Raises:
    ValueError -- if any field is not a valid integer
    '''
    return [int(field) for field in coord_string.split(",")]


In [None]:
# Names of the actors to keep, read from subset_actors.txt (one per line).
# The file is closed as soon as it has been read, blank lines are skipped so
# that no empty actor name ends up in the list, and duplicates are removed.
# Sorting makes the order reproducible between runs.
with open("subset_actors.txt") as actors_file:
    act = sorted(set(line.split("\n")[0] for line in actors_file if line.strip()))

In [None]:
# Load the tab-separated, headerless listing from all_acts.txt, name its six
# columns, then keep only the columns used later (Name, URL, Coordinates).
facescrub_acts = pd.read_table("all_acts.txt", sep="\t", header=None)
facescrub_acts.columns = ["Name", "Col1", "Col2", "URL", "Coordinates", "Col5"]
facescrub_acts_trimmed = facescrub_acts.drop(["Col1", "Col2", "Col5"], axis=1)

In [None]:
#create the root directory that holds every actor's images
current_directory = os.getcwd()
images = os.path.join(current_directory, r'images')
if not os.path.exists(images):
    os.makedirs(images)

#create the unprocessed/processed directories for each actor
# Each sub-directory is checked on its own, so a partially created tree
# (for example left behind by an interrupted run) is completed instead of
# being skipped because the actor directory already exists.
for actor in act:
    actor_directory = os.path.join(images,actor)
    for suffix in (r'_unprocessed', r'_processed'):
        sub_directory = os.path.join(actor_directory, actor+suffix)
        if not os.path.isdir(sub_directory):
            # makedirs also creates actor_directory when it is missing
            os.makedirs(sub_directory)


In [None]:
#download and process images and create dataset
#
# For every listed image whose actor is in `act`:
#   1. download it into images/[actor]/[actor]_unprocessed
#   2. crop / grayscale / resize it and save the result into
#      images/[actor]/[actor]_processed
#   3. record the processed image and the actor name for the dataset
# Afterwards x holds the processed images stacked along the third axis and
# y holds the matching actor names.

# urlretrieve lives in urllib.request on Python 3 and in urllib on Python 2.
try:
    from urllib.request import urlretrieve
except ImportError:
    from urllib import urlretrieve

#one awkward link that asks for a password...
PASSWORD_PROTECTED_URL = "http://www.sawfirst.com/wp-content/uploads/20"
#don't wait for images that take longer than this many seconds to download
DOWNLOAD_TIMEOUT_SECONDS = 2

wanted_actors = set(act)   # set membership instead of scanning the list per row
processed_images = []      # one processed image per successful download
labels = []                # actor name for each entry of processed_images

i=0  # number of successfully processed images; also used in the file names
for _, row in facescrub_acts_trimmed.iterrows():
    actor = row['Name']
    if actor not in wanted_actors:
        continue
    if row['URL'] == PASSWORD_PROTECTED_URL:
        continue

    image_name = actor+'_{}.jpg'.format(i)
    unprocessed_dir = os.path.join(images,actor,actor+r'_unprocessed')
    processed_dir = os.path.join(images,actor, actor+r'_processed')
    image_name_w_path = os.path.join(unprocessed_dir,image_name)
    processed_image_w_path = os.path.join(processed_dir,image_name)

    try:
        download = timeout(urlretrieve, (row['URL'],image_name_w_path), {},
                           DOWNLOAD_TIMEOUT_SECONDS)
        # timeout() returns False when the wait expired and None when the
        # download raised. In both cases the file on disk may be missing,
        # partial, or left over from an earlier run, so do not read it.
        if not download:
            raise IOError("download failed or timed out")

        image = imread(image_name_w_path) #raises if the file is not a readable image
        # list(...) so the coordinates can be indexed even if to_list
        # returns an iterator.
        image_processed = process_img(image, list(to_list(row['Coordinates'])))
        imsave(processed_image_w_path, image_processed)

        processed_images.append(image_processed)
        labels.append(actor)

        i+=1 #only incremented when the image was downloaded and processed
        print(image_name_w_path)
        print("working:     "+row['URL'])

    except Exception:
        #broken url, unreadable image or failed processing: skip this image
        print("not working:     "+row['URL'])

#create data set
# Stack once at the end instead of re-copying the whole array per image.
# x is always 3D (height x width x number of images), including when only
# one image was processed.
if processed_images:
    x = np.dstack(processed_images)
    y = np.array(labels)
else:
    # Nothing was processed: keep x and y defined so later cells still run.
    # (32, 32) matches the size process_img resizes to.
    x = np.empty((32, 32, 0))
    y = np.array([], dtype=str)


In [None]:
#save dataset
# Write x and y as .npy files into the current working directory.
output_directory = os.getcwd()
for file_name, array in (("x.npy", x), ("y.npy", y)):
    np.save(os.path.join(output_directory, file_name), array)
