In [11]:
import numpy as np

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics, cross_validation, grid_search

# Import save and load classifiers utils
from sklearn.externals import joblib

# Import data elaboration utils
from PIL import Image
from StringIO import StringIO
import urllib2
from urlparse import urlparse
from __future__ import division
import os
import csv
import re

In [2]:
# Functions that turn RGB images into feature vectors

def process_image_file(image_path):
    '''Given an image path it returns its feature vector.

    The file is read fully into memory and closed before decoding, so no
    file handle outlives the call.

    Args:
      image_path (str): path of the image file to process.

    Returns:
      list of float: feature vector on success. None if decoding raises
      IOError or if process_image returns None for the decoded image.

    Raises:
      IOError: if the file itself cannot be opened or read. This happens
        before decoding and is deliberately not swallowed.
    '''
    # Read through a context manager so the handle is closed deterministically
    # instead of relying on garbage collection.
    with open(image_path, 'rb') as image_file:
        image_fp = StringIO(image_file.read())
    try:
        image = Image.open(image_fp)
        return process_image(image)
    except IOError:
        # Raised by PIL when the bytes cannot be identified/decoded as an image.
        return None


def process_image(image, blocks=6):
    '''Given a PIL Image object it returns its feature vector.

    The feature vector is a normalised colour histogram: every RGB channel is
    split into `blocks` equal ranges, which gives blocks**3 bins, and each bin
    holds the fraction of the image's pixels that fall into it.

    Args:
      image (PIL.Image): image to process.
      blocks (int, optional): number of blocks to subdivide each RGB channel
        into. Must be at least 1.

    Returns:
      list of float: feature vector of length blocks**3 if successful. None
      if the image is not RGB or contains no pixels.

    Raises:
      ValueError: if the image is RGB and blocks is smaller than 1.
    '''
    if image.mode != 'RGB':
        return None
    if blocks < 1:
        raise ValueError('blocks must be at least 1')
    feature = [0] * (blocks * blocks * blocks)
    pixel_count = 0
    for pixel in image.getdata():
        # Exact integer binning: floor(value * blocks / 256). Unlike the float
        # form value / (256 / blocks) this cannot mis-round and does not depend
        # on `from __future__ import division` being active.
        ridx = pixel[0] * blocks // 256
        gidx = pixel[1] * blocks // 256
        bidx = pixel[2] * blocks // 256
        idx = ridx + gidx * blocks + bidx * blocks * blocks
        feature[idx] += 1
        pixel_count += 1
    if pixel_count == 0:
        # Nothing to normalise by; report failure instead of dividing by zero.
        return None
    # Force true division regardless of the interpreter's division mode.
    total = float(pixel_count)
    return [count / total for count in feature]

In [21]:
# Work out which photos in the data folder still need a feature file

# Folder holding the source photos, and the folder whose existing files are
# treated as already-processed results.
picturesFolder = './data/train_photos'
desinationFolder = './data/featurePictures'

# Keep regular files only; sub-directories are ignored (listing is not recursive).
originals = [
    entry for entry in os.listdir(picturesFolder)
    if os.path.isfile(os.path.join(picturesFolder, entry))
]
processed = [
    entry for entry in os.listdir(desinationFolder)
    if os.path.isfile(os.path.join(desinationFolder, entry))
]

def diff(first, second):
    '''Returns the items of `first` that do not appear in `second`.

    Order and duplicates of `first` are preserved.

    Args:
      first (iterable): items to filter.
      second (iterable of hashable): items to exclude.

    Returns:
      list: items of `first` that are absent from `second`.
    '''
    excluded = set(second)
    remaining = []
    for candidate in first:
        if candidate in excluded:
            continue
        remaining.append(candidate)
    return remaining
    
# Originals that have no matching '<name>.npy' file among `processed`.
# Only the suffix appended here is sliced off again: str.replace would also
# remove a '.npy' occurring in the middle of a file name.
processables = [s[:-len('.npy')] for s in diff([i + '.npy' for i in originals], processed)]

In [22]:
# Load data

dataDir = './data'
photoDir = './data/train_photos'

# Read the whole CSV into a list with one plain dict per row, keyed by the
# header names; skipinitialspace drops blanks that follow a delimiter.
with open('./sample_food_no_food.csv') as f:
    food_no_food = [
        dict(row) for row in csv.DictReader(f, skipinitialspace=True)
    ]
    

# Values of the 'id' column, one per CSV row.
train = [element['id'] for element in food_no_food]
# Ids that have no matching '<id>.npy' file among `processed`. Only the suffix
# appended here is sliced off again: str.replace would also remove a '.npy'
# occurring in the middle of an id.
processables = [s[:-len('.npy')] for s in diff([i + '.npy' for i in train], processed)]

In [20]:
# Compute and store the feature vector of every picture still to be processed.
for pic in processables:
    features = process_image_file(os.path.join(picturesFolder, pic))
    if features is None:
        # No feature vector could be produced. Saving np.array(None) would
        # write a pickled 0-d object array that later counts as "processed",
        # so skip the picture instead.
        continue
    featureArray = np.array(features)
    # np.save appends '.npy' to the file name when it is not already present.
    np.save(os.path.join(desinationFolder, pic), featureArray)

In [23]:
# Number of pictures that still need a feature file.
print(len(processables))

0
