In [1]:
import numpy as np

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics, cross_validation, grid_search

# Import save and load classifiers utils
from sklearn.externals import joblib

# Import data elaboration utils
from PIL import Image
from StringIO import StringIO
import urllib2
from urlparse import urlparse
from __future__ import division
import os
import csv

In [66]:
# Functions to process RGB images and make them become feature vectors

def process_directory(directory):
    '''Collects the feature vectors of every image file found under a directory.

    The tree rooted at ``directory`` is traversed with ``os.walk`` (called
    with its default arguments) and every file encountered is handed to
    ``process_image_file``. Files for which no feature vector is produced
    (a falsy result) are left out of the returned list.

    Args:
      directory (str): root directory to scan.

    Returns:
      list of list of float: one feature vector per successfully processed
      file, in the order in which ``os.walk`` yields the files.
    '''
    features = []
    for dirpath, _subdirs, filenames in os.walk(directory):
        # Lazily featurize each file of this directory, in listing order.
        candidates = (process_image_file(os.path.join(dirpath, name))
                      for name in filenames)
        # Keep only the files that actually produced a feature vector.
        features.extend(vector for vector in candidates if vector)
    return features

def process_image_files(files):
    '''Computes the feature vectors of an explicit collection of image files.

    Every path is handed to ``process_image_file``; paths for which no
    feature vector is produced (a falsy result) are skipped, so the returned
    list may be shorter than ``files``.

    Args:
      files (iterable of str): paths of the image files to process.

    Returns:
      list of list of float: the feature vectors of the files that could be
      processed, in input order.
    '''
    candidates = (process_image_file(path) for path in files)
    return [vector for vector in candidates if vector]


def process_image_file(image_path):
    '''Given an image path it returns its feature vector.

    The file is opened inside a ``with`` block so that its handle is always
    released, and the open itself is covered by the same error handling as
    the decoding: a path that cannot be opened (missing file, directory,
    permission problem) yields None instead of raising.

    Args:
      image_path (str): path of the image file to process.

    Returns:
      list of float: feature vector on success. None if the file cannot be
      opened or decoded (IOError), or if ``process_image`` returns None for
      it.
    '''
    try:
        with open(image_path, 'rb') as image_fp:
            # The image must be fully processed while the file is still
            # open: PIL reads the pixel data lazily from the file object.
            image = Image.open(image_fp)
            return process_image(image)
    except IOError:
        return None


def process_image_url(image_url):
    '''Given an image URL it returns its feature vector

    Args:
      image_url (str): url of the image to process.

    Returns:
      list of float: feature vector, or whatever ``process_image`` returns
      for the downloaded image (None when it is not an RGB image).

    Raises:
      Any exception raised by urllib2 requests.

      IOError: if the URL does not point to a valid file.
    '''
    parsed_url = urlparse(image_url)
    request = urllib2.Request(image_url)
    # Set a User-Agent and Referer to work around servers that block
    # non-browser user agents and hotlinking. Sorry, it's for science!
    request.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux '
                       'x86_64; rv:31.0) Gecko/20100101 Firefox/31.0')
    # The HTTP header name is spelled "Referer" (single "r"); its value is
    # the root URL of the site hosting the image.
    request.add_header('Referer', '{0}://{1}/'.format(parsed_url.scheme,
                                                      parsed_url.netloc))
    response = urllib2.build_opener().open(request)
    try:
        # Wrap network data in StringIO so that it looks like a file
        net_data = StringIO(response.read())
    finally:
        # Release the connection even if reading the body fails.
        response.close()
    image = Image.open(net_data)
    return process_image(image)


def process_image(image, blocks=6):
    '''Given a PIL Image object it returns its feature vector.

    The feature vector is a normalized colour histogram: each of the R, G
    and B channels (values 0-255) is split into ``blocks`` equal ranges,
    giving ``blocks ** 3`` bins. Each entry is the fraction of the image's
    pixels that fall in the corresponding bin.

    Args:
      image (PIL.Image): image to process.
      blocks (int, optional): number of block to subdivide the RGB space into.

    Returns:
      list of float: feature vector if successful. None if the image is not
      RGB or contains no pixels.
    '''
    if image.mode != 'RGB':
        return None
    feature = [0] * (blocks ** 3)
    pixel_count = 0
    for pixel in image.getdata():
        # Integer arithmetic: floor(value * blocks / 256). This is exact and
        # does not depend on whether true division is enabled.
        ridx = int(pixel[0]) * blocks // 256
        gidx = int(pixel[1]) * blocks // 256
        bidx = int(pixel[2]) * blocks // 256
        # Flatten the (r, g, b) bin coordinates into a single index.
        idx = ridx + gidx * blocks + bidx * blocks * blocks
        feature[idx] += 1
        pixel_count += 1
    if pixel_count == 0:
        # A zero-sized image has no histogram to normalize.
        return None
    total = float(pixel_count)
    return [count / total for count in feature]

In [67]:
# Load data
#
# Reads the labelled sample list and turns every listed photo into a feature
# vector. The CSV rows are accessed through their 'id' column (joined to
# photoDir to form the image path) and their 'is_food' column ('1' for food,
# '0' for not food).

dataDir = './data'
photoDir = './data/train_photos'

with open('./sample_food_no_food.csv') as f:
    food_no_food = list(csv.DictReader(f, skipinitialspace=True))

# Compare the label strings by value (==); identity (is) on strings only
# works by accident of interpreter caching.
positive = process_image_files([os.path.join(photoDir, element['id'])
                                for element in food_no_food
                                if element['is_food'] == '1'])
negative = process_image_files([os.path.join(photoDir, element['id'])
                                for element in food_no_food
                                if element['is_food'] == '0'])

# Negatives first, then positives; labels follow the same layout
# (0 = not food, 1 = food).
data = negative + positive
labels = [0] * len(negative) + [1] * len(positive)

In [109]:
# Split the labelled data into a train set and a test set. The test set will
# contain 25% of the total (test_size=0.25); random_state is fixed so the
# split is reproducible.
x_train, x_test, y_train, y_test = cross_validation.train_test_split(data, labels, test_size=0.25, random_state=6)

# Define the parameter search space.
# The full grid that can be explored is kept below for reference:
# parameters = {'kernel': ['linear', 'rbf'], 'C': [1, 10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001]}
# The grid currently in use contains a single combination.
# NOTE(review): gamma is presumably irrelevant for the 'linear' kernel -- confirm
# against the scikit-learn SVC documentation.
parameters = {'kernel': ['linear'], 'C': [1000], 'gamma': [0.01]}

In [110]:
# Search for the best classifier within the search space and return it
print 'Train set has positive', len([i for i in y_train if i is 1]), 'and negative', len([i for i in y_train if i is 0])

kf = cross_validation.KFold(n=len(x_train), n_folds=20, shuffle=True, random_state=3)
ss = cross_validation.ShuffleSplit(n=len(x_train), test_size=0.25, n_iter=10, random_state=3)

grid = grid_search.GridSearchCV(svm.SVC(verbose=True), parameters, cv=kf)
grid.fit(x_train, y_train)
classifier = grid.best_estimator_

Train set has positive 753 and negative 222
[LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM][LibSVM]

In [111]:
print
print 'Best parameters:', grid.best_params_
print 'Test set has positive', len([i for i in y_test if i is 1]), 'and negative', len([i for i in y_test if i is 0])
print
print 'Best classifier score'
print
print(metrics.classification_report(y_test, classifier.predict(x_test), target_names=['not-food', 'food']))


Best parameters: {'kernel': 'linear', 'C': 1000, 'gamma': 0.01}
Test set has positive 255 and negative 70

Best classifier score

             precision    recall  f1-score   support

   not-food       0.60      0.36      0.45        70
       food       0.84      0.93      0.88       255

avg / total       0.79      0.81      0.79       325



In [112]:
# Save classifier

classifierPath = './foodClassifier/foodClassifier.pkl'

# Make sure the target directory exists so that the dump does not fail on a
# fresh checkout.
classifierDir = os.path.dirname(classifierPath)
if not os.path.isdir(classifierDir):
    os.makedirs(classifierDir)

# Kept as the last expression so the cell still displays the written files.
joblib.dump(classifier, classifierPath)

['./foodClassifier/foodClassifier.pkl',
 './foodClassifier/foodClassifier.pkl_01.npy',
 './foodClassifier/foodClassifier.pkl_02.npy',
 './foodClassifier/foodClassifier.pkl_03.npy',
 './foodClassifier/foodClassifier.pkl_04.npy',
 './foodClassifier/foodClassifier.pkl_05.npy',
 './foodClassifier/foodClassifier.pkl_06.npy',
 './foodClassifier/foodClassifier.pkl_07.npy',
 './foodClassifier/foodClassifier.pkl_08.npy',
 './foodClassifier/foodClassifier.pkl_09.npy',
 './foodClassifier/foodClassifier.pkl_10.npy',
 './foodClassifier/foodClassifier.pkl_11.npy']

In [3]:
# Load classifier
#
# Rebinds `classifier` to the model stored on disk, replacing any classifier
# object currently held in memory under that name.
# NOTE(review): joblib persistence is pickle-based, so only load files from a
# trusted source -- confirm against the joblib documentation.

classifier = joblib.load('./foodClassifier/foodClassifier.pkl') 

In [113]:
# Predict classes for all images in the data folder

picturesFolder = './data/train_photos'

onAll = False
howMany = 30
# Sorted so that the "first howMany files" selection is reproducible;
# os.listdir returns entries in arbitrary order.
files = sorted(f for f in os.listdir(picturesFolder)
               if os.path.isfile(os.path.join(picturesFolder, f)))

# Either every file, or only the first howMany of them.
selected = files if onAll else files[:howMany]

# Featurize file by file and remember which file each vector belongs to.
# Files without a feature vector (e.g. non-RGB images) are skipped, so the
# file names must be tracked alongside the vectors: pairing the predictions
# with the unfiltered file list would attach them to the wrong files.
classifiedFiles = []
features = []
for pic in selected:
    feature = process_image_file(os.path.join(picturesFolder, pic))
    if feature:
        classifiedFiles.append(pic)
        features.append(feature)

predictables = np.array(features)

# Predict in one batch; skip the call when nothing could be featurized.
if len(predictables) > 0:
    predicted = list(classifier.predict(predictables))
else:
    predicted = []

# (file name, predicted label) pairs, aligned one to one.
classification = zip(classifiedFiles, predicted)

In [114]:
# Store classification
#
# Writes a header line followed by one (filename, food) line per classified
# picture to classifiedFood.csv inside savePath.

savePath = './'

with open(os.path.join(savePath, 'classifiedFood.csv'), 'wb') as out:
    csv_out = csv.writer(out)
    csv_out.writerow(['filename', 'food'])
    # Emit all the classification rows in a single call.
    csv_out.writerows(classification)