In [1]:
import numpy as np

# Import datasets, classifiers and performance metrics
from sklearn import datasets, svm, metrics, cross_validation, grid_search, linear_model, neural_network

# Import save and load classifiers utils
from sklearn.externals import joblib

# Import data access utils
import os
import csv

# Other Tools
import re
import itertools

In [2]:
def loadFeatures(folder):
    """Load every file in `folder` with np.load and return (name, data) pairs.

    Parameters
    ----------
    folder : str
        Directory to read. Only regular files directly inside it are loaded;
        sub-directories are skipped.

    Returns
    -------
    list of (str, object)
        One tuple per file: the file name with a trailing '.npy' extension
        removed, and the value np.load returned for that file. The list is
        sorted by name.
    """
    files = [f for f in os.listdir(folder) if os.path.isfile(os.path.join(folder, f))]
    # Strip exactly the '.npy' suffix. The earlier pattern r'\.npy*$' also
    # matched '.np' and '.npyy...' because '*' applied to the 'y' alone.
    # Each loaded array is kept as-is instead of first being stacked into one
    # big intermediate array, which copied all the data a second time.
    named = [(re.sub(r'\.npy$', '', f), np.load(os.path.join(folder, f))) for f in files]
    return sorted(named, key=lambda pair: pair[0])

def createFeaturesDictionary(*features):
    """Merge several feature sets into one record per identifier.

    Parameters
    ----------
    *features : sequences of (id, array) pairs
        Each positional argument is one feature set, e.g. the list returned
        by loadFeatures. Ids are used as dictionary keys, so they must be
        hashable (loadFeatures yields strings).

    Returns
    -------
    (resultingMap, featuresDetails)
        resultingMap is a list of dicts in first-seen id order. Each dict
        has 'id', one entry per feature set keyed by the set's position as a
        string ('0', '1', ...) holding the original array, and 'flat', a 1-D
        array of all of that id's values joined in feature-set order.
        featuresDetails has one dict per feature set with 'feature',
        'length', 'size' and 'shape', taken from the set's first element.

    Notes
    -----
    An empty feature set fails on the `featureSet[0]` lookup. A size
    mismatch inside a set is only reported on stdout; processing continues.
    """
    resultingMap = []
    featuresDetails = []
    # id -> record, so each lookup is O(1) instead of a scan over resultingMap.
    recordsById = {}

    for feature, featureSet in enumerate(features):
        reference = featureSet[0][1]
        featuresDetails.append({'feature': feature, 'length': len(reference), 'size': reference.size, 'shape': reference.shape})
        for element in featureSet:
            if element[1].size != reference.size:
                print("!!!!!!!! FEATURE ARRAY MISMATCH! WILL NEVER WORK!!!!!")
            record = recordsById.get(element[0])
            if record is not None:
                record[str(feature)] = element[1]
                # np.append without an axis flattens both operands.
                record['flat'] = np.append(record['flat'], element[1])
            else:
                # Ravel up front so 'flat' is 1-D even when an id appears in
                # only one feature set.
                record = {'id': element[0], str(feature): element[1], 'flat': np.ravel(element[1])}
                recordsById[element[0]] = record
                resultingMap.append(record)

    return resultingMap, featuresDetails

def classifierEstimation(classifier, x_test, y_test):
    """Print the test-set class balance and a classification report.

    Parameters
    ----------
    classifier : object with a predict(x) method
        Fitted estimator to evaluate.
    x_test : samples passed to classifier.predict.
    y_test : iterable of labels
        True labels; entries equal to 1 are counted as positive and entries
        equal to 0 as negative. The report names the classes 'not-food' and
        'food' (presumably 0 and 1 respectively - confirm against the labels).

    Returns
    -------
    None. Everything is written to stdout.
    """
    # Compare by value. The previous `i is 1` / `i is 0` tested object
    # identity, which is never true for numpy integer scalars, so both counts
    # were reported as 0.
    positives = sum(1 for label in y_test if label == 1)
    negatives = sum(1 for label in y_test if label == 0)

    print("")
    print('Test set has positive %d and negative %d' % (positives, negatives))
    print("")
    print('Best classifier score')
    print("")
    print(metrics.classification_report(y_test, classifier.predict(x_test), target_names=['not-food', 'food']))

In [3]:
# Merge labels with data
#for element in fmap:
#    try:
#        label_index = next(index for (index, d) in enumerate(food_no_food) if d["id"] == element['id'])
#        element['label'] = food_no_food[label_index]['is_food']
#    except(StopIteration):
#        print "This id was not found:", element['id']

In [4]:
def runSVM(data, labels):
    """Fit an SVM by cross-validated grid search and report it on a hold-out set.

    Parameters
    ----------
    data : array-like, one row per sample
        Feature matrix. Plain Python lists are converted to an ndarray.
    labels : array-like
        One label per row of `data`.

    Returns
    -------
    The best estimator found by the grid search (grid.best_estimator_).

    Side effects
    ------------
    Prints the input shapes, the training-set class balance and, through
    classifierEstimation, a report on the held-out test set.
    """
    # Callers may hand over a list of rows (tryAllFeatures does); `.shape`
    # below needs an array. Objects that already expose `.shape` are left
    # untouched.
    if not hasattr(data, 'shape'):
        data = np.asarray(data)
    if not hasattr(labels, 'shape'):
        labels = np.asarray(labels)
    print('{0} {1}'.format(data.shape, labels.shape))

    # Split the data into a train set and a test set. The test set holds 25%
    # of the samples (test_size=0.25); the seed keeps the split repeatable.
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(data, labels, test_size=0.25, random_state=6)
    positives = sum(1 for label in y_train if label == 1)
    negatives = sum(1 for label in y_train if label == 0)
    print('Train set has positive %d and negative %d' % (positives, negatives))

    # Parameter search space. The wider grid is kept for reference:
    # parameters = {'kernel': ['linear', 'rbf'], 'C': [1, 10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001]}
    parameters = {'kernel': ['linear'], 'C': [1000], 'gamma': [0.01]}

    # Search for the best classifier within the search space using 5 shuffled
    # folds of the training set.
    kf = cross_validation.KFold(n=len(x_train), n_folds=5, shuffle=True, random_state=3)
    # ss = cross_validation.ShuffleSplit(n=len(x_train), test_size=0.25, n_iter=10, random_state=3)

    grid = grid_search.GridSearchCV(svm.SVC(verbose=True), parameters, cv=kf)
    grid.fit(x_train, y_train)

    # print 'Best parameters:', grid.best_params_

    classifierEstimation(grid.best_estimator_, x_test, y_test)

    return grid.best_estimator_

In [5]:
def equalSize(*arrays):
    """Tell whether every argument has the same len() as the first one.

    Parameters
    ----------
    *arrays : sized objects
        Anything that supports len().

    Returns
    -------
    bool
        True when all lengths match (also when called with zero or one
        argument). False at the first mismatch, after printing the position
        of the offending argument.
    """
    if not arrays:
        # Nothing to compare; previously this raised IndexError.
        return True

    size = len(arrays[0])

    for i, candidate in enumerate(arrays[1:], 1):
        if len(candidate) != size:
            print("Faulty is number %d" % i)
            return False

    return True

def tryAllCombinations(labels, *features):
    """Run runSVM once for every unordered pair of feature sets.

    Parameters
    ----------
    labels : sized sequence
        One label per sample.
    *features : sized sequences
        Feature sets; each must have the same length as `labels`, which is
        checked with equalSize. Element k of a set is sample k's features.

    Returns
    -------
    None. If the lengths disagree a message is printed and nothing is run.
    The estimators returned by runSVM are not kept.
    """
    if equalSize(labels, *features):
        print("Arrays pased are OK")
    else:
        print("Arrays passed are NOT OK!")
        return

    for i in range(len(features)):
        # Single-feature runs are disabled; only the progress line remains so
        # the printed log keeps its original shape.
        print("Trying with f %d" % i)

        for j in range(i + 1, len(features)):
            # One row per sample: the sample's values from set i followed by
            # its values from set j (np.append flattens both).
            data = np.array([np.append(left, right) for left, right in zip(features[i], features[j])])
            print("Trying with f %d %d" % (i, j))
            runSVM(data, labels)

def tryAllFeatures(labels, *features):
    """Run runSVM once on all feature sets joined together per sample.

    Parameters
    ----------
    labels : sized sequence
        One label per sample.
    *features : sized sequences
        Feature sets; each must have the same length as `labels`, which is
        checked with equalSize. Element k of a set is sample k's features.

    Returns
    -------
    None. If the lengths disagree a message is printed and nothing is run.
    The estimator returned by runSVM is not kept.
    """
    if equalSize(labels, *features):
        print("Arrays pased are OK")
    else:
        print("Arrays passed are NOT OK!")
        return

    # One row per sample: that sample's values from every feature set,
    # flattened and joined in argument order. The result is an ndarray
    # because runSVM reads `data.shape`; the plain list built here before
    # made that call fail. Flattening each piece matches what
    # tryAllCombinations does through np.append.
    data = np.array([
        np.concatenate([np.ravel(featureSet[i]) for featureSet in features])
        for i in range(len(features[0]))
    ])

    print("Trying with all features")
    runSVM(data, labels)

In [6]:
# Read the labelled sample list: one plain dict per CSV row, keyed by the
# column names in the header.
with open('./sample_food_no_food.csv') as f:
    food_no_food = []
    for row in csv.DictReader(f, skipinitialspace=True):
        food_no_food.append(dict(row))

# Labels as int64, ordered by sample id. Presumably this is meant to line up
# with the name-sorted output of loadFeatures - verify that the ids match the
# feature file names.
labels = np.array([
    np.int64(entry['is_food'])
    for entry in sorted(food_no_food, key=lambda entry: entry['id'])
])

In [7]:
# Directories of feature files, one directory per feature type; each is read
# with loadFeatures.
chris = './data/svmFeature'
sobel = './data/edge_array_sobel'
roberts = './data/edge_array_roberts'
watershed = './data/watershed_array'
skeleton = './data/array_skeleton'
binary = './data/binarization_array'
# Kept under their own names so `a05` / `a65` only ever refer to the loaded
# arrays, instead of being a path string first and an array afterwards.
a05_path = './data/con_array_0.5'
a65_path = './data/con_array_0.65'


def _loadFeatureMatrix(folder):
    """Return the arrays from loadFeatures(folder), in name order, as one ndarray."""
    return np.array([x[1] for x in loadFeatures(folder)])


aC = _loadFeatureMatrix(chris)
aSo = _loadFeatureMatrix(sobel)
aR = _loadFeatureMatrix(roberts)
aW = _loadFeatureMatrix(watershed)
aSk = _loadFeatureMatrix(skeleton)
aB = _loadFeatureMatrix(binary)
a05 = _loadFeatureMatrix(a05_path)
a65 = _loadFeatureMatrix(a65_path)

In [None]:
# Train and evaluate one SVM on all eight feature sets joined per sample.
# NOTE(review): the stored output under this cell shows "Trying with f 0 1",
# a message printed by tryAllCombinations rather than by this call - it looks
# stale; re-run the cell to confirm.
tryAllFeatures(labels, aC, aSo, aR, aW, aSk, aB, a05, a65)

Arrays pased are OK
Trying with f 0
Trying with f 0 1
(1300, 90216) (1300,)
Train set has positive 745 and negative 230
[LibSVM]