In [None]:
import csv
import numpy as np
import math

In [None]:
with open('./train.csv') as f:
    food_to_label = []
    for row in csv.DictReader(f, skipinitialspace=True):
        element = {}
        for k, v in row.items():
            if k == "business_id":
                element['id'] = str(v)
            elif k == "labels":
                labels_raw = np.array(str(v).split(' '))
                labels = [0] * 9
                labels_int = []
                try:
                    for lb in labels_raw:
                        labels[int(str(lb))] = 1
                        labels_int.append(int(lb))
                except ValueError:
                    print "Failure with value", lb, "labels lenght", len(labels_raw), "content:", v
                element['labels'] = labels
                element['labels_raw'] = labels_int
            else :
                print "No idea what you just passed!"
        
        if len(element['labels_raw']) is not 0:
            food_to_label.append(element)
        else:
            print "Business", element['id'], "has no labels and is being ignored!"

if len(set([element['id'] for element in food_to_label])) != len(food_to_label):
    print('something\'s wrong!')

In [None]:
def bestScoring(element, n=3):
    """Return the `n` largest distinct values of `element`, highest first.

    element: iterable of hashable, mutually comparable scores.
    n: number of values wanted; zero or a negative number yields [].

    A score that occurs several times is reported once. When `element`
    contains fewer than `n` distinct values, all of them are returned
    (possibly an empty list) instead of raising.
    """
    if n <= 0:
        return []
    # set() drops duplicate scores, so each value appears at most once.
    return sorted(set(element), reverse=True)[:n]
    
def getIndexes(values, element):
    """Return, for each entry of `values`, the position of its first occurrence in `element`.

    Lookup is done with `element.index`, so a value that is absent from
    `element` raises ValueError.
    """
    positions = []
    for wanted in values:
        positions.append(element.index(wanted))
    return positions

def getCombinations(array):
    """Return every non-empty combination of the items of `array`, as lists.

    array: sequence of items (a list of label indices in this notebook).

    The order of the result is part of the contract, because callers scan
    it front to back: the single item [array[0]] comes first, followed by
    each combination of the remaining items, each immediately followed by
    the same combination with array[0] appended.
    For [1, 2, 3] the result is
    [[1], [2], [2, 1], [3], [3, 1], [3, 2], [3, 2, 1]].

    An empty `array` yields [].
    """
    result = []
    # Build from the last item backwards, which gives the same ordering as
    # recursing on array[1:] without deep recursion.
    for item in reversed(array):
        expanded = [[item]]
        for sub in result:
            expanded.append(sub)
            expanded.append(sub + [item])
        result = expanded
    return result
    
def combine(n = 8):
    """List every non-empty subset of {0, ..., n} as a list of numbers.

    A negative `n` gives []. Otherwise the result starts with [n], followed
    by each subset of {0, ..., n-1} and, right after it, that subset with
    `n` appended. For example combine(1) == [[1], [0], [0, 1]].
    """
    # Collect n, n-1, ... down to the last non-negative value.
    values = []
    current = n
    while current >= 0:
        values.append(current)
        current = current - 1

    # Grow the subset list from the smallest value upwards.
    subsets = []
    for value in reversed(values):
        grown = [[value]]
        for subset in subsets:
            grown.append(subset)
            grown.append(subset + [value])
        subsets = grown
    return subsets

In [None]:
# Every non-empty subset of the label indices 0..8, as sets.
# Index k here pairs with proportions[k] once the proportions are computed.
combinations = list(map(set, combine()))

In [None]:
# proportions[k] is the share of training businesses whose label set is
# exactly combinations[k].
# Each business is tallied once under its frozen label set, so the label
# sets are not rebuilt for every one of the combinations.
labelSetCounts = {}
for element in food_to_label:
    labelSet = frozenset(element['labels_raw'])
    labelSetCounts[labelSet] = labelSetCounts.get(labelSet, 0) + 1

totalBusinesses = float(len(food_to_label))
proportions = []
for combination in combinations:
    share = labelSetCounts.get(frozenset(combination), 0) / totalBusinesses
    proportions.append(share)

In [None]:
# Load the all-classes prediction file: one plain dict per CSV row,
# keyed by column name, values exactly as the csv reader yields them.
with open('./all_classes_prediction.csv') as f:
    allClasses = []
    for row in csv.DictReader(f, skipinitialspace=True):
        allClasses.append(dict(row))

In [None]:
# Load the one-vs-all prediction file: one plain dict per CSV row,
# keyed by column name, values exactly as the csv reader yields them.
with open('./one_vs_all_class_prediction.csv') as f:
    oneVsAll = []
    for row in csv.DictReader(f, skipinitialspace=True):
        oneVsAll.append(dict(row))

In [None]:
i = 100
print combinations[i], "happens with", proportions[i]*100, "%"
print "Most likely happens at", max(proportions)*100, "% and is", combinations[proportions.index(max(proportions))]
print "All proportions should sum up to 1:", sum(proportions)
print "Lengths of all-vs-all & one-vs-all are same:", len(oneVsAll) == len(allClasses)
print "Last elements in all-vs-all & one-vs-all are same:", oneVsAll[len(oneVsAll)-1]['id'] == allClasses[len(allClasses)-1]['id']

In [None]:
# Softmax-normalise the nine one-vs-all scores of every row so that they
# sum to 1 within that row.
# The denominator has to be computed per row: a total taken from the first
# row only leaves all other rows un-normalised. Dividing a row by any
# positive constant keeps the ranking of its classes unchanged.
oneVsAllNorm = []
for element in oneVsAll:
    temp = element.copy()
    exps = [math.exp(float(element[str(k)])) for k in range(9)]
    total = sum(exps)
    for e in range(9):
        temp[str(e)] = exps[e] / total
    oneVsAllNorm.append(temp)

In [None]:
for j in range(4):
    print oneVsAll[j]['id'], allClasses[j]['id']
    oneVsAllBest = bestScoring([oneVsAllNorm[j][str(v)] for v in range(9)], 2)
    oneVsAllBestIndexes = getIndexes(oneVsAllBest, [oneVsAllNorm[j][str(v)] for v in range(9)])
    combIndex = combinations.index(set(oneVsAllBestIndexes))
    print oneVsAllBestIndexes, "happens at", proportions[combIndex]

    
    allBest = bestScoring([float(allClasses[j][str(v)]) for v in range(9)], 2)
    allBestIndexes = getIndexes(allBest, [float(allClasses[j][str(v)]) for v in range(9)])
    allIndex = combinations.index(set(allBestIndexes))
    print allBestIndexes, "happens at", proportions[allIndex]
    
    for i in range(9):
        print "\t", str(i), "{:.2f}".format(float(oneVsAll[j][str(i)])), "{:.2f}".format(oneVsAllNorm[j][str(i)]), "{:.2f}".format(float(allClasses[j][str(i)]))

In [None]:
def getPredictions(unfiltered, normalized, support, lowerThreshold=.25, upperThreshold=.7):
    """Choose the label indices (0-8) to predict for one row of scores.

    unfiltered, normalized, support: dict-like rows describing the same
        item. Each must hold an 'id' plus the keys '0'..'8', whose values
        are convertible to float.
    lowerThreshold: classes whose `unfiltered` score is <= this are rejected.
    upperThreshold: classes whose `unfiltered` score is >= this are accepted.
        Everything in between is undecided.

    Returns a list of label indices. When the three rows do not share the
    same 'id', prints "error" and returns None.

    Reads the module-level tables `proportions` / `combinations` and uses
    the helpers `bestScoring`, `getIndexes` and `getCombinations`.
    """
    if not (unfiltered['id'] == normalized['id'] and unfiltered['id'] == support['id']):
        print("error")
        return

    # Per-class scores from the three rows, indexed by label.
    u = []
    n = []
    s = []

    # Partition of the labels by their unfiltered score.
    yes = []
    no = []
    left = []

    for i in range(9):
        u.append(float(unfiltered[str(i)]))
        n.append(float(normalized[str(i)]))
        s.append(float(support[str(i)]))

        if u[i] >= upperThreshold:
            yes.append(i)
        elif u[i] <= lowerThreshold:
            no.append(i)
        else:
            left.append(i)

    def strengthOf(chosen):
        # Summed `normalized` scores of the chosen labels plus the share of
        # training label sets that equal exactly this set.
        return sum([n[i] for i in chosen]) + proportions[combinations.index(set(chosen))]

    if len(yes) > 0:
        strength = strengthOf(yes)
    elif len(no) > 5:
        # Most classes were rejected outright: keep just the undecided ones.
        return left
    elif len(left) > 5:
        # Mostly undecided: start from the three best `support` scores.
        yes = getIndexes(bestScoring(s, 3), s)
        # NOTE(review): label 0 gets its own cut-off of .27 - rationale not visible here.
        if 0 in left and 0 not in yes and u[0] > .27:
            yes.append(0)
        left = [i for i in left if i not in yes]
        strength = strengthOf(yes)
    else:
        # Undecided labels become the selection; rejected ones become the pool.
        yes = left
        left = no
        strength = 0

    # Keep drawing labels from the pool until the selection is strong enough
    # or the pool is exhausted.
    while left:
        if strength > .4:
            break

        # Best remaining label under each of the two scorings
        # (the earliest label wins ties).
        maxNormalized = left[0]
        maxSuppport = left[0]
        for i in left:
            if n[i] > n[maxNormalized]:
                maxNormalized = i
            if s[i] > s[maxSuppport]:
                maxSuppport = i

        candidates = list(yes)
        candidates.append(maxNormalized)
        left.remove(maxNormalized)
        if maxSuppport != maxNormalized:
            candidates.append(maxSuppport)
            left.remove(maxSuppport)

        if len(yes) > 0:
            currentShare = proportions[combinations.index(set(yes))]
            chosen = None
            # NOTE(review): the LAST combination that beats the current
            # selection wins, not the most frequent one - confirm intent.
            for comb in getCombinations(candidates):
                if proportions[combinations.index(set(comb))] > currentShare:
                    chosen = comb

            if chosen is not None:
                yes.extend([i for i in chosen if i not in yes])

            strength = strengthOf(yes)

    return yes

In [None]:
# Run the label-selection heuristic on every row; keep (row id, chosen labels) pairs.
# NOTE(review): allClasses is passed in the `normalized` slot and oneVsAllNorm
# in the `support` slot of getPredictions - confirm this pairing is intended.
predictions = []
for i, scores in enumerate(oneVsAll):
    prediction = getPredictions(scores, allClasses[i], oneVsAllNorm[i])
    predictions.append((scores['id'], prediction))

In [180]:
# Load the photo-to-business mapping: one dict per CSV row, every value forced to str.
with open('./test_photo_to_biz.csv') as f:
    businesses = []
    for row in csv.DictReader(f, skipinitialspace=True):
        businesses.append(dict((k, str(v)) for k, v in row.items()))

In [183]:
# Distinct business ids that appear in the photo-to-business mapping.
unique = {row['business_id'] for row in businesses}

In [233]:
# Turn per-photo predictions into one label string per business.

# Index both inputs once, so each business does not have to rescan the
# whole mapping and the whole prediction list.
predictionsByPhoto = {}   # photo file name -> list of predicted label lists
for photoName, predicted in predictions:
    predictionsByPhoto.setdefault(photoName, []).append(predicted)

picturesByBusiness = {}   # business id -> set of its photo file names
for element in businesses:
    picturesByBusiness.setdefault(element['business_id'], set()).add(element['photo_id'] + '.jpg')

final = []
for business in unique:
    pictures = picturesByBusiness.get(business, set())
    labels = []
    for picture in pictures:
        labels.extend(predictionsByPhoto.get(picture, []))

    # assign[k] = number of this business's photos that voted for label k.
    assign = [0]*9
    result = []
    support = .6
    labs = ''

    for array in labels:
        for label in array:
            assign[label] += 1

    # Lower the bar in steps of .05 until at least one label passes.
    # NOTE(review): votes are divided by 9.0 (the number of classes), not by
    # the number of photos of the business - confirm that this is intended.
    while len(result) < 1 and support > 0:
        for i in range(9):
            if assign[i]/9.0 > support:
                result.append(i)
        support = support - .05

    # Space-terminated label list, e.g. "1 2 ".
    for e in result:
        labs += str(e) + " "

    final.append({
            'business_id': business,
            'labels': labs
        })

In [234]:
# Write the per-business predictions to final_biz_pred.csv.
# The column order is spelled out: taking it from final[0].keys() leaves the
# order to the dict implementation and fails when `final` is empty.
keys = ['business_id', 'labels']
# 'wb' is the mode the Python 2 csv module expects; under Python 3 this
# would have to be text mode with newline=''.
with open('final_biz_pred.csv', 'wb') as output_file:
    dict_writer = csv.DictWriter(output_file, keys)
    dict_writer.writeheader()
    dict_writer.writerows(final)