Bag of Colors
=================

In [None]:
from multiprocessing import Pool, cpu_count
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import fbeta_score
import pandas as pd
import numpy as np
import glob, cv2

def get_features(path):
    """Build a flat feature vector for the image stored at ``path``.

    The vector is the concatenation, in this order, of:

    * the image resized to 20x20 and flattened,
    * the mean returned by ``cv2.meanStdDev`` for the full-size image,
    * the standard deviation returned by ``cv2.meanStdDev``,
    * a 256-bin histogram of the image re-read with imread flag ``0``.

    Args:
        path: Filesystem path of the image to read.

    Returns:
        A two-element list ``[path, features]`` where ``features`` is a
        1-D NumPy array.

    Raises:
        IOError: If OpenCV cannot read the image at ``path``.
    """
    img = cv2.imread(path)
    gray = cv2.imread(path, 0)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with the offending path rather than deep inside OpenCV.
    if img is None or gray is None:
        raise IOError('could not read image: ' + str(path))

    hist = cv2.calcHist([gray], [0], None, [256], [0, 256])
    m, s = cv2.meanStdDev(img)
    # The third positional parameter of cv2.resize is ``dst``, so the
    # interpolation flag has to be passed by keyword.
    small = cv2.resize(img, (20, 20), interpolation=cv2.INTER_LINEAR)

    features = np.concatenate(
        [small.flatten(), m.flatten(), s.flatten(), hist.flatten()]
    )
    return [path, features]

def normalize_img(paths, dtype=np.uint8):
    """Extract features for every image path in parallel and stack them.

    Args:
        paths: Iterable of image paths; each one is handed to
            ``get_features`` in a worker process.
        dtype: NumPy dtype of the returned matrix. Defaults to ``np.uint8``
            to keep the historical behaviour.
            NOTE(review): ``get_features`` appends mean/std values and
            histogram counts to the pixel data; those are not guaranteed to
            fit in uint8, so a wider dtype (e.g. ``np.float32``) may be the
            better choice -- confirm before changing the default.

    Returns:
        A 2-D array with one row per entry of ``paths``, in the same order.
    """
    # The context manager shuts the worker pool down once the map is done;
    # without it every call leaves cpu_count() processes behind.
    with Pool(cpu_count()) as pool:
        results = pool.map(get_features, paths)

    # Pool.map returns results in input order, so no path lookup is needed.
    rows = [features for _, features in results]
    return np.array(rows, dtype=dtype)

# Root of the input data: train.csv plus the train-jpg/ and test-jpg/ folders.
in_path = '../input/'
train = pd.read_csv(in_path + 'train.csv')
train['path'] = train['image_name'].map(lambda x: in_path + 'train-jpg/' + x + '.jpg')
# 'tags' is a space-separated string; expand it to one 0/1 column per tag.
y = train['tags'].str.get_dummies(sep=' ')
xtrain = normalize_img(train['path'])
print('train...')

test_jpg = glob.glob(in_path + 'test-jpg/*')
# Use the last path component as the image name so this does not depend on
# how many directories in_path contains. Passing columns= to the constructor
# also keeps an empty glob result from failing on the column assignment.
test = pd.DataFrame(
    [[p.split('/')[-1].replace('.jpg', ''), p] for p in test_jpg],
    columns=['image_name', 'path'],
)
xtest = normalize_img(test['path'])
print('test...')

etr = ExtraTreesRegressor(n_estimators=18, max_depth=12, n_jobs=-1, random_state=1)
etr.fit(xtrain, y)
print('fit...')

# A tag is emitted when its predicted score is strictly above this value.
threshold = 0.24

# Sanity check only: this F2 score is computed on the data the model was
# fitted on, so it is an optimistic estimate.
train_pred = np.where(etr.predict(xtrain) > threshold, 1.0, 0.0)
print(fbeta_score(y, train_pred, beta=2, average='samples'))

pred = etr.predict(xtest)
print('predict...')

# For every test image keep the tags above the threshold, ordered from the
# highest score to the lowest.
tags = []
for r in pred:
    scored = sorted(
        ([score, name] for score, name in zip(list(r), y.columns) if score > threshold),
        reverse=True,
    )
    tags.append(' '.join(name for _, name in scored))

test['tags'] = tags
test[['image_name', 'tags']].to_csv('submission_boc_01.csv', index=False)
test.head()

In [None]:
# Mutually exclusive tags, taken from the co-occurrence matrix in the following notebook:
# https://github.com/planetlabs/planet-amazon-deforestation/blob/master/planet_chip_examples.ipynb

def me_clean(row):
    """Remove conflicting tags from a space-separated tag string.

    Some tags cannot apply to the same image. For each group of mutually
    exclusive tags, only the member that appears first in ``row`` is kept;
    all other members of that group are dropped. Tags that belong to no
    group, and groups with at most one member present, are left untouched.

    The position in ``row`` is the priority signal, so callers should pass
    tags ordered from most to least trusted.

    Args:
        row: Tags separated by single spaces.

    Returns:
        The filtered tags joined by single spaces, in their original order.
    """
    tags = row.split(' ')
    me_list = [
        ['artisinal_mine', 'conventional_mine', 'blow_down'],
        ['clear', 'partly_cloudy', 'cloudy', 'haze'],
    ]
    for group in me_list:
        # Members of this group, in the order they occur in the row.
        present = [t for t in tags if t in group]
        if len(present) > 1:
            # Rank by position in the row, not by tag name: the earliest
            # occurrence wins and the rest of the group is discarded.
            keep = present[0]
            tags = [t for t in tags if t == keep or t not in group]
    return ' '.join(tags)

# Strip mutually exclusive tags from every prediction, then write the
# cleaned result out as a second submission file.
test['tags'] = test['tags'].map(me_clean)
test.to_csv('submission_boc_02.csv', columns=['image_name', 'tags'], index=False)
test.head()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Sweep the decision threshold from 0.00 up to (not including) 0.90 in steps
# of 0.01 and record the sample-averaged F2 score on the training set.
train_predx = etr.predict(xtrain)
th = []
for i in np.arange(0.0, 0.9, 0.01):
    # Binarize the raw scores: 1 where strictly above the threshold, else 0.
    train_pred = np.where(train_predx > i, 1.0, 0.0)
    f2 = fbeta_score(y, train_pred, beta=2, average='samples')
    th.append([i, f2])

# Plot F2 against the threshold to see where the score peaks.
_ = pd.DataFrame(th, columns=['th', 'f2_score']).plot(kind='line', x='th', y='f2_score')