In [1]:
import numpy as np
import pandas as pd
import glob
import os
import re
import pickle
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder
from sklearn import cross_validation
from sklearn.model_selection import KFold
import xgboost as xgb
from scipy.stats import gmean

import tensorflow as tf
import tensorflow.python.platform
from tensorflow.python.platform import gfile


model_dir = 'data/precomputed/xgb/'
images_dir = 'data/train_full/'
list_images = glob.glob('{}/*/*g'.format(images_dir))



In [2]:
# setup tensorFlow graph initiation
def create_graph():
    with gfile.FastGFile(os.path.join(model_dir, 'inceptionv3.pb'), 'rb') as f:
        graph_def = tf.GraphDef()
        graph_def.ParseFromString(f.read())
        _ = tf.import_graph_def(graph_def, name='')

In [3]:
features = pickle.load(open('data/precomputed/xgb/features', 'rb'))
labels = pickle.load(open('data/precomputed/xgb/labels', 'rb'))

In [4]:
le = LabelEncoder()

labels = le.fit_transform(labels)

In [5]:
def train_xgboost():
    x = features
    y = labels
    
    clfs = []
    
    num_round = 2000
    param = {'max_depth': 6,
             'n_estimators': 1500,
             'min_child_weight': 100,
             'learning_rate': 0.04,
             'num_class': 8,
             'objective': 'multi:softprob',
             'eval_metric': 'mlogloss',
             'nthread': 8,
             'subsample': 0.9,
             'colsample_bytree': 0.9,
             'early_stopping_rounds': 20,
             'silent': 1,
             'seed': 42}
    
    kf = KFold(n_splits=5, random_state=2017, shuffle=True)
    for train_index, test_index in kf.split(x, y):
        trn_x, val_x = x[train_index,:], x[test_index,:]
        trn_y, val_y = y[train_index], y[test_index]
        
        d_train = xgb.DMatrix(trn_x, label=trn_y)
        d_val = xgb.DMatrix(val_x, label=val_y)
        watchlist  = [(d_val,'eval'), (d_train,'train')]
        
        clf = xgb.train(param, d_train, num_round, watchlist, verbose_eval=499)
        clfs.append(clf)

    return clfs

In [6]:
clfs = train_xgboost()

[0]	eval-mlogloss:2.03809	train-mlogloss:2.03614
[500]	eval-mlogloss:0.639979	train-mlogloss:0.418838
[1000]	eval-mlogloss:0.559717	train-mlogloss:0.307672
[1500]	eval-mlogloss:0.538929	train-mlogloss:0.277124
[0]	eval-mlogloss:2.04008	train-mlogloss:2.03727
[500]	eval-mlogloss:0.689815	train-mlogloss:0.424566
[1000]	eval-mlogloss:0.600729	train-mlogloss:0.311414
[1500]	eval-mlogloss:0.578866	train-mlogloss:0.280422
[0]	eval-mlogloss:2.04025	train-mlogloss:2.03618
[500]	eval-mlogloss:0.688279	train-mlogloss:0.419741
[1000]	eval-mlogloss:0.604386	train-mlogloss:0.311304
[1500]	eval-mlogloss:0.580504	train-mlogloss:0.280706
[0]	eval-mlogloss:2.03887	train-mlogloss:2.03678
[500]	eval-mlogloss:0.669617	train-mlogloss:0.418214
[1000]	eval-mlogloss:0.589371	train-mlogloss:0.308541
[1500]	eval-mlogloss:0.568377	train-mlogloss:0.278243
[0]	eval-mlogloss:2.03632	train-mlogloss:2.03509
[500]	eval-mlogloss:0.624465	train-mlogloss:0.424083
[1000]	eval-mlogloss:0.540578	train-mlogloss:0.311468
[150

In [7]:
test_dir = 'data/test/test_stg1/'
test_images = [test_dir+f for f in os.listdir(test_dir) if re.search('jpg|JPG', f)]

In [8]:
def extract_tst_feat(list_images):
    nb_features = 2048
    features = np.empty((len(list_images), nb_features))
    create_graph()
    with tf.Session() as sess:
        next_to_last_tensor = sess.graph.get_tensor_by_name('pool_3:0')
        
        for ind, image in enumerate(list_images):
            if not gfile.Exists(image):
                tf.logging.fatal('File does not exist %s', image)
            image_data = gfile.FastGFile(image, 'rb').read()
            predictions = sess.run(next_to_last_tensor,
            {'DecodeJpeg/contents:0': image_data})
            features[ind,:] = np.squeeze(predictions)
            
        return features

In [9]:
def submit():
    features_test = xgb.DMatrix(extract_tst_feat(test_images))
    
    preds = []
    for clf in clfs:
        preds.append(np.clip(clf.predict(features_test, ntree_limit=clf.best_ntree_limit), 0.001, 1))
        
    y_pred = gmean(np.array(preds), axis=0)
    
    image_id = [i.split('/')[3] for i in test_images]

    submit = open('submissions/xgb/-LB_A.csv','w')
    submit.write('image,ALB,BET,DOL,LAG,NoF,OTHER,SHARK,YFT\n')

    for idx, id_n in enumerate(image_id):
        probs = ['%s' % p for p in list(y_pred[idx, :])]
        submit.write('%s,%s\n' % (str(image_id[idx]),','.join(probs)))

    submit.close()

In [10]:
submit()