Keras + CV

Thanks @anokas for the starter code at https://www.kaggle.com/anokas/planet-understanding-the-amazon-from-space/simple-keras-starter/

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import gc

import keras as k
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
from keras.callbacks import EarlyStopping, ModelCheckpoint

import cv2
from tqdm import tqdm

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import KFold
from sklearn.metrics import fbeta_score
import time

Using TensorFlow backend.


Pre-processing the train and test data

In [2]:
x_train = []
x_test = []
y_train = []

df_train = pd.read_csv('./input/train_v2.csv')
df_test = pd.read_csv('./input/sample_submission_v2.csv')

flatten = lambda l: [item for sublist in l for item in sublist]
labels = list(set(flatten([l.split(' ') for l in df_train['tags'].values])))

labels = ['blow_down',
 'bare_ground',
 'conventional_mine',
 'blooming',
 'cultivation',
 'artisinal_mine',
 'haze',
 'primary',
 'slash_burn',
 'habitation',
 'clear',
 'road',
 'selective_logging',
 'partly_cloudy',
 'agriculture',
 'water',
 'cloudy']

label_map = {'agriculture': 14,
 'artisinal_mine': 5,
 'bare_ground': 1,
 'blooming': 3,
 'blow_down': 0,
 'clear': 10,
 'cloudy': 16,
 'conventional_mine': 2,
 'cultivation': 4,
 'habitation': 9,
 'haze': 6,
 'partly_cloudy': 13,
 'primary': 7,
 'road': 11,
 'selective_logging': 12,
 'slash_burn': 8,
 'water': 15}

for f, tags in tqdm(df_train.values, miniters=1000):
    img = cv2.imread('./input/train-jpg/{}.jpg'.format(f))
    targets = np.zeros(17)
    for t in tags.split(' '):
        targets[label_map[t]] = 1 
    x_train.append(cv2.resize(img, (128, 128)))
    #x_train.append(img)
    y_train.append(targets)

for f, tags in tqdm(df_test.values, miniters=1000):
    img = cv2.imread('./input/test-jpg/{}.jpg'.format(f))
    x_test.append(cv2.resize(img, (128, 128)))
    #x_test.append(img)
    
y_train = np.array(y_train, np.uint8)
x_train = np.array(x_train, np.float32) / 255.
x_test  = np.array(x_test, np.float32) / 255.

print(x_train.shape)
print(y_train.shape)

100%|██████████| 40479/40479 [04:11<00:00, 161.23it/s]
100%|██████████| 61191/61191 [06:24<00:00, 158.94it/s]


(40479, 128, 128, 3)
(40479, 17)


Transpose the data if use Theano

In [None]:
#x_train = x_train.transpose((0, 3, 1, 2))
#x_test = x_test.transpose((0, 3, 1, 2))

Create n-folds cross-validation

In [4]:
# https://www.kaggle.com/c/planet-understanding-the-amazon-from-space/discussion/32475
import numpy as np
from sklearn.metrics import fbeta_score

def optimise_f2_thresholds(y, p, verbose=True, resolution=100):
  def mf(x):
    p2 = np.zeros_like(p)
    for i in range(17):
      p2[:, i] = (p[:, i] > x[i]).astype(np.int)
    score = fbeta_score(y, p2, beta=2, average='samples')
    return score

  x = [0.2]*17
  for i in range(17):
    best_i2 = 0
    best_score = 0
    for i2 in range(resolution):
      i2 /= resolution
      x[i] = i2
      score = mf(x)
      if score > best_score:
        best_i2 = i2
        best_score = score
    x[i] = best_i2
    if verbose:
      print(i, best_i2, best_score)

  return x

In [5]:
from keras.layers.normalization import BatchNormalization

nfolds = 3

num_fold = 0
sum_score = 0

yfull_test = []
yfull_train =[]

kf = KFold(len(y_train), n_folds=nfolds, shuffle=True, random_state=1)

for train_index, test_index in kf:
        start_time_model_fitting = time.time()
        
        X_train = x_train[train_index]
        Y_train = y_train[train_index]
        X_valid = x_train[test_index]
        Y_valid = y_train[test_index]

        num_fold += 1
        print('Start KFold number {} from {}'.format(num_fold, nfolds))
        print('Split train: ', len(X_train), len(Y_train))
        print('Split valid: ', len(X_valid), len(Y_valid))
        
        kfold_weights_path = os.path.join('', 'weights_kfold_' + str(num_fold) + '.h5')
        
        model = Sequential()
        model.add(BatchNormalization(input_shape=(128, 128, 3)))
        model.add(Conv2D(8, 1, 1, activation='relu'))
        model.add(Conv2D(16, 2, 2, activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Conv2D(32, 3, 3, activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Conv2D(64, 3, 3, activation='relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))
        model.add(Flatten())
        model.add(Dense(256, activation='relu'))
        model.add(Dropout(0.5))
        model.add(Dense(17, activation='sigmoid'))

        model.compile(loss='binary_crossentropy', 
                      optimizer='adam',
                      metrics=['accuracy'])
        callbacks = [
            EarlyStopping(monitor='val_loss', patience=2, verbose=0),
            ModelCheckpoint(kfold_weights_path, monitor='val_loss', save_best_only=True, verbose=0)]
        
        model.fit(x = X_train, y= Y_train, validation_data=(X_valid, Y_valid),
                  batch_size=128,verbose=2, nb_epoch=10,callbacks=callbacks,
                  shuffle=True)
        
        if os.path.isfile(kfold_weights_path):
            model.load_weights(kfold_weights_path)
        
        p_valid = model.predict(X_valid, batch_size = 128, verbose=2)
        print(fbeta_score(Y_valid, np.array(p_valid) > 0.2, beta=2, average='samples'))
        print("Optimizing prediction threshold")
        print(optimise_f2_thresholds(Y_valid, p_valid))
        
        p_test = model.predict(x_train, batch_size = 128, verbose=2)
        yfull_train.append(p_test)
        
        p_test = model.predict(x_test, batch_size = 128, verbose=2)
        yfull_test.append(p_test)

Start KFold number 1 from 3
Split train:  26986 26986
Split valid: 



 13493 13493
Train on 26986 samples, validate on 13493 samples
Epoch 1/10
1166s - loss: 0.2293 - acc: 0.9119 - val_loss: 0.2117 - val_acc: 0.9155
Epoch 2/10
1465s - loss: 0.1852 - acc: 0.9281 - val_loss: 0.1774 - val_acc: 0.9344
Epoch 3/10
1489s - loss: 0.1689 - acc: 0.9340 - val_loss: 0.1616 - val_acc: 0.9384
Epoch 4/10
1659s - loss: 0.1611 - acc: 0.9369 - val_loss: 0.1555 - val_acc: 0.9399
Epoch 5/10
952s - loss: 0.1543 - acc: 0.9393 - val_loss: 0.1474 - val_acc: 0.9426
Epoch 6/10
1526s - loss: 0.1488 - acc: 0.9412 - val_loss: 0.1426 - val_acc: 0.9452
Epoch 7/10


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.


KeyboardInterrupt



Averaging the prediction from each fold

In [6]:
result = np.array(yfull_test[0])
for i in range(1, nfolds):
    result += np.array(yfull_test[i])
result /= nfolds
result = pd.DataFrame(result, columns = labels)
result

Unnamed: 0,blow_down,bare_ground,conventional_mine,blooming,cultivation,artisinal_mine,haze,primary,slash_burn,habitation,clear,road,selective_logging,partly_cloudy,agriculture,water,cloudy
0,0.001773,0.001360,0.000038,1.091260e-02,0.016153,3.877658e-05,5.093649e-03,0.998899,0.000877,0.002191,0.984159,0.011773,2.176377e-03,0.004500,0.030583,0.049904,0.000156
1,0.005200,0.001035,0.000044,2.940394e-02,0.032945,5.475922e-05,8.360219e-04,0.999904,0.001810,0.003418,0.988311,0.018059,9.030221e-03,0.011917,0.041563,0.045540,0.000019
2,0.000002,0.000015,0.000001,1.789377e-08,0.008752,3.258021e-07,1.053482e-07,0.999671,0.000005,0.000291,0.000013,0.009279,4.498758e-07,0.999426,0.039818,0.054980,0.000225
3,0.006998,0.002011,0.000106,3.282828e-02,0.058456,1.331107e-04,2.137376e-03,0.999705,0.003000,0.005868,0.944941,0.027484,1.233116e-02,0.049378,0.065877,0.070678,0.000080
4,0.000006,0.000179,0.000019,3.483859e-07,0.009123,6.728590e-06,1.497422e-05,0.762517,0.000036,0.004541,0.000051,0.049099,6.591720e-06,0.917937,0.070855,0.076563,0.180771
5,0.003288,0.000731,0.000046,2.301529e-02,0.013294,4.653619e-05,1.029586e-03,0.999849,0.000931,0.002100,0.994472,0.011885,5.417985e-03,0.003076,0.019293,0.030594,0.000030
6,0.007180,0.049562,0.004429,6.709935e-03,0.312713,6.098468e-03,1.147727e-01,0.987759,0.035893,0.167044,0.504322,0.321036,1.715825e-02,0.382171,0.609129,0.350465,0.004770
7,0.000018,0.085634,0.011658,7.877713e-06,0.017283,4.238990e-02,1.751125e-02,0.759627,0.000126,0.875116,0.982613,0.931477,1.604452e-03,0.004202,0.319830,0.200870,0.000225
8,0.002746,0.000884,0.000056,1.884582e-02,0.013950,4.799864e-05,1.927305e-03,0.999725,0.000782,0.002249,0.990434,0.012182,4.148026e-03,0.003872,0.021178,0.035089,0.000056
9,0.000384,0.004538,0.000281,4.411438e-04,0.048355,1.333214e-04,7.266077e-01,0.941314,0.001051,0.014633,0.196270,0.067612,4.988707e-04,0.024913,0.144530,0.122552,0.038408


Output prediction for submission

In [7]:
from tqdm import tqdm
#thres = [0.07, 0.17, 0.2, 0.04, 0.23, 0.33, 0.24, 0.22, 0.1, 0.19, 0.23, 0.24, 0.12, 0.14, 0.25, 0.26, 0.16]
thres = [0.0475, 0.2225, 0.0875, 0.19, 0.265, 0.1375, 0.1925, 0.2625, 0.085, 0.2175, 0.2375, 0.21, 0.14, 0.1625, 0.245, 0.205, 0.12]
preds = []
for i in tqdm(range(result.shape[0]), miniters=1000):
    a = result.ix[[i]]
    a = a.apply(lambda x: x > thres, axis=1)
    a = a.transpose()
    a = a.loc[a[i] == True]
    ' '.join(list(a.index))
    preds.append(' '.join(list(a.index)))

100%|██████████| 61191/61191 [02:00<00:00, 506.05it/s]


In [8]:
df_test['tags'] = preds
df_test

Unnamed: 0,image_name,tags
0,test_0,primary clear
1,test_1,primary clear
2,test_2,primary partly_cloudy
3,test_3,primary clear
4,test_4,primary partly_cloudy cloudy
5,test_5,primary clear
6,test_6,cultivation primary clear road partly_cloudy a...
7,test_7,primary habitation clear road agriculture
8,test_8,primary clear
9,test_9,haze primary


In [9]:
df_test.to_csv('submission_2.csv', index=False)