# Baseline nn

# Table of contents

[Data Loading](#data_loading)

[NLP](#nlp)

[Image Processing](#cv)

[Data Prep](#data_separation)

[Create Model](#create_model)

[Training](#training)

In [1]:
import gc
import glob
import os
from os import path
import json
import matplotlib.pyplot as plt
import seaborn as sns
import pprint
import warnings
from joblib import Parallel, delayed
from tqdm import tqdm, tqdm_notebook
import cv2
from PIL import Image
from collections import Counter
from functools import partial
from math import sqrt
import time
from numba import cuda

import numpy as np
import pandas as pd
import scipy as sp
import xgboost as xgb
import lightgbm as lgb

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import cohen_kappa_score, mean_squared_error
from sklearn.metrics import confusion_matrix as sk_cmatrix
from sklearn.model_selection import StratifiedKFold, GroupKFold


from keras.applications.densenet import preprocess_input, DenseNet121
from keras.models import Model
from keras.layers import Dense, GlobalAveragePooling2D, Input, Lambda, AveragePooling1D, \
        concatenate, BatchNormalization, Activation, Dropout, Embedding, Reshape
from keras.callbacks import LearningRateScheduler
from keras.optimizers import *
from keras.initializers import glorot_normal

import keras.backend as K

%matplotlib inline

np.random.seed(seed=1)
warnings.filterwarnings('ignore')

split_char = '/'

img_size = 256
batch_size = 256

Using TensorFlow backend.


<a id='data_loading'></a>
## Data loading

In [2]:
data_path = '../input/petfinder-adoption-prediction/'
train = pd.read_csv(path.join(data_path, 'train/train.csv'))
test = pd.read_csv(path.join(data_path, 'test/test.csv'))
y_trn = train['AdoptionSpeed']
sample_submission = pd.read_csv(path.join(data_path, 'test/sample_submission.csv'))

labels_breed = pd.read_csv(path.join(data_path, 'breed_labels.csv'))
labels_state = pd.read_csv(path.join(data_path, 'color_labels.csv'))
labels_color = pd.read_csv(path.join(data_path, 'state_labels.csv'))

In [3]:
def get_filenames(mode='train'):
    
    image_files = sorted(glob.glob(path.join(data_path, '{}_images/*.jpg'.format(mode))))
    metadata_files = sorted(glob.glob(path.join(data_path, '{}_metadata/*.json'.format(mode))))
    sentiment_files = sorted(glob.glob(path.join(data_path, '{}_sentiment/*.json'.format(mode))))
    
    return image_files, metadata_files, sentiment_files

In [4]:
train_image_files, train_metadata_files, train_sentiment_files = get_filenames('train')
print('num of train images files: {}'.format(len(train_image_files)))
print('num of train metadata files: {}'.format(len(train_metadata_files)))
print('num of train sentiment files: {}'.format(len(train_sentiment_files)))

test_image_files, test_metadata_files, test_sentiment_files = get_filenames('test')
print('num of test images files: {}'.format(len(test_image_files)))
print('num of test metadata files: {}'.format(len(test_metadata_files)))
print('num of test sentiment files: {}'.format(len(test_sentiment_files)))

num of train images files: 58311
num of train metadata files: 58311
num of train sentiment files: 14442
num of test images files: 15040
num of test metadata files: 15040
num of test sentiment files: 3815


In [5]:
csv_X = pd.read_csv('processed_data.csv')
csv_X = csv_X.drop('Unnamed: 0', axis=1)

In [6]:
X_raw = pd.concat([train.drop('AdoptionSpeed', axis=1), test]).reset_index()

In [7]:
for c in csv_X.columns:
    if c.startswith('IMG') or c.startswith('TFIDF'):
        csv_X = csv_X.drop(c, axis=1)

<a id='nlp'></a>
## NLP

In [8]:
tfidf_col = 'Description'

X_raw[tfidf_col] = X_raw[tfidf_col].fillna('none')

# Initialize decomposition methods:
tfv = TfidfVectorizer(min_df=1,  max_features=100,
                      strip_accents='unicode', analyzer='word', token_pattern=r'(?u)\b\w+\b',
                      ngram_range=(1, 3))

tfidf_trn = tfv.fit_transform(X_raw[tfidf_col].values)

In [9]:
text_df = pd.DataFrame(tfidf_trn.toarray()).add_prefix('text_')

In [10]:
X = pd.concat([csv_X, text_df], axis=1)

In [11]:
text_cols = []
for c in X.columns:
    if c.startswith('text_'):
        text_cols.append(c)

<a id='cv'></a>
## Image processing and densenet

In [12]:
def resize_to_square(im):
    old_size = im.shape[:2]
    ratio = float(img_size)/max(old_size)
    new_size = tuple([int(x*ratio) for x in old_size])
    im = cv2.resize(im, (new_size[1], new_size[0]))
    delta_w = img_size - new_size[1]
    delta_h = img_size - new_size[0]
    top, bottom = delta_h//2, delta_h-(delta_h//2)
    left, right = delta_w//2, delta_w-(delta_w//2)
    color = [0, 0, 0]
    new_im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT,value=color)
    return new_im

def load_image_augmented(path):
    image = cv2.imread('{}'.format(path))
    new_image = resize_to_square(image)
    new_image = preprocess_input(new_image)
    return new_image

def load_image(path, pet_id):
    image = cv2.imread('{}{}-1.jpg'.format(path, pet_id))
    new_image = resize_to_square(image)
    new_image = preprocess_input(new_image)
    return new_image

In [13]:
inp = Input((256,256,3))
backbone = DenseNet121(input_tensor = inp, 
                       weights="../input/densenet/DenseNet-BC-121-32-no-top.h5",
                       include_top = False)
x = backbone.output
x = GlobalAveragePooling2D()(x)
x = Lambda(lambda x: K.expand_dims(x,axis = -1))(x)
x = AveragePooling1D(4)(x)
out = Lambda(lambda x: x[:,:,0])(x)

m = Model(inp,out)

In [15]:
n_batches = len(train_image_files) // batch_size + 1

trn_img_ids = []
trn_img_feats = []
for b in tqdm(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size

    batch_train_files = train_image_files[start:end]
    batch_images = np.zeros((batch_size,img_size,img_size,3))
    batch_ids = []
    for i, img_path in enumerate(batch_train_files):
        try:
            batch_images[i] = load_image_augmented(img_path)
            batch_ids.append(img_path.split('/')[-1].split('-')[0])
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_ids):
        trn_img_ids.append(pet_id)
        trn_img_feats.append(batch_preds[i])

100%|██████████| 228/228 [06:59<00:00,  1.84s/it]


In [16]:
train_feats = pd.DataFrame(trn_img_feats)
train_feats.columns = ['pic_{}'.format(str(i)) for i in range(train_feats.shape[1])]
# train_feats['PetID'] = trn_img_ids

In [17]:
train_feats.insert(loc=0, column='PetID', value=trn_img_ids)

In [18]:

n_batches = len(test_image_files) // batch_size + 1

test_img_ids = []
test_img_feats = []
for b in tqdm(range(n_batches)):
    start = b*batch_size
    end = (b+1)*batch_size

    batch_test_files = test_image_files[start:end]
    batch_images = np.zeros((batch_size,img_size,img_size,3))
    batch_ids = []
    for i, img_path in enumerate(batch_test_files):
        try:
            batch_images[i] = load_image(img_path)
            batch_ids.append(img_path.split('/')[-1].split('-')[0])
        except:
            pass
    batch_preds = m.predict(batch_images)
    for i,pet_id in enumerate(batch_ids):
        test_img_ids.append(pet_id)
        test_img_feats.append(batch_preds[i])

100%|██████████| 59/59 [01:03<00:00,  1.08s/it]


In [19]:
# pet_ids = test['PetID'].values
# n_batches = len(pet_ids) // batch_size + 1

# features = {}
# for b in tqdm(range(n_batches)):
#     start = b*batch_size
#     end = (b+1)*batch_size
#     batch_pets = pet_ids[start:end]
#     batch_images = np.zeros((len(batch_pets),img_size,img_size,3))
#     for i,pet_id in enumerate(batch_pets):
#         try:
#             batch_images[i] = load_image("../input/petfinder-adoption-prediction/test_images/", pet_id)
#         except:
#             pass
#     batch_preds = m.predict(batch_images)
#     for i,pet_id in enumerate(batch_pets):
#         features[pet_id] = batch_preds[i]

In [20]:
# cuda.select_device(0)
# cuda.close()
K.clear_session()

In [21]:
test_feats = pd.DataFrame(trn_img_feats)
test_feats.columns = ['pic_{}'.format(str(i)) for i in range(test_feats.shape[1])]

In [23]:
test_feats.insert(loc=0, column='PetID', value=trn_img_ids)

In [None]:
# test_feats = pd.DataFrame.from_dict(features, orient='index')
# test_feats.columns = ['pic_{}'.format(i) for i in range(test_feats.shape[1])]

In [None]:
# train_feats = train_feats.reset_index()
# train_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

# test_feats = test_feats.reset_index()
# test_feats.rename({'index': 'PetID'}, axis='columns', inplace=True)

In [24]:
img_features = pd.concat([train_feats, test_feats], axis=0)

In [25]:
label_df = pd.DataFrame.from_dict(dict({'PetID':train.PetID, 'AdoptionSpeed': y_trn}))

In [26]:
y_trn = train_feats.merge(label_df, on='PetID', how='left')['AdoptionSpeed']

In [27]:
img_cols = []
for c in img_features.columns:
    if c != 'PetID':
        img_cols.append(c)

In [28]:
X = img_features.merge(X.reset_index(drop=True), on='PetID', how='left')

<a id='data_separation'></a>
## Data prep

In [29]:
cat_cols = ['Type', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 
            'Sterilized', 'Health', 'State', 'primary_BreedName', 'secondary_BreedName',
            'Color1', 'Color2', 'Color3']

drop_cols = ['PetID']

num_cols = []

for c in X.columns:
    if c not in cat_cols + img_cols + text_cols + drop_cols:
        num_cols.append(c)
        X[c] = (X[c]-X[c].mean())/X[c].std()

In [30]:
X_trn = X.iloc[:len(train_image_files)]
X_test = X.iloc[len(train_image_files):]

In [31]:
rescuer_ids_augmented = X_trn.merge(train, on='PetID', how='left')['RescuerID']

<a id='create_model'></a>
## Create Model

<a id='training'></a>
## Training

In [32]:
def train_and_test_model(X, y, X_test=None, num_folds=5, epochs=10, batch_size=32, use_group_kfold=True):
    result_dict = {}
    fold_predictions = []
    val_preds = np.zeros((X.shape[0]))
    
    if use_group_kfold:
        folds = GroupKFold(n_splits=num_folds)
        fold_split = folds.split(X, y, rescuer_ids_augmented)
    else:
        folds = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=1)
        fold_split = folds.split(X, y)
    
    if X_test is not None:
        result_dict['averaged_prediction'] =  np.zeros(len(X_test))      
        inp_test = [X_test[text_cols]] + [X_test[img_cols]] + [X_test[num_cols]] + [X_test[col] for col in cat_cols]     
    else:
        result_dict['averaged_prediction'] =  np.zeros(1)
        inp_test = []
        
    result_dict['predictions'] = []
    
    for fold_n, (idx_trn, idx_val) in enumerate(fold_split):

        print('Fold {}\n'.
              format(fold_n + 1))
         
        X_t = X.iloc[idx_trn]
        X_v = X.iloc[idx_val]
        y_t, y_v = y[idx_trn], y[idx_val]

        inp_trn = [X_t[text_cols]] + [X_t[img_cols]] + [X_t[num_cols]] + [X_t[col] for col in cat_cols]
        inp_val = [X_v[text_cols]] + [X_v[img_cols]] + [X_v[num_cols]] + [X_v[col] for col in cat_cols]
        
        
        print('Beginning training...')
        start_time = time.time()
        
        model = create_model(X_trn, len(text_cols), len(img_cols), len(num_cols), cat_cols, dense_dim=256, 
                 dropout=0.5, embed_factor=2, activation='relu')
        
        model.fit(inp_trn, y_t, batch_size=batch_size, epochs=epochs, verbose=2, validation_data=(inp_val, y_v))
        
        y_val_hat = model.predict(inp_val, batch_size=batch_size)
        val_preds[idx_val] = y_val_hat[:,0]
        
        del X_t, X_v, inp_trn, inp_val
        
        if X_test is not None:
            print('Beginning prediction...')
            start_time = time.time()
            y_hat = model.predict(inp_test, batch_size=batch_size)
            result_dict['predictions'].append(y_hat)
            result_dict['averaged_prediction'] += y_hat[:,0]
            print('Prediction took {0:.2f} minutes.'.format((time.time()-start_time)/60))

        gc.collect()

    result_dict['val_preds'] = val_preds
    if X_test is not None:
        result_dict['averaged_prediction'] /= num_folds
        
    return result_dict


In [1]:
def dense_block(x, dense_dim, dropout, activation):
    x = Dense(dense_dim)(x)
    x = BatchNormalization()(x)
    x = Activation(activation)(x)
    x = Dropout(dropout)(x)
    return x

def root_mean_squared_error(y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true), axis=-1))

def create_model(df, text_size, img_size, num_size, categorical_cols, dense_dim, 
                 dropout, embed_factor, activation):
    
    inputs = []
    
    # text 
    text_inp = Input(shape=(text_size,))
    inputs.append(text_inp)
    text_x = dense_block(text_inp, dense_dim, dropout, activation)
    
    # img
    img_inp = Input(shape=(img_size,))
    inputs.append(img_inp)
    img_x = dense_block(img_inp, dense_dim, dropout, activation)
#     img_x = dense_block(img_x, dense_dim, dropout, activation)

    
    # numerical
    num_inp = Input(shape=(num_size,))
    inputs.append(num_inp)
    num_x = dense_block(num_inp, dense_dim, dropout, activation)
    
    # categorical
    cat_in = []
    cat_out = []
    for col in categorical_cols:
        cat_inp = Input((1,))
        cat_in.append(cat_inp)
        num_unique = df[col].nunique()
        emb_size = max(num_unique//embed_factor, 2)
        cat_emb = Embedding(num_unique, emb_size)(cat_inp)
        cat_emb = Reshape(target_shape=(emb_size,))(cat_emb)
        cat_out.append(cat_emb)
    inputs += cat_in
    cat_x = concatenate(cat_out)
    cat_x = dense_block(cat_x, dense_dim, dropout, activation)

    
    # merge
#     x = concatenate([text_x, img_x, cat_x])
    x = img_x
    x = dense_block(x, dense_dim, dropout, activation)
    
#     x = Dense(5, activation='sigmoid')(x)
#     model = Model(inputs=inputs, outputs=x)  
#     model.compile(optimizer=RMSprop(lr=0.0002), loss='categorical_crossentropy', metrics=['accuracy'])
    
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inputs, outputs=x)
    model.compile(optimizer=RMSprop(lr=0.0005), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [34]:
y_bin = np.zeros(y_trn.shape)
y_bin[y_trn == 4] = 1

In [35]:
X_0 = X_trn[y_bin==0][:13934]
X_1 = X_trn[y_bin==1]

In [36]:
y_labs = np.zeros(13934*2)
y_labs [13934:] = 1

In [37]:
res = train_and_test_model(X_trn, y_bin, X_test=X_test, num_folds=5, epochs=5, batch_size=64, use_group_kfold=True)

Fold 1

Beginning training...
Train on 46648 samples, validate on 11663 samples
Epoch 1/5
 - 3s - loss: 0.5779 - acc: 0.7401 - val_loss: 0.5437 - val_acc: 0.7411
Epoch 2/5
 - 3s - loss: 0.5258 - acc: 0.7620 - val_loss: 0.5360 - val_acc: 0.7407
Epoch 3/5
 - 3s - loss: 0.5099 - acc: 0.7687 - val_loss: 0.5381 - val_acc: 0.7432
Epoch 4/5
 - 2s - loss: 0.5036 - acc: 0.7701 - val_loss: 0.5368 - val_acc: 0.7447
Epoch 5/5
 - 2s - loss: 0.4994 - acc: 0.7708 - val_loss: 0.5610 - val_acc: 0.7432
Beginning prediction...
Prediction took 0.01 minutes.
Fold 2

Beginning training...
Train on 46649 samples, validate on 11662 samples
Epoch 1/5
 - 3s - loss: 0.5923 - acc: 0.7212 - val_loss: 0.5014 - val_acc: 0.7894
Epoch 2/5
 - 2s - loss: 0.5356 - acc: 0.7509 - val_loss: 0.5004 - val_acc: 0.7899
Epoch 3/5
 - 2s - loss: 0.5196 - acc: 0.7594 - val_loss: 0.5158 - val_acc: 0.7722
Epoch 4/5
 - 2s - loss: 0.5129 - acc: 0.7606 - val_loss: 0.5108 - val_acc: 0.7703
Epoch 5/5
 - 2s - loss: 0.5078 - acc: 0.7615 - v

In [3]:
def plot_pred(pred):
    sns.distplot(pred, kde=True, hist_kws={'range': [0, 5]})

In [4]:
plot_pred(res['averaged_prediction'])

NameError: name 'res' is not defined

In [2]:
plot_pred(res['val_preds'])

NameError: name 'plot_pred' is not defined

In [46]:
submission = pd.DataFrame({'PetID': X_test['PetID'].values, 'AdoptionSpeed_nn': res['averaged_prediction']})
submission.to_csv('nn_test.csv', index=False)
submission.head()

Unnamed: 0,AdoptionSpeed_nn,PetID
0,0.181397,0008c5398
1,0.305387,0008c5398
2,0.302267,0008c5398
3,0.242858,0008c5398
4,0.267257,0008c5398


In [47]:
submission = pd.DataFrame({'PetID': X_trn['PetID'].values, 'AdoptionSpeed_nn': res['val_preds']})
submission.to_csv('nn_train.csv', index=False)
submission.head()

Unnamed: 0,AdoptionSpeed_nn,PetID
0,0.226484,0008c5398
1,0.337088,0008c5398
2,0.355109,0008c5398
3,0.258567,0008c5398
4,0.304724,0008c5398
