### Kaggle - Porto Seguro’s Safe Driver Prediction

# Keras Neural Network model

**v1.3.8 Embedding, Upsampling, Stratified K-Fold Validation**

by Ming Chiu and Louis Yang, 2017

The data is taken from the Kaggle Competition - Porto Seguro https://www.kaggle.com/c/porto-seguro-safe-driver-prediction

Original filename: `model3-nn_01_3_8_8folds_25epochs`

#### Statistics
Runtime: 1hr 4min on Intel Core i5-750  
Memory: 4+ GB

In [1]:
# Base name of this model; it is embedded in the file name of the saved test-set predictions
model_name = 'model_nn_keras'

In [2]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from tensorflow import set_random_seed

from sklearn.model_selection import StratifiedKFold

from keras.models import Model
from keras.layers import Dense, Activation, Merge, Reshape, Dropout, concatenate, Input, Flatten
from keras.layers.embeddings import Embedding

Using TensorFlow backend.


### Gini scoring function
gini scoring function from kernel at: https://www.kaggle.com/tezdhar/faster-gini-calculation

In [5]:
def ginic(actual, pred):
    """Return the (un-normalized) Gini coefficient of `pred` with respect to `actual`.

    `actual` is re-ordered by ascending `pred` and accumulated; the score is derived
    from the area under that cumulative curve.

    Parameters
    ----------
    actual : numpy array of true labels (it is indexed with an integer array, so a
        plain ndarray is expected; callers in this notebook pass `.values`).
    pred : numpy array of predicted scores, same length as `actual`.

    Returns
    -------
    float
    """
    count = len(actual)
    order = np.argsort(pred)               # positions of predictions, lowest score first
    cum_actual = np.cumsum(actual[order])  # running total of labels in that order
    total = cum_actual[-1]                 # last cumulative value == sum of all labels
    area_term = cum_actual.sum() / total - (count + 1) / 2.0
    return area_term / count

def gini_normalizedc(actual, pred):
    """Return the Gini of `pred` divided by the Gini of a perfect ordering.

    The denominator is `ginic(actual, actual)`, i.e. the score obtained when the
    labels themselves are used as the ranking.
    """
    model_gini = ginic(actual, pred)
    perfect_gini = ginic(actual, actual)
    return model_gini / perfect_gini

## Data Loading & Preprocessing

In [6]:
# Read the competition train and test CSV files into DataFrames
df_train, df_test = map(pd.read_csv, ('../input/train.csv', '../input/test.csv'))

In [7]:
# Preview the first rows of the training frame (includes the `id` and `target` columns)
df_train.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,7,0,2,2,5,1,0,0,1,0,...,9,1,5,8,0,1,1,0,0,1
1,9,0,1,1,7,0,0,0,0,1,...,3,1,1,9,0,1,1,0,1,0
2,13,0,5,4,9,1,0,0,0,1,...,4,2,7,7,0,1,1,0,1,0
3,16,0,0,1,2,0,0,1,0,0,...,2,2,4,9,0,0,0,0,0,0
4,17,0,0,2,0,1,0,1,0,0,...,3,1,1,3,0,0,0,1,1,0


In [8]:
# Preview the first rows of the test frame (has `id` but no `target` column)
df_test.head()

Unnamed: 0,id,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
0,0,0,1,8,1,0,0,1,0,0,...,1,1,1,12,0,1,1,0,0,1
1,1,4,2,5,1,0,0,0,0,1,...,2,0,3,10,0,0,1,1,0,1
2,2,5,1,3,0,0,0,0,0,1,...,4,0,2,4,0,0,0,0,0,0
3,3,0,1,6,0,0,1,0,0,0,...,5,1,0,5,1,0,1,0,0,0
4,4,5,1,7,0,0,0,0,0,1,...,4,0,0,4,0,1,1,0,0,1


In [9]:
# Split features from identifiers/labels by column *name* rather than by position,
# so a change in column order can never leak `target` or `id` into the feature matrix.
X_train = df_train.drop(['id', 'target'], axis=1)
y_train = df_train['target']
X_test = df_test.drop(['id'], axis=1)  # The test set doesn't have a 'target' field

### Preprocessing Data

Drop all `'calc'` columns

In [10]:
# Keep every feature whose name does not start with the 'ps_calc_' prefix
cols_use = list(filter(lambda c: not c.startswith('ps_calc_'), X_train.columns))

In [11]:
# Restrict both feature frames to the retained columns (same columns, same order)
X_train = X_train.loc[:, cols_use]
X_test = X_test.loc[:, cols_use]

Find categorical features. 

In [12]:
# Map each '*_cat' column name to the list of distinct values it takes in the training set
col_vals_dict = dict(
    (c, list(X_train[c].unique()))
    for c in X_train.columns
    if c.endswith('_cat')
)

In [13]:
# Categorical columns with more than two distinct values get an embedding layer
embed_cols = [c for c in col_vals_dict if len(col_vals_dict[c]) > 2]

# Look at the value counts to decide on the embedding dimensions
for c in embed_cols:
    print(c + ': %d values' % len(col_vals_dict[c]))

ps_ind_02_cat: 5 values
ps_ind_04_cat: 3 values
ps_ind_05_cat: 8 values
ps_car_01_cat: 13 values
ps_car_02_cat: 3 values
ps_car_03_cat: 3 values
ps_car_04_cat: 10 values
ps_car_05_cat: 3 values
ps_car_06_cat: 18 values
ps_car_07_cat: 3 values
ps_car_09_cat: 6 values
ps_car_10_cat: 3 values
ps_car_11_cat: 104 values


Convert data to list format to match the network structure

In [14]:
def preproc(X_train, X_val, X_test, embed_columns=None):
    """Convert the three feature frames into the list-of-arrays format the network expects.

    For every embedded column one integer-coded array is produced; all remaining
    columns are packed together into a single 2-D array that is appended last.

    Parameters
    ----------
    X_train, X_val, X_test : pandas.DataFrame
        Feature frames sharing the same columns.
    embed_columns : list of str, optional
        Columns to integer-code for embedding. Defaults to the notebook-level
        `embed_cols` list, which preserves the original call signature.

    Returns
    -------
    (input_list_train, input_list_val, input_list_test) : tuple of lists of numpy arrays
        One array per embedded column (in `embed_columns` order) followed by one
        array holding the remaining columns.
    """
    if embed_columns is None:
        embed_columns = embed_cols

    input_list_train = []
    input_list_val = []
    input_list_test = []

    # The cols to be embedded: rescaling to range [0, # values)
    for c in embed_columns:
        # The code book is built from the training frame only (sorted unique values)
        raw_vals = np.unique(X_train[c])
        val_map = {raw: code for code, raw in enumerate(raw_vals)}
        input_list_train.append(X_train[c].map(val_map).values)
        # Values absent from the training frame map to NaN; they are sent to code 0
        input_list_val.append(X_val[c].map(val_map).fillna(0).values)
        input_list_test.append(X_test[c].map(val_map).fillna(0).values)

    # The rest of the columns
    other_cols = [c for c in X_train.columns if c not in embed_columns]
    input_list_train.append(X_train[other_cols].values)
    input_list_val.append(X_val[other_cols].values)
    input_list_test.append(X_test[other_cols].values)

    return input_list_train, input_list_val, input_list_test

## Neural Network

Look at the input dimensions to determine the embedding output dimensions

In [15]:
# Number of distinct values of each embedded column, in `embed_cols` order
embedding_input_dim_list = list(map(lambda c: len(col_vals_dict[c]), embed_cols))
embedding_input_dim_list

[5, 3, 8, 13, 3, 3, 10, 3, 18, 3, 6, 3, 104]

In [16]:
# One embedding output size per entry of `embedding_input_dim_list` (same order)
embedding_output_dim_list = [3,2,5,7,2,2,5,2,8,2,3,2,10]  # Set the embedding output dimensions here
# Guard against the two lists getting out of sync when the sizes are edited by hand
assert len(embedding_output_dim_list) == len(embedding_input_dim_list)

In [17]:
# Total embedding output dimension (sum over all embedded columns)
np.array(embedding_output_dim_list).sum()

53

Keras Neural Network

In [18]:
def build_embedding_network(loss='binary_crossentropy', n_rest_features=24):
    """Build and compile the embedding network.

    Each column in `embed_cols` gets its own one-value input, an Embedding layer
    sized by `embedding_input_dim_list` / `embedding_output_dim_list`, and a Flatten.
    The remaining features enter through a single input named 'rest' followed by a
    Dense(32) layer. All branches are concatenated and fed through three
    Dense+Dropout stages and a final sigmoid unit.

    Parameters
    ----------
    loss : str
        Loss passed to `model.compile`. When it is not 'binary_crossentropy',
        binary cross-entropy is additionally tracked as a metric.
    n_rest_features : int
        Width of the 'rest' input, i.e. the number of non-embedded feature
        columns. Defaults to 24, the value previously hard-coded here.

    Returns
    -------
    A compiled Keras `Model` whose inputs are ordered as: one per embedded
    column (in `embed_cols` order), then 'rest'.
    """
    input_layers = []    # every Input, in the order the model expects its data
    concate_layers = []  # the branch outputs that get concatenated

    for i, c in enumerate(embed_cols):
        cat_input = Input(shape=(1,), name=c)
        input_layers.append(cat_input)
        embedded = Embedding(embedding_input_dim_list[i],
                             embedding_output_dim_list[i])(cat_input)
        concate_layers.append(Flatten()(embedded))

    rest_input = Input(shape=(n_rest_features,), name='rest')
    input_layers.append(rest_input)
    concate_layers.append(Dense(32)(rest_input))

    layer = concatenate(concate_layers)

    # Hidden stack: (units, dropout rate) per stage
    for units, drop_rate in ((80, .35), (30, .15), (20, .15)):
        layer = Dense(units, activation='relu')(layer)
        layer = Dropout(drop_rate)(layer)
    layer = Dense(1, activation='sigmoid')(layer)

    model = Model(inputs=input_layers, outputs=layer)
    compile_kwargs = {'loss': loss, 'optimizer': 'adam'}
    if loss != 'binary_crossentropy':
        # Keep cross-entropy visible in the logs when training on another loss
        compile_kwargs['metrics'] = ['binary_crossentropy']
    model.compile(**compile_kwargs)

    return model

In [19]:
# Build a throwaway network only to print its architecture; no reference to it is kept
build_embedding_network().summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
ps_ind_02_cat (InputLayer)       (None, 1)             0                                            
____________________________________________________________________________________________________
ps_ind_04_cat (InputLayer)       (None, 1)             0                                            
____________________________________________________________________________________________________
ps_ind_05_cat (InputLayer)       (None, 1)             0                                            
____________________________________________________________________________________________________
ps_car_01_cat (InputLayer)       (None, 1)             0                                            
___________________________________________________________________________________________

### K-fold Neural Network Training
The training runs with `K`-fold validation. Each fold is trained `runs_per_fold` times, and the predictions of these runs are averaged.  
We upsample by duplicating the positive examples, which makes it easier for the neural network to learn from the rare positive class.

In [20]:
K = 8               # number of stratified folds
runs_per_fold = 3   # independent networks trained (and averaged) per fold
n_epochs = 25       # epochs per network

# Seed NumPy (used for the training-set shuffle) and TensorFlow so that runs are
# as repeatable as the backend allows.
SEED = 231
np.random.seed(SEED)
set_random_seed(SEED)

cv_ginis = []                                  # per-fold validation gini
full_val_preds = np.zeros(X_train.shape[0])    # out-of-fold prediction for every training row
y_preds = np.zeros((X_test.shape[0], K))       # test-set prediction, one column per fold

In [21]:
# Shuffled stratified folds; a fixed random_state makes the fold assignment repeatable
kfold = StratifiedKFold(n_splits=K, shuffle=True, random_state=231)

In [116]:
%%time
hists_folds = []
for i, (f_ind, outf_ind) in enumerate(kfold.split(X_train, y_train)):
    print(i)
    
    X_train_f, X_val_f = X_train.loc[f_ind].copy(), X_train.loc[outf_ind].copy()
    y_train_f, y_val_f = y_train[f_ind], y_train[outf_ind]
    
    X_test_f = X_test.copy()
    
    # Upsampling
    # adapted from kernel: https://www.kaggle.com/ogrellier/xgb-classifier-upsampling-lb-0-283
    pos = (pd.Series(y_train_f == 1))
    
    # add positive examples (Duplicate the positiive examples)
    X_train_f = pd.concat([X_train_f, X_train_f.loc[pos]], axis=0)
    y_train_f = pd.concat([y_train_f, y_train_f.loc[pos]], axis=0)
    
    # Shuffle data
    # since kfold always gives you data in order
    idx = np.arange(len(X_train_f))
    np.random.shuffle(idx)
    X_train_f = X_train_f.iloc[idx]
    y_train_f = y_train_f.iloc[idx]
    
    # Preprocessing
    proc_X_train_f, proc_X_val_f, proc_X_test_f = preproc(X_train_f, X_val_f, X_test_f)
    
    # Track of prediction for CV scores
    val_preds = 0
    hists = []
    
    for j in range(runs_per_fold):
        print(j)
        NN = build_embedding_network()
        hist = NN.fit(proc_X_train_f, y_train_f.values, epochs=n_epochs, batch_size=4096, 
                      verbose=2, validation_data=(proc_X_val_f, y_val_f.values))
   
        val_preds += NN.predict(proc_X_val_f, batch_size=4096*8)[:,0] / runs_per_fold
        y_preds[:,i] += NN.predict(proc_X_test_f, batch_size=4096*8)[:,0] / runs_per_fold
        hists.append(hist)
        
    full_val_preds[outf_ind] += val_preds
        
    cv_gini = gini_normalizedc(y_val_f.values, val_preds)
    cv_ginis.append(cv_gini)
    print ('\nFold %i prediction cv gini: %.5f\n' %(i, cv_gini))
    hists_folds.append(hists)

0
0
Train on 539792 samples, validate on 74402 samples
Epoch 1/25
24s - loss: 0.3084 - val_loss: 0.1742
Epoch 2/25
5s - loss: 0.2616 - val_loss: 0.1811
Epoch 3/25
5s - loss: 0.2573 - val_loss: 0.1781
Epoch 4/25
5s - loss: 0.2555 - val_loss: 0.1700
Epoch 5/25
5s - loss: 0.2542 - val_loss: 0.1746
Epoch 6/25
5s - loss: 0.2534 - val_loss: 0.1711
Epoch 7/25
5s - loss: 0.2529 - val_loss: 0.1736
Epoch 8/25
5s - loss: 0.2522 - val_loss: 0.1700
Epoch 9/25
6s - loss: 0.2518 - val_loss: 0.1702
Epoch 10/25
5s - loss: 0.2511 - val_loss: 0.1705
Epoch 11/25
5s - loss: 0.2508 - val_loss: 0.1666
Epoch 12/25
5s - loss: 0.2505 - val_loss: 0.1679
Epoch 13/25
5s - loss: 0.2499 - val_loss: 0.1626
Epoch 14/25
5s - loss: 0.2496 - val_loss: 0.1700
Epoch 15/25
5s - loss: 0.2494 - val_loss: 0.1665
Epoch 16/25
5s - loss: 0.2491 - val_loss: 0.1677
Epoch 17/25
5s - loss: 0.2487 - val_loss: 0.1645
Epoch 18/25
5s - loss: 0.2487 - val_loss: 0.1661
Epoch 19/25
5s - loss: 0.2485 - val_loss: 0.1634
Epoch 20/25
5s - loss:

5s - loss: 0.2517 - val_loss: 0.1695
Epoch 11/25
5s - loss: 0.2512 - val_loss: 0.1685
Epoch 12/25
5s - loss: 0.2507 - val_loss: 0.1687
Epoch 13/25
5s - loss: 0.2504 - val_loss: 0.1637
Epoch 14/25
5s - loss: 0.2500 - val_loss: 0.1639
Epoch 15/25
5s - loss: 0.2494 - val_loss: 0.1624
Epoch 16/25
5s - loss: 0.2494 - val_loss: 0.1632
Epoch 17/25
5s - loss: 0.2490 - val_loss: 0.1623
Epoch 18/25
5s - loss: 0.2486 - val_loss: 0.1628
Epoch 19/25
5s - loss: 0.2485 - val_loss: 0.1651
Epoch 20/25
5s - loss: 0.2482 - val_loss: 0.1639
Epoch 21/25
5s - loss: 0.2480 - val_loss: 0.1631
Epoch 22/25
5s - loss: 0.2481 - val_loss: 0.1638
Epoch 23/25
5s - loss: 0.2476 - val_loss: 0.1624
Epoch 24/25
5s - loss: 0.2471 - val_loss: 0.1630
Epoch 25/25
5s - loss: 0.2471 - val_loss: 0.1645
1
Train on 539792 samples, validate on 74402 samples
Epoch 1/25
11s - loss: 0.3259 - val_loss: 0.1716
Epoch 2/25
5s - loss: 0.2634 - val_loss: 0.1721
Epoch 3/25
5s - loss: 0.2575 - val_loss: 0.1779
Epoch 4/25
5s - loss: 0.2551 -

5s - loss: 0.2474 - val_loss: 0.1635
Epoch 22/25
5s - loss: 0.2471 - val_loss: 0.1632
Epoch 23/25
5s - loss: 0.2469 - val_loss: 0.1634
Epoch 24/25
5s - loss: 0.2469 - val_loss: 0.1626
Epoch 25/25
5s - loss: 0.2465 - val_loss: 0.1637
1
Train on 539792 samples, validate on 74402 samples
Epoch 1/25
12s - loss: 0.2800 - val_loss: 0.1813
Epoch 2/25
5s - loss: 0.2583 - val_loss: 0.1811
Epoch 3/25
5s - loss: 0.2550 - val_loss: 0.1859
Epoch 4/25
5s - loss: 0.2539 - val_loss: 0.1880
Epoch 5/25
5s - loss: 0.2524 - val_loss: 0.1796
Epoch 6/25
5s - loss: 0.2517 - val_loss: 0.1729
Epoch 7/25
5s - loss: 0.2507 - val_loss: 0.1767
Epoch 8/25
5s - loss: 0.2505 - val_loss: 0.1709
Epoch 9/25
5s - loss: 0.2498 - val_loss: 0.1706
Epoch 10/25
5s - loss: 0.2494 - val_loss: 0.1655
Epoch 11/25
5s - loss: 0.2489 - val_loss: 0.1689
Epoch 12/25
5s - loss: 0.2485 - val_loss: 0.1641
Epoch 13/25
5s - loss: 0.2483 - val_loss: 0.1655
Epoch 14/25
5s - loss: 0.2480 - val_loss: 0.1641
Epoch 15/25
5s - loss: 0.2480 - val_

5s - loss: 0.2531 - val_loss: 0.1715
Epoch 7/25
5s - loss: 0.2525 - val_loss: 0.1699
Epoch 8/25
5s - loss: 0.2517 - val_loss: 0.1689
Epoch 9/25
5s - loss: 0.2509 - val_loss: 0.1668
Epoch 10/25
5s - loss: 0.2505 - val_loss: 0.1643
Epoch 11/25
5s - loss: 0.2503 - val_loss: 0.1673
Epoch 12/25
5s - loss: 0.2503 - val_loss: 0.1659
Epoch 13/25
5s - loss: 0.2494 - val_loss: 0.1657
Epoch 14/25
5s - loss: 0.2491 - val_loss: 0.1633
Epoch 15/25
5s - loss: 0.2487 - val_loss: 0.1631
Epoch 16/25
5s - loss: 0.2488 - val_loss: 0.1659
Epoch 17/25
5s - loss: 0.2484 - val_loss: 0.1640
Epoch 18/25
5s - loss: 0.2482 - val_loss: 0.1643
Epoch 19/25
5s - loss: 0.2481 - val_loss: 0.1646
Epoch 20/25
5s - loss: 0.2478 - val_loss: 0.1657
Epoch 21/25
5s - loss: 0.2476 - val_loss: 0.1627
Epoch 22/25
5s - loss: 0.2474 - val_loss: 0.1666
Epoch 23/25
5s - loss: 0.2470 - val_loss: 0.1638
Epoch 24/25
5s - loss: 0.2468 - val_loss: 0.1637
Epoch 25/25
5s - loss: 0.2466 - val_loss: 0.1646
2
Train on 539795 samples, validate

In [117]:
# Report the mean of the per-fold ginis next to the gini of the pooled out-of-fold predictions
print(', '.join([
    'Mean out of fold gini, Full validation gini:',
    '%.5f' % np.mean(cv_ginis),
    '%.5f' % gini_normalizedc(y_train.values, full_val_preds),
]))

Mean out of fold gini, Full validation gini:, 0.27751, 0.27724


### Combining the test-set predictions from each fold

In [118]:
# Average the per-fold columns of y_preds into one prediction per test row
y_pred_final_mean = y_preds.mean(axis=1)

In [120]:
# Display the averaged test-set predictions as a quick sanity check
y_pred_final_mean

array([ 0.06098618,  0.04743283,  0.04498465, ...,  0.08250533,
        0.04820284,  0.04689513])

In [122]:
# Destination of the submission file: ../results/<model_name>_mean-test.csv
output_path = ''.join(['../results/', model_name, '_mean', '-test', '.csv'])

'model3-nn_01_3_8_8folds_25epochs_LCM_mean-val0.csv'

In [123]:
# Make sure the destination folder exists so the write cannot fail after the long training run
out_dir = os.path.dirname(output_path)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Submission frame: one row per test id with the fold-averaged prediction
df_y_test = pd.DataFrame({'id': df_test.id,
                          'target': y_pred_final_mean},
                         columns = ['id','target'])
df_y_test.to_csv(output_path, index=False)

Single model Kaggle leaderboard best scores: 0.27909 (Public), 0.28241 (Private)