# Neural embedding model
Since we are not very satisfied with the ad-hoc re-mapping of categorical variables to arbitrary numbers, we propose a more novel solution here where we use embedding layers to find optimal numeric representations for categorical variables with respect to the prediction task.

In [33]:
import sys
sys.path.append('..')
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import matplotlib.pyplot as plt
%matplotlib notebook
plt.style.use('ggplot')

In [34]:
selected_features_numeric = [
    'gps_height',
    'latitude',
    'longitude',
    'population',
    'amount_tsh',
    'construction_year',
]

selected_features_categorical = [
    'payment_type',
    'management_group',
    'quality_group',
    'region',
    'basin',
    'extraction_type_class',
    'quantity_group',
    'waterpoint_type_group',
    'source_type',
    'source_class'
]

all_features =  selected_features_categorical + selected_features_numeric

In [35]:
from data_loading import data_loading_pipeline, split_data
experimentation_df, holdout_df = data_loading_pipeline('../data')
train_df, test_df = split_data(experimentation_df)

Label distribution in training set:  Counter({0: 23519, 2: 16750, 1: 2922})
Label distribution in testing set:  Counter({0: 7870, 2: 5518, 1: 1009})
Label distribution in training set:  Counter({0: 17662, 2: 12528, 1: 2203})
Label distribution in testing set:  Counter({0: 5857, 2: 4222, 1: 719})


# First prepare the data for the model by one-hot encoding all categorical inputs and merging all numeric inputs.


In [29]:
from keras.utils import to_categorical
def prepare_inputs(df):
    inputs = {}
    for f in selected_features_categorical:
        inputs[f] = to_categorical(df[f])

    inputs['continuous'] = df[selected_features_numeric].as_matrix()
    return inputs

smote = False
if smote:
    from imblearn.over_sampling import SMOTE
    smote = SMOTE(n_jobs=4,k=5)
    train_x_smote, train_y_smote = smote.fit_sample(train_df[all_features],train_df.status_group)

    train_inputs = prepare_inputs(pd.DataFrame(train_x_smote,columns=all_features))
    test_inputs = prepare_inputs(test_df)

    train_output= to_categorical(train_y_smote)
    test_output= to_categorical(test_df.status_group.as_matrix())
    
else:
    train_inputs = prepare_inputs(train_df)
    test_inputs = prepare_inputs(test_df)
    holdout_inputs = prepare_inputs(holdout_df)

    train_output= to_categorical(train_df.status_group.as_matrix())
    test_output= to_categorical(test_df.status_group.as_matrix())
    holdout_output= to_categorical(holdout_df.status_group.as_matrix())

In [24]:
print('Input shapes: ')
for k in train_inputs.keys():
    print(train_inputs[k].shape)
    
    
print('Output shape: {}'.format(train_output.shape))

Input shapes: 
(32393, 6)
(32393, 9)
(32393, 7)
(32393, 5)
(32393, 7)
(32393, 6)
(32393, 6)
(32393, 7)
(32393, 21)
(32393, 5)
(32393, 3)
Output shape: (32393, 3)


# Now let's create the model:
    * embedding layers for each categorical input
    * dense feature selection layer for all the numeric inputs
    * concatenate output of all the layers into  a single latent layer
    * dense softmax layer on top of the concatenate layer to do prediction

In [25]:
import keras
from keras.models import Model
from keras import layers as L

input_layers = []
input_name_orders = []
concat_layers = []
for k in selected_features_categorical:
    input_layers.append(L.Input(shape=(train_inputs[k].shape[-1],),name=k))
    embed = L.Embedding(input_dim=train_inputs[k].shape[-1], 
                    output_dim=train_inputs[k].shape[-1],name='{}_embed'.format(k))(input_layers[-1])
    embed = L.Flatten()(embed)
    concat_layers.append(embed)

    input_name_orders.append(k)
    
input_layers.append(L.Input(shape=(train_inputs['continuous'].shape[-1],),name='continuous'))
input_name_orders.append('continuous')
concat_layers.append(L.Dense(32,name='continuous_dense', activation='sigmoid')(input_layers[-1]))

latent = L.concatenate(concat_layers)
latent = L.Dropout(0.5)(latent)
latent = L.Dense(15,activation='sigmoid')(latent)
output = L.Dense(train_output.shape[-1], activation='softmax', name='decision')(latent)

model = Model(inputs=input_layers,outputs=output, name = 'neural_embedder')
model.compile(optimizer='adam',loss='categorical_crossentropy',metrics=['acc'])

model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
payment_type (InputLayer)        (None, 7)             0                                            
____________________________________________________________________________________________________
management_group (InputLayer)    (None, 5)             0                                            
____________________________________________________________________________________________________
quality_group (InputLayer)       (None, 6)             0                                            
____________________________________________________________________________________________________
region (InputLayer)              (None, 21)            0                                            
___________________________________________________________________________________________

In [32]:
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, TensorBoard
callbacks = [EarlyStopping(patience=50)]


class_weights = {}
for i in [0,1,2]:
    class_weights[i] = np.square(1-(np.sum(np.argmax(train_output,-1) == i)/len(train_output)))
    
model.fit(x=[train_inputs[k] for k in input_name_orders],y=train_output,epochs=5000,
          validation_data=([test_inputs[k] for k in input_name_orders],test_output),
          callbacks=callbacks,batch_size=64
          ,class_weight=class_weights)

Train on 32393 samples, validate on 10798 samples
Epoch 1/5000
Epoch 2/5000
Epoch 3/5000
Epoch 4/5000
Epoch 5/5000
Epoch 6/5000
Epoch 7/5000
Epoch 8/5000
Epoch 9/5000
Epoch 10/5000
Epoch 11/5000
Epoch 12/5000
Epoch 13/5000
Epoch 14/5000
Epoch 15/5000
Epoch 16/5000
Epoch 17/5000
Epoch 18/5000
Epoch 19/5000
Epoch 20/5000
Epoch 21/5000
Epoch 22/5000
Epoch 23/5000
Epoch 24/5000
Epoch 25/5000
Epoch 26/5000
Epoch 27/5000
Epoch 28/5000
Epoch 29/5000
Epoch 30/5000
Epoch 31/5000
Epoch 32/5000
Epoch 33/5000
Epoch 34/5000
Epoch 35/5000
Epoch 36/5000
Epoch 37/5000
Epoch 38/5000
Epoch 39/5000
Epoch 40/5000
Epoch 41/5000
Epoch 42/5000
Epoch 43/5000
Epoch 44/5000
Epoch 45/5000
Epoch 46/5000
Epoch 47/5000
Epoch 48/5000
Epoch 49/5000
Epoch 50/5000
Epoch 51/5000
Epoch 52/5000
Epoch 53/5000
Epoch 54/5000
Epoch 55/5000
Epoch 56/5000
Epoch 57/5000


<keras.callbacks.History at 0x14259cc50>

In [31]:
holdout_pred = np.argmax(model.predict(holdout_inputs),-1)
holdout_ref = np.argmax(holdout_output,-1)

from sklearn.metrics import cohen_kappa_score, accuracy_score, confusion_matrix
print(confusion_matrix(holdout_ref, holdout_pred))
print('Kappa: ', cohen_kappa_score(holdout_ref, holdout_pred))
print('Acc: ', accuracy_score(holdout_ref, holdout_pred))


[[7332    0  538]
 [ 861    0  148]
 [2279    0 3239]]
Kappa:  0.4662549916391652
Acc:  0.7342501910120164
