# Setup

In [1]:
import numpy as np
import os
import csv
from random import random, sample, seed

# CSV holding the product names, product types, feature text and the value_no label.
data_path = 'grid_faq_1.csv'
# 50-dimensional GloVe vectors. The vector width has to agree with the 50-column
# weights matrix and with `embedding_dims = 50` used when the model is built, so the
# 50d file (not the 100d one) is the file this notebook needs.
embeddings_path = 'glove.6B/glove.6B.50d.txt'

## Load Data

In [2]:
# Read the whole CSV once; each row is a dict keyed by the header names.
with open(data_path, 'r', encoding="utf-8") as csv_file:
    rows = list(csv.DictReader(csv_file))

# Pull each column out of the rows as a plain Python list.
product_name_1 = [row['product_name_1'] for row in rows]
product_name_2 = [row['product_name_2'] for row in rows]
product_type_1 = [row['product_type_1'] for row in rows]
product_type_2 = [row['product_type_2'] for row in rows]
feature = [row['feature'] for row in rows]
value_no = [row['value_no'] for row in rows]

# Array versions used by the rest of the notebook; value_no is parsed as integers.
pname1 = np.array(product_name_1)
pname2 = np.array(product_name_2)
ptype1 = np.array(product_type_1)
ptype2 = np.array(product_type_2)
feature = np.array(feature)
value_no = np.array(value_no, dtype=int)

In [3]:
# Sanity check: a slice of pname1 and its shape, then the first two entries of
# every other loaded column.
print(pname1[52:58])
print(pname1.shape)
for column in (pname2, ptype1, ptype2, feature, value_no):
    print(column[0:2])

['general sb- semi urban branches (semi urban sb)'
 'general sb- semi urban branches (semi urban sb)'
 'general sb- semi urban branches (semi urban sb)'
 'general sb- rural branches (rural gen sb)'
 'general sb- rural branches (rural gen sb)'
 'general sb- rural branches (rural gen sb)']
(554,)
['platinum' 'classic']
['savings account' 'savings account']
['debit card' 'debit card']
['annual charges' 'annual charges']
[6 6]


# Process  Textual Inputs

In [4]:
# Stack the five text columns end to end into one flat array, so the tokenizer
# can be fitted on every piece of text at once.
all_words = np.concatenate([pname1, pname2, ptype1, ptype2, feature], axis=None)

print(pname1.shape)
print(all_words.shape)

(554,)
(2770,)


In [5]:
from keras.preprocessing import sequence
from keras.preprocessing.text import text_to_word_sequence, Tokenizer

# Upper bound on the vocabulary size, handed to the tokenizer as `num_words`.
max_features = 40000
word_tokenizer = Tokenizer(num_words=max_features)

# Fit on the combined text of all columns so every input shares one word index.
word_tokenizer.fit_on_texts(all_words)

print(word_tokenizer.word_counts)
print(word_tokenizer.word_index)
print(len(word_tokenizer.word_counts))   # number of distinct words seen

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


OrderedDict([('ace', 38), ('classic', 77), ('general', 76), ('sb', 152), ('semi', 38), ('urban', 57), ('branches', 57), ('rural', 38), ('gen', 19), ('metro', 95), ('nova', 18), ('orange', 114), ('savings', 431), ('bank', 57), ('non', 152), ('senior', 76), ('platina', 75), ('solo', 19), ('zing', 38), ('kids', 19), ('account', 573), ('zwipe', 57), ('edge', 38), ('pro', 38), ('astra', 40), ('15', 20), ('5', 20), ('elite', 20), ('neo', 20), ('optima', 20), ('prima', 20), ('platinum', 38), ('gold', 29), ('my', 20), ('world', 60), ('silk', 20), ('exclusive', 20), ('b', 62), ('unk', 158), ('titanium', 14), ('a', 153), ('business', 18), ('privy', 9), ('current', 180), ('debit', 632), ('card', 632), ('annual', 181), ('charges', 476), ('cash', 114), ('withdrawal', 114), ('at', 116), ('own', 76), ('atm', 96), ('image', 20), ('issuance', 20), ('charge', 20), ('regeneration', 29), ('of', 87), ('pin', 29), ('sent', 29), ('through', 29), ('courier', 29), ('replacement', 58), ('stolen', 29), ('lost', 

In [6]:
# Turn each text column into lists of word indices using the shared tokenizer.
pname1_tf, pname2_tf, ptype1_tf, ptype2_tf, feature_tf = (
    word_tokenizer.texts_to_sequences(texts)
    for texts in (pname1, pname2, ptype1, ptype2, feature)
)

print(feature_tf[52])

[4, 6, 13, 20, 15, 16, 28, 21]


In [7]:
# Each position in a sequence is one word index. The longest pname1 value noted by
# the author ('general sb-\nsemi urban branches (semi urban sb)') tokenizes to 8
# indices, so every sequence is padded to a slightly larger fixed length of 10.
maxlen = 10

# Pad every column's sequences to the same length so they can be fed to the model.
pname1_tf, pname2_tf, ptype1_tf, ptype2_tf, feature_tf = (
    sequence.pad_sequences(seqs, maxlen=maxlen)
    for seqs in (pname1_tf, pname2_tf, ptype1_tf, ptype2_tf, feature_tf)
)

print(feature_tf[52])

[ 0  0  4  6 13 20 15 16 28 21]


## Add Pretrained Embeddings

Adapted from [the official keras tutorial](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html).

Use pretrained GloVe embeddings both to give the Embedding layers a good starting point and to account for words that might be present in the test set but not in the training set.

First, load the 50D embeddings into memory.

In [8]:
# Lookup from word to its pretrained vector, filled from the GloVe text file.
embedding_vectors = {}

# Pass the encoding explicitly (as the CSV read in this notebook already does) so
# that parsing does not depend on the platform's default codec.
with open(embeddings_path, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...", separated by single spaces.
        line_split = line.strip().split(" ")
        word = line_split[0]
        vec = np.array(line_split[1:], dtype=float)
        embedding_vectors[word] = vec

# Spot check: print the vector stored for one common word.
print(embedding_vectors['you'])

[-1.0919e-03  3.3324e-01  3.5743e-01 -5.4041e-01  8.2032e-01 -4.9391e-01
 -3.2588e-01  1.9972e-03 -2.3829e-01  3.5554e-01 -6.0655e-01  9.8932e-01
 -2.1786e-01  1.1236e-01  1.1494e+00  7.3284e-01  5.1182e-01  2.9287e-01
  2.8388e-01 -1.3590e+00 -3.7951e-01  5.0943e-01  7.0710e-01  6.2941e-01
  1.0534e+00 -2.1756e+00 -1.3204e+00  4.0001e-01  1.5741e+00 -1.6600e+00
  3.7721e+00  8.6949e-01 -8.0439e-01  1.8390e-01 -3.4332e-01  1.0714e-02
  2.3969e-01  6.6748e-02  7.0117e-01 -7.3702e-01  2.0877e-01  1.1564e-01
 -1.5190e-01  8.5908e-01  2.2620e-01  1.6519e-01  3.6309e-01 -4.5697e-01
 -4.8969e-02  1.1316e+00]


Initialize the weights matrix as zeroes, then fill the row at each word's tokenizer index with that word's pretrained embedding vector.

In [9]:
# One row per tokenizer index (row 0 is never assigned), 50 columns per row.
weights_matrix = np.zeros((max_features + 1, 50))

for token, row in word_tokenizer.word_index.items():
    # Indices beyond the matrix are skipped.
    if row > max_features:
        continue
    pretrained = embedding_vectors.get(token)
    # Words without a pretrained vector keep their all-zero row.
    if pretrained is None:
        continue
    weights_matrix[row] = pretrained

# Row 0 should be all zeroes; row 1 should hold the pretrained vector of the word
# with tokenizer index 1 (if that word has one).
print(weights_matrix[0:2,:])

[[ 0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.        0.        0.        0.        0.        0.        0.
   0.      ]
 [-0.040659 -0.23029   1.1337    0.10393  -0.20169   0.3381    0.099786
  -0.20725  -0.031982  0.50944   0.3619    0.80047   0.71244  -0.22794
   0.41096   0.021037 -1.5387   -0.63559   1.4095   -0.22335   1.2706
  -1.6816   -0.5831    0.54799  -0.61627  -0.83802  -0.27172  -0.30457
   0.38462  -1.8756    1.0956    0.79229   0.80611   0.9219   -0.24946
   0.29573   0.11746  -0.36472  -0.24929  -0.19736   1.248     0.12579
   0.16182   1.2252   -0.38696  -2.0407    0.63147   1.1275    0.43204
  -0.17509 ]]


# Process Output - Encoding

In [10]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils

# Map every distinct value_no to an integer class id.
encoder = LabelEncoder().fit(value_no)
encoded_value_no = encoder.transform(value_no)

# One-hot encode the class ids (one row per sample).
dummy_value_no = np_utils.to_categorical(encoded_value_no)

# Build the Model

Use Keras's functional API to build a branching model.

In [29]:
from keras.models import Input, Model
from keras.layers import Dense, Embedding, GlobalAveragePooling1D, concatenate, Activation
from keras.layers.core import Masking, Dropout, Reshape
from keras.layers.normalization import BatchNormalization

# Width of each embedding vector; matches the 50 columns of `weights_matrix`.
embedding_dims = 50

# Training-loop settings passed to model.fit().
batch_size = 100
epochs = 20

## Text Branches

Encode the text using a mock fasttext approach. Use `weights_matrix` derived above.

In [30]:
def _text_branch(prefix):
    """Build one text branch: padded word-index input -> embedding -> average pooling.

    Args:
        prefix: column name used to label the input layer as '<prefix>_input'.

    Returns:
        A tuple (input_tensor, embedding_tensor, pooled_tensor) for the branch.
    """
    branch_input = Input(shape=(maxlen,), name=prefix + '_input')
    # Every branch gets its own Embedding layer, each initialised from the same
    # pretrained weights matrix.
    branch_embedding = Embedding(max_features + 1, embedding_dims,
                                 weights=[weights_matrix])(branch_input)
    # NOTE(review): no mask_zero is passed, so the padded positions presumably take
    # part in the average — confirm that this is intended.
    branch_pooling = GlobalAveragePooling1D()(branch_embedding)
    return branch_input, branch_embedding, branch_pooling


# One identical branch per text column; the names match what later cells expect.
pname1_input, pname1_embedding, pname1_pooling = _text_branch('pname1')
pname2_input, pname2_embedding, pname2_pooling = _text_branch('pname2')
ptype1_input, ptype1_embedding, ptype1_pooling = _text_branch('ptype1')
ptype2_input, ptype2_embedding, ptype2_pooling = _text_branch('ptype2')
feature_input, feature_embedding, feature_pooling = _text_branch('feature')

Add an auxiliary output to regularize the text component.

In [31]:
# Auxiliary 16-way softmax head on top of the pooled pname1 embedding.
# NOTE(review): the Model built later in this notebook passes only `main_output` as
# its outputs, so this head is not part of the trained model as written — confirm
# whether it should be added to the model's outputs.
aux_output = Dense(16, activation='softmax', name='aux_out')(pname1_pooling)

## Merge the Branches and Complete Model

Combine the 5 embeddings (250D total), add a fully connected layer to learn latent characteristics, then use softmax to get the probability of each class and decide the final class.

In [32]:
# Join the five pooled text vectors into a single feature vector per sample.
merged = concatenate([pname1_pooling, pname2_pooling, ptype1_pooling, ptype2_pooling, feature_pooling])

# Fully connected layer over the merged features, followed by batch normalization.
hidden_1 = Dense(256, activation='relu')(merged)
hidden_1 = BatchNormalization()(hidden_1)

# Take the number of classes from the one-hot label matrix instead of hard-coding it,
# so the softmax width always matches the targets the model is trained on.
num_classes = dummy_value_no.shape[1]
main_output = Dense(num_classes, activation='softmax', name='main_out')(hidden_1)

In [33]:
# Display the symbolic shape of the concatenated output of the five pooled branches.
merged.shape

TensorShape([Dimension(None), Dimension(250)])

## Compile the Model

In [34]:
# The five text inputs, in the same order the training arrays are passed to fit().
model_inputs = [pname1_input, pname2_input, ptype1_input, ptype2_input, feature_input]

# Single-output classifier ending in the softmax head `main_output`.
model = Model(inputs=model_inputs, outputs=[main_output])

# Multi-class setup: categorical cross-entropy loss, Adam optimizer, accuracy metric.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
pname1_input (InputLayer)       (None, 10)           0                                            
__________________________________________________________________________________________________
pname2_input (InputLayer)       (None, 10)           0                                            
__________________________________________________________________________________________________
ptype1_input (InputLayer)       (None, 10)           0                                            
__________________________________________________________________________________________________
ptype2_input (InputLayer)       (None, 10)           0                                            
__________________________________________________________________________________________________
feature_in

In [35]:
from keras.utils.vis_utils import model_to_dot
from keras.utils import plot_model

# Create the target folder first: writing the diagram fails on a fresh checkout if
# the directory does not exist yet.
plot_path = '../pic/multi_classifier.png'
os.makedirs(os.path.dirname(plot_path), exist_ok=True)

# Render the model graph (with tensor shapes) to an image file.
plot_model(model, to_file=plot_path, show_shapes=True)

![](model.png)

# Train the Model!

Randomize the data before training, since Keras [takes the last fraction of the samples](https://keras.io/getting-started/faq/#how-is-the-validation-split-computed) (here `split = 0.3`, i.e. the last 30%) as the validation set.

In [36]:
# Fix the RNG so the shuffle (and therefore the validation split) is repeatable.
seed(123)
# Fraction of samples handed to Keras as validation data.
split = 0.3

# A random permutation of all row indices (no repeats).
n_rows = pname1_tf.shape[0]
idx = sample(range(n_rows), n_rows)

# Apply the same permutation to every input and to the labels so rows stay aligned.
pname1_tf, pname2_tf, ptype1_tf, ptype2_tf, feature_tf, dummy_value_no = (
    arr[idx, :]
    for arr in (pname1_tf, pname2_tf, ptype1_tf, ptype2_tf, feature_tf, dummy_value_no)
)

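The validation accuracy must be better than the No-Information Rate of the validation set (the share of its most frequent class). The cell below only checks the shape of the one-hot label matrix; the rate itself is not computed here.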

In [19]:
# Show the shape of the one-hot label matrix.
dummy_value_no.shape

(554, 16)

Log results to file:

In [37]:
from keras.callbacks import CSVLogger

# Create the log folder up front: the logger cannot open its file during training
# if the directory is missing.
log_path = '../output/log/multi_classifier_training.csv'
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Callback that records the per-epoch metrics to the CSV file above.
csv_logger = CSVLogger(log_path)

In [38]:
# Training arrays, in the same order as the model's inputs.
train_inputs = [pname1_tf, pname2_tf, ptype1_tf, ptype2_tf, feature_tf]

# Train against the one-hot labels; the trailing `split` fraction of the shuffled
# rows is held out for validation and every epoch is logged through csv_logger.
model.fit(x=train_inputs,
          y=dummy_value_no,
          validation_split=split,
          epochs=epochs,
          batch_size=batch_size,
          callbacks=[csv_logger])

Train on 387 samples, validate on 167 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x13c9a8a20>