#### Notebook purpose
This notebook aims to do model selection for deep learning models.

#### Load training and test X & y

In [1]:
import pandas as pd
from ast import literal_eval

In [2]:
# Relative locations of the data and model directories. Both keep their
# trailing slash because later cells build file names by plain string
# interpolation (e.g. f'{data_path}X_train.csv').
data_path, models_path = '../../data/', '../../models/'

In [3]:
# The two list-valued columns are parsed with literal_eval while reading, so
# they arrive as Python objects instead of raw strings.
list_converters = {'event_list': literal_eval, 'person_list': literal_eval}

X_train = pd.read_csv(f'{data_path}X_train.csv', converters=list_converters)
X_test = pd.read_csv(f'{data_path}X_test.csv', converters=list_converters)

# Targets are flattened to 1-D arrays.
y_train = pd.read_csv(f'{data_path}y_train.csv').values.ravel()
y_test = pd.read_csv(f'{data_path}y_test.csv').values.ravel()

#### Prepare features
Here we treat every piece of information as a chunk of text.

In [4]:
# We aggregate the values in the person and event lists into a single
# space-separated string per row.
def _join_tokens(value):
    """Return `value` joined with single spaces, or unchanged if already a string.

    The string pass-through keeps this cell idempotent: running it a second
    time would otherwise apply ' '.join to the already-joined text and insert
    a space between every character.
    """
    if isinstance(value, str):
        return value
    return ' '.join(value)


# Same columns, same order as before: train first, then test.
for frame in (X_train, X_test):
    for column in ('event_list', 'person_list'):
        frame[column] = frame[column].map(_join_tokens)

In [5]:
import numpy as np

#### Set up tokenizer before encoding text features

In [6]:
# Gather every text column of the training set into one flat array so the
# tokenizer is fitted on all of them at once.
all_text_feat = np.concatenate([
    X_train[column]
    for column in ('body', 'title', 'source', 'event_list', 'person_list')
])

In [7]:
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [8]:
# Vocabulary cap for the tokenizer. It is passed as a plain integer: the
# previous float literal (1E4) only worked where Keras merely compares against
# it, and forced an int(...) cast wherever the value was reused as a size.
# Case is preserved (lower=False) and 'OOV' is registered as the placeholder
# token for out-of-vocabulary words.
t = Tokenizer(num_words=10000, lower=False, oov_token='OOV')
t.fit_on_texts(all_text_feat)

#### Encode chunks of text as sequences of integers and pad the resulting sequences

In [11]:
# Turn each training text column into a list of integer sequences using the
# fitted tokenizer.
enc_body, enc_title, enc_source, enc_event, enc_person = (
    t.texts_to_sequences(X_train[column])
    for column in ('body', 'title', 'source', 'event_list', 'person_list')
)

In [12]:
# Longest encoded training sequence per field; reused later as the padding
# length and as the model's input width.
max_len_body, max_len_title, max_len_source, max_len_event, max_len_person = (
    np.max(list(map(len, encoded)))
    for encoded in (enc_body, enc_title, enc_source, enc_event, enc_person)
)

In [13]:
from keras.preprocessing.sequence import pad_sequences

In [14]:
# Pad every field's training sequences to that field's maximum length.
dat_body, dat_title, dat_source, dat_event, dat_person = (
    pad_sequences(encoded, maxlen=width)
    for encoded, width in (
        (enc_body, max_len_body),
        (enc_title, max_len_title),
        (enc_source, max_len_source),
        (enc_event, max_len_event),
        (enc_person, max_len_person),
    )
)

#### Define model architecture

In [15]:
from keras.models import Model
from keras.layers import Dense, Embedding, Conv1D, GlobalMaxPooling1D, Input, Dropout
from keras.layers import concatenate

In [16]:
embedding_dim = 300

# One integer-sequence input per text field, sized to that field's padded length.
body = Input(shape=(max_len_body, ))
title = Input(shape=(max_len_title, ))
source = Input(shape=(max_len_source, ))
person = Input(shape=(max_len_person, ))
event = Input(shape=(max_len_event, ))

# A single embedding layer shared by all five inputs.
embed = Embedding(int(t.num_words), embedding_dim)


def _conv_branch(sequence_input):
    """Embed one input, apply a 150-filter width-3 ReLU Conv1D, then global max pooling.

    Returns the (conv_output, pooled_output) pair of tensors.
    """
    conv_output = Conv1D(150, 3, activation='relu')(embed(sequence_input))
    return conv_output, GlobalMaxPooling1D()(conv_output)


# Each field gets its own convolution branch on top of the shared embedding.
cnn_body, pool_body = _conv_branch(body)
cnn_title, pool_title = _conv_branch(title)
cnn_source, pool_source = _conv_branch(source)
cnn_event, pool_event = _conv_branch(event)
cnn_person, pool_person = _conv_branch(person)

# Merge the pooled features of all branches into one vector.
concat = concatenate([pool_body, pool_title, pool_source, pool_person, pool_event])

# Classification head: small dense layer, dropout, 3-way softmax.
hidden = Dense(10, activation='relu')(concat)
hidden_dropped = Dropout(0.5)(hidden)
out = Dense(3, activation='softmax')(hidden_dropped)

model = Model(inputs=[body, title, source, person, event], outputs=out)

#### Compile model

In [17]:
from keras.optimizers import Adam

In [18]:
# The "sparse" loss and metric take integer class labels rather than one-hot
# vectors.
model.compile(
    optimizer=Adam(lr=0.0001),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

#### Define early stopping and checkpoint strategy

In [19]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [20]:
# Both callbacks watch the validation loss, where lower is better.
watch_val_loss = dict(monitor='val_loss', mode='min', verbose=1)

# Stop training once val_loss has gone 5 epochs without improving.
es = EarlyStopping(patience=5, **watch_val_loss)

# Write the checkpoint file only when val_loss reaches a new best.
mc = ModelCheckpoint(f'{models_path}keras_early_stopping.h5', save_best_only=True, **watch_val_loss)

#### Fit model with early stopping and serialize the best weights via checkpointing

In [22]:
%%time
model.fit([dat_body, dat_title, dat_source, dat_person, dat_event], y_train, 
          batch_size=10, epochs=50, validation_split=0.1,
          verbose=0, callbacks=[es, mc])


Epoch 00001: val_loss improved from inf to 0.54020, saving model to ../../models/keras_early_stopping.h5

Epoch 00002: val_loss improved from 0.54020 to 0.29927, saving model to ../../models/keras_early_stopping.h5

Epoch 00003: val_loss improved from 0.29927 to 0.22457, saving model to ../../models/keras_early_stopping.h5

Epoch 00004: val_loss improved from 0.22457 to 0.18607, saving model to ../../models/keras_early_stopping.h5

Epoch 00005: val_loss improved from 0.18607 to 0.18281, saving model to ../../models/keras_early_stopping.h5

Epoch 00006: val_loss did not improve from 0.18281

Epoch 00007: val_loss did not improve from 0.18281

Epoch 00008: val_loss did not improve from 0.18281

Epoch 00009: val_loss did not improve from 0.18281

Epoch 00010: val_loss did not improve from 0.18281
Epoch 00010: early stopping
CPU times: user 3min 33s, sys: 29.7 s, total: 4min 2s
Wall time: 2min 31s


<keras.callbacks.History at 0x7f3b5a7f5ac8>

#### Serialize architecture

In [25]:
# Write the model's architecture as JSON next to the checkpoint file.
architecture_file = f'{models_path}keras_architecture.json'
model_json = model.to_json()
with open(architecture_file, 'w') as json_file:
    json_file.write(model_json)

In [26]:
# Encode the test text columns with the tokenizer that was fitted on the
# training text.
enc_body_test, enc_title_test, enc_source_test, enc_event_test, enc_person_test = (
    t.texts_to_sequences(X_test[column])
    for column in ('body', 'title', 'source', 'event_list', 'person_list')
)

In [27]:
# Pad the test sequences to the lengths computed on the training set, so the
# arrays match the model's input widths.
dat_body_test, dat_title_test, dat_source_test, dat_event_test, dat_person_test = (
    pad_sequences(encoded, maxlen=width)
    for encoded, width in (
        (enc_body_test, max_len_body),
        (enc_title_test, max_len_title),
        (enc_source_test, max_len_source),
        (enc_event_test, max_len_event),
        (enc_person_test, max_len_person),
    )
)

#### Serialize keras_data

In [33]:
import pickle

In [39]:
# Bundle the padded arrays per split. Within each list the order is
# body, title, source, person, event.
train_arrays = [dat_body, dat_title, dat_source, dat_person, dat_event]
test_arrays = [dat_body_test, dat_title_test, dat_source_test, dat_person_test, dat_event_test]

keras_data = {'train': train_arrays, 'test': test_arrays}

In [40]:
# Persist the padded train/test arrays in a single pickle file.
keras_data_file = f'{data_path}keras_data.pkl'
with open(keras_data_file, mode='wb') as pickle_file:
    pickle.dump(keras_data, pickle_file)