# Introduction

This is a simple LSTM based classifier.

Currently, no preprocessing is applied to the text.

In [1]:
# Create the output directories used below; -p keeps this cell re-runnable
# when the directories already exist.
!mkdir -p log model submission

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import log_loss

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import Bidirectional
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Dropout
from keras.backend import tensorflow_backend
from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint
from keras.callbacks import CSVLogger

from gensim.models import KeyedVectors

import pandas
import numpy

## Build the neural network using Keras

I use [Keras](http://github.com/fchollet/keras) to build the neural network.

My network is very simple: one Embedding layer, one Bidirectional LSTM layer and one Fully-connected layer.

I optimize the network with Adam, using its default parameters.

In [3]:
def build_model(n_word, n_dim, n_hidden, syn0=None, trainable=True):
    """Assemble the (uncompiled) bidirectional-LSTM classifier.

    Layer stack, in order: Embedding -> Dropout(0.5) -> Bidirectional(LSTM)
    -> Dense(50) -> Dense(3, softmax).

    Args:
        n_word: Vocabulary size. The embedding table gets ``n_word + 1`` rows
            (presumably so that index 0 stays free for padding -- confirm
            against the tokenizer in use).
        n_dim: Width of each embedding vector.
        n_hidden: Number of units given to the LSTM wrapped by Bidirectional.
        syn0: Optional initial embedding weights. When not None it is handed
            to the Embedding layer as ``weights=[syn0]``.
        trainable: Forwarded to the Embedding layer's ``trainable`` argument.

    Returns:
        The Sequential model. Compiling and fitting are left to the caller.
    """
    embedding_kwargs = dict(input_dim=n_word + 1, output_dim=n_dim, trainable=trainable)
    if syn0 is not None:
        # Start from the supplied matrix instead of Keras' default initializer.
        embedding_kwargs['weights'] = [syn0]

    layers = [
        Embedding(**embedding_kwargs),
        Dropout(0.5),
        Bidirectional(LSTM(n_hidden)),
        Dense(50),
        Dense(3, activation='softmax'),
    ]

    model = Sequential()
    for layer in layers:
        model.add(layer)
    return model

## Build the embedding matrix

Next, we create an embedding matrix.

The embedding matrix is initialized randomly.

I use [Gensim](https://github.com/RaRe-Technologies/gensim) to prepare the word embeddings.

In [4]:
def build_embedding(n_word, n_dim, pretrain=False, word_index=None):
    """Create the initial weight matrix for the Embedding layer.

    The matrix is filled with uniform random values from
    ``numpy.random.random``. When ``pretrain`` is true, the row of every word
    that exists in the saved gensim model ``embedding/glove.6B.{n_dim}d`` is
    overwritten with its pretrained vector; words missing from that model
    keep their random row.

    Args:
        n_word: Vocabulary size; the matrix has ``n_word + 1`` rows.
        n_dim: Embedding width; also selects which saved GloVe model is loaded.
        pretrain: Whether to copy pretrained vectors into the matrix.
        word_index: Optional ``{word: row_index}`` mapping. Defaults to the
            notebook-level ``tokenizer.word_index``.

    Returns:
        A ``numpy.ndarray`` of shape ``(n_word + 1, n_dim)``.
    """
    syn0 = numpy.random.random((n_word + 1, n_dim))

    if not pretrain:
        return syn0

    if word_index is None:
        # Backward-compatible default: use the tokenizer fitted in the notebook.
        word_index = tokenizer.word_index

    embedding_model = KeyedVectors.load(f'embedding/glove.6B.{n_dim}d')
    for word, index in word_index.items():
        if index > n_word:
            # Row does not exist in the matrix; skip it explicitly instead of
            # relying on a swallowed IndexError.
            continue
        try:
            syn0[index, :] = embedding_model[word]
        except KeyError:
            # Word has no pretrained vector: keep its random initialization.
            # Any other error (e.g. a dimension mismatch) now surfaces.
            continue

    return syn0

## Run experiment

Train, evaluate, create submission!

You can find the training log in the `log` directory.

In [5]:
def experiment(n_word, n_dim, n_hidden, pretrain=True, trainable=True, batch_size=128):
    """Train one BiLSTM configuration, report validation loss, write a submission.

    Reads the notebook-level variables ``X_train``, ``y_train``, ``X_val``,
    ``y_val``, ``X_test`` and ``test_data``.

    Side effects:
        * writes ``model/{model_name}.hdf5`` (checkpoint, ``save_best_only=True``),
          ``log/{model_name}.csv`` (training log) and
          ``submission/{model_name}.csv``;
        * adds/overwrites the ``EAP``, ``HPL`` and ``MWS`` columns of the
          global ``test_data`` frame.

    Args:
        n_word: Vocabulary size.
        n_dim: Embedding width.
        n_hidden: Number of LSTM units.
        pretrain: If true, initialise the embedding from ``build_embedding``.
        trainable: Whether the embedding layer is trainable.
        batch_size: Batch size used for fitting and prediction.

    Returns:
        Always ``0``.
    """
    # Build the embedding matrix only when it is actually used; build_model
    # falls back to the default Embedding initializer when syn0 is None.
    syn0 = build_embedding(n_word, n_dim, pretrain=pretrain) if pretrain else None
    model = build_model(n_word, n_dim, n_hidden, syn0=syn0, trainable=trainable)

    model_name = f'modelBiLSTM.embedding{n_dim}.n_hidden{n_hidden}.trainable{trainable}.pretrain{pretrain}'  # NOQA
    print()
    print()
    print('# params')
    print(f'n_word     : {n_word}')
    print(f'n_dim      : {n_dim}')
    print(f'n_hidden   : {n_hidden}')
    print(f'pretrain   : {pretrain}')
    print(f'trainable  : {trainable}')
    print(f'destination: {model_name}')

    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(filepath=f'model/{model_name}.hdf5', save_best_only=True),
        CSVLogger(filename=f'log/{model_name}.csv'),
    ]

    model.summary()
    model.compile('adam', 'categorical_crossentropy')
    # validation_data is documented as a tuple; a list is not accepted by
    # every Keras version.
    model.fit(X_train, y_train, batch_size=batch_size, epochs=100,
              validation_data=(X_val, y_val), callbacks=callbacks)
    # Evaluate and predict with the checkpointed weights rather than the
    # weights from the last epoch.
    model.load_weights(f'model/{model_name}.hdf5')

    y_val_pred_prob = model.predict(X_val, batch_size=batch_size)
    val_loss = log_loss(y_val, y_val_pred_prob)

    print(f'{model_name}: {val_loss}')

    y_test_prob = model.predict(X_test, batch_size=batch_size)
    # NOTE(review): assumes the output columns are ordered EAP, HPL, MWS --
    # presumably the class order of the notebook's LabelBinarizer; verify
    # against labelbinarizer.classes_.
    test_data['EAP'] = y_test_prob[:, 0]
    test_data['HPL'] = y_test_prob[:, 1]
    test_data['MWS'] = y_test_prob[:, 2]

    test_data[['id', 'EAP', 'HPL', 'MWS']].to_csv(f'submission/{model_name}.csv', index=False)

    return 0

In [6]:
# Load the competition's train and test CSV files (train first, then test).
train_data, test_data = (
    pandas.read_csv(csv_path, index_col=False)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [7]:
# Preview the first rows of the training data.
train_data.head()

In [8]:
# Preview the first rows of the test data.
test_data.head()

In [9]:
# Pool the train and test sentences into one Series and record the number of
# training rows.
all_text = pandas.concat([train_data['text'], test_data['text']])
n_train = len(train_data)

print(f'n_train: {n_train}')

In [10]:
# Fit the vocabulary on the pooled train + test text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_text)

# Integer-encode the training sentences, then pad them to a common length.
train_sequences = tokenizer.texts_to_sequences(train_data.text)
X = pad_sequences(train_sequences)

# Binarize the author labels. fit_transform already fits the binarizer, so a
# separate fit() call beforehand is unnecessary.
labelbinarizer = LabelBinarizer()
y = labelbinarizer.fit_transform(train_data['author'])

## Create validation dataset

In [11]:
# A fixed random_state keeps the train/validation split identical across
# runs, so results from different experiments are comparable.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
n_word = len(tokenizer.word_index)

print(f'vocaburary size: {n_word}')

In [12]:
# Encode the test sentences and pad them to the same length as the training
# matrix X, so the model sees the same amount of padding at train and test time.
X_test = tokenizer.texts_to_sequences(test_data.text)
X_test = pad_sequences(X_test, maxlen=X.shape[1])

In [13]:
# Baseline run: 50-d trainable embeddings without pretrained vectors, 50 LSTM units.
experiment(n_word, n_dim=50, n_hidden=50, pretrain=False, trainable=True)

## Pretrained Embedding [optional]

You want to use pretrained word-embeddings? OK, go on to the next!


### Repos overview

The directory tree of my repository is as follows.

```
.
├── embedding
├── libexec
│   └── binary_convert.py  # described later
├── log
├── model
├── spooky.ipynb  # this notebook
└── submission
```


#### Download and unzip

I use GloVe 6 billion embeddings.

Please download [GloVe](https://nlp.stanford.edu/projects/glove/) embeddings.

To download them, run the following commands.

__NOTE__: I have not tested this on a Kaggle kernel because glove.6B.zip is a large file. If you want to test it, please do so on your own computer.

```bash
mkdir embedding
cd embedding

wget http://nlp.stanford.edu/data/glove.6B.zip
unzip glove.6B.zip

cd ..

for fname in `ls embedding/*.txt`; do
  python -m gensim.scripts.glove2word2vec -i $fname -o `echo $fname | sed -s 's/txt/w2v/'`;
done
```


#### Convert text file to binary models

Now, I have pretrained embeddings in `repo/embedding`.

Then, I create gensim binary objects by running `python libexec/convert.py`.

`libexec/convert.py` looks like this:


```python
from gensim.models import KeyedVectors
import glob

for fname in glob.glob('embedding/glove.6B.*.w2v'):
    e = KeyedVectors.load_word2vec_format(fname)
    e.save(fname.replace('.w2v', ''))
```

(exec `rm embedding/*.txt embedding/*.w2v embedding/glove.6B.zip` to cleanup.)


## Enjoy!!

All preparations complete!

We can use pre-trained word-embeddings to train our model.

But currently, as [this discussion](https://www.kaggle.com/c/spooky-author-identification/discussion/42316) says,
I cannot improve the score with them (they seem to suppress over-fitting).

In [None]:
# Same sizes as the earlier run, but the embedding layer is initialised from
# the pretrained GloVe vectors stored under embedding/.
experiment(n_word, n_dim=50, n_hidden=50, pretrain=True, trainable=True) # not tested on kernel