In [3]:
import numpy as np
np.random.seed(622)
import pandas as pd

import re
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.utils import to_categorical
from keras.preprocessing import text, sequence
from keras.callbacks import Callback
import sqlite3
from keras.callbacks import ModelCheckpoint
from keras_tqdm import TQDMNotebookCallback

import warnings
warnings.filterwarnings('ignore')

EMBEDDING_FILE = 'vectors_50d.txt'


Using TensorFlow backend.


In [9]:
%%time
conn = sqlite3.connect('reviews.sqlite')
# all_data = pd.read_sql('select body, rating from reviews order by created_at asc limit 100000', conn)
# train = all_data.iloc[:80000]
# dev = all_data.iloc[80000:]
all_data = pd.read_sql('select body, rating from reviews', conn)
train = all_data.iloc[:-10000]
dev = all_data.iloc[-10000:]

CPU times: user 5.27 s, sys: 1.72 s, total: 7 s
Wall time: 6.99 s


In [10]:

token_pattern = r"[\w']+|[,\.\?;\-\(\)]"
regex = re.compile(token_pattern)

def tokenize(text):
    return regex.findall(text.lower())



In [11]:
tokenize("This is how we do it.  It's Friday night, and I feel all right, and the party's here on the west side.")

['this',
 'is',
 'how',
 'we',
 'do',
 'it',
 '.',
 "it's",
 'friday',
 'night',
 ',',
 'and',
 'i',
 'feel',
 'all',
 'right',
 ',',
 'and',
 'the',
 "party's",
 'here',
 'on',
 'the',
 'west',
 'side',
 '.']

In [12]:
%%time

def to_sentiment_class(rating):
    if rating < 4:
        return 0
    if rating == 4:
        return 1
    else:
        return 2

X_train = train["body"].fillna("fillna").values
y_train = to_categorical(train["rating"].apply(to_sentiment_class).values)
X_dev = dev["body"].fillna("fillna").values

max_features = 30000
maxlen = 100
embed_size = int(EMBEDDING_FILE.split('_')[1].split('d')[0])

tokenizer = text.Tokenizer(num_words=max_features, filters='', lower=False)
tokenizer.fit_on_texts([' '.join(tokenize(doc)) for doc in list(X_train) + list(X_dev)])
X_train = tokenizer.texts_to_sequences(X_train)
X_dev = tokenizer.texts_to_sequences(X_dev)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_dev = sequence.pad_sequences(X_dev, maxlen=maxlen)


CPU times: user 5min 19s, sys: 2.65 s, total: 5min 21s
Wall time: 5min 21s


In [13]:
y_train.sum(axis=0)

array([ 617567.,  513883., 3394498.], dtype=float32)

In [14]:
sequence.pad_sequences(tokenizer.texts_to_sequences([' '.join(tokenize('We had a great time.'))]), maxlen=maxlen)

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 31, 27,
         7, 21, 38,  1]], dtype=int32)

In [15]:
sequence.pad_sequences(tokenizer.texts_to_sequences(['Beware, these people are scammers!  They are only trying to take your money!']), maxlen=maxlen)

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        201, 192,  32,  32,  80, 421,   5, 220,  60]], dtype=int32)

In [16]:
%%time

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE))

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector


CPU times: user 18.7 s, sys: 393 ms, total: 19.1 s
Wall time: 19 s


In [17]:

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [18]:
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=622)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)


In [19]:
batch_size = 512
epochs = 1

def get_model():
    inp = Input(shape=(maxlen, ))
    x = Embedding(max_features, embed_size, weights=[embedding_matrix])(inp)
#     x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(20, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(3, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


In [20]:
model = get_model()

In [None]:

callbacks = [
    RocAuc,
    TQDMNotebookCallback(leave_inner=True, leave_outer=True),
    ModelCheckpoint(filepath='keras_model.hdf5', verbose=1, save_best_only=True),
]

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=callbacks, verbose=2)


Train on 4299650 samples, validate on 226298 samples


HBox(children=(IntProgress(value=0, description='Training', max=1), HTML(value='')))

Epoch 1/1


HBox(children=(IntProgress(value=0, description='Epoch 0', max=4299650), HTML(value='')))

In [None]:
%%time
y_pred = model.predict(x_dev, batch_size=512)
# submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
# submission.to_csv('submission.csv', index=False)
print(y_pred.shape)

32 batch size, 80 GRU hidden units, 300d embeddings, 50k vocab: 0.90 AUROC, ~160 it/s train, ~600 it/s predict, 106MB hdf5, 36M tfjs

512 batch size, 80 GRU hidden units, 50d embeddings, 50k vocab: 0.90 AUROC, ~2300 it/s train, ~7400 it/s predict, 18MB hdf5, 6M tfjs

In [None]:
%%sh
tensorflowjs_converter --input_format=keras keras_model.hdf5 tfjs_model

In [None]:
with open('tfjs_model/word_index.json', 'w') as outfile:
    json.dump({k: v for k, v in tokenizer.word_index.items() if v < max_features}, outfile)

In [None]:
%%sh
make upload-model