In [1]:
import numpy as np
np.random.seed(622)
import pandas as pd

import os
import re
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Flatten, Input, Dropout, Dense, Reshape, Embedding, SpatialDropout1D, concatenate
from tensorflow.keras.layers import GRU, Bidirectional, AvgPool1D, MaxPool1D, Conv1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import Callback
import sqlite3
from tensorflow.keras.callbacks import ModelCheckpoint
from keras_tqdm import TQDMNotebookCallback

from tqdm import tqdm_notebook as tqdm
import toolz

import warnings
warnings.filterwarnings('ignore')

EMBEDDING_FILE = 'embeddings.txt'


  return f(*args, **kwds)
Using TensorFlow backend.


In [2]:
%%time
conn = sqlite3.connect('reviews.sqlite')
# all_data = pd.read_sql('select body, rating from reviews order by created_at asc limit 100000', conn)
# train = all_data.iloc[:80000]
# dev = all_data.iloc[80000:]
all_data = pd.read_sql('select body, rating from reviews', conn)

CPU times: user 4.01 s, sys: 1.25 s, total: 5.26 s
Wall time: 5.26 s


In [3]:
train = all_data.iloc[:-10000]
dev = all_data.iloc[-10000:]

In [4]:

token_pattern = r"[\w']+|[,\.\?;\-\(\)]"
regex = re.compile(token_pattern)

def tokenize(text):
    return regex.findall(text.lower())



In [5]:
tokenize("This is how we do it.  It's Friday night, and I feel all right, and the party's here on the west side.")

['this',
 'is',
 'how',
 'we',
 'do',
 'it',
 '.',
 "it's",
 'friday',
 'night',
 ',',
 'and',
 'i',
 'feel',
 'all',
 'right',
 ',',
 'and',
 'the',
 "party's",
 'here',
 'on',
 'the',
 'west',
 'side',
 '.']

In [6]:
%%time

def to_sentiment_class(rating):
    if rating < 4:
        return 0
    if rating == 4:
        return 1
    else:
        return 2

X_train = train["body"].fillna("fillna").values
# y_train = to_categorical(train["rating"].apply(to_sentiment_class).values)
y_train = to_categorical(train["rating"].values)
X_dev = dev["body"].fillna("fillna").values

max_features = 100000
maxlen = 100
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features, filters='', lower=False)
tokenizer.fit_on_texts([' '.join(tokenize(doc)) for doc in list(X_train) + list(X_dev)])
X_train = tokenizer.texts_to_sequences(X_train)
X_dev = tokenizer.texts_to_sequences(X_dev)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_dev = sequence.pad_sequences(X_dev, maxlen=maxlen)


CPU times: user 3min 38s, sys: 2.01 s, total: 3min 40s
Wall time: 3min 40s


In [7]:
y_train.sum(axis=0)

array([      0.,  283930.,  126280.,  207357.,  513883., 3394498.],
      dtype=float32)

In [8]:
sequence.pad_sequences(tokenizer.texts_to_sequences([' '.join(tokenize('We had a great time.'))]), maxlen=maxlen)

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 31, 27,
         7, 21, 38,  1]], dtype=int32)

In [9]:
sequence.pad_sequences(tokenizer.texts_to_sequences(['Beware, these people are scammers!  They are only trying to take your money!']), maxlen=maxlen)

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        201, 192,  32,  32,  80, 421,   5, 220,  60]], dtype=int32)

In [10]:
%%time

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) 
                        for o in tqdm(open(EMBEDDING_FILE), 
                                      total=1999996))


HBox(children=(IntProgress(value=0, max=1999996), HTML(value='')))


CPU times: user 1min 37s, sys: 2.79 s, total: 1min 40s
Wall time: 1min 39s


In [11]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in tqdm(word_index.items()):
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector


HBox(children=(IntProgress(value=0, max=402235), HTML(value='')))




In [12]:

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [13]:
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=622)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)


In [14]:
embedding_matrix.shape

(100001, 300)

In [15]:
conv_stride = 5
num_filters = 100
sequence_length = 100
embedding_dim = 300
num_recurrent = 3
recurrent_dim = 7

def get_rnn_model():
    inp = Input(shape=(maxlen, ), dtype=tf.int64)
    embed = Embedding(max_features + 1, embed_size, 
                  weights=[embedding_matrix], trainable=False)(inp)
    recurrents = []
    for level in range(num_recurrent):
        last = embed if len(recurrents) == 0 else recurrents[-1]
        recurrents.append(Bidirectional(GRU(recurrent_dim, return_sequences=True))(last))
    recurrent_concats = concatenate(recurrents, axis=2)
    max_pool = GlobalMaxPooling1D()(recurrent_concats)
    avg_pool = GlobalAveragePooling1D()(recurrent_concats)
    
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(y_train.shape[1], activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


def get_conv_model():
    inp = Input(shape=(maxlen, ), dtype=tf.int64)
    embed = Embedding(max_features + 1, embed_size, 
                  weights=[embedding_matrix], trainable=False)(inp)
    
    max_pools = []
    avg_pools = []
    
    for filter_size in [3, 4, 5]:
        conv = Conv1D(num_filters, filter_size, kernel_initializer='normal', 
                      activation='relu')(embed)
        max_pools.append(MaxPool1D(filter_size)(conv))
        avg_pools.append(AvgPool1D(filter_size)(conv))
    
    conc = Flatten()(concatenate(max_pools + avg_pools, axis=1))
    filter_dropout = Dropout(0.5)(conc)
    output = Dense(units=y_train.shape[1], activation='softmax')(filter_dropout)
    
    model = Model(inputs=inp, outputs=output)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


biGRU 2x 50d - min losses - val 0.2335 - 2.5k pred/s

3x 100d Conv max pool - min losses - val 0.1627 - train 0.1702 - 20k pred/s

3x 100d Conv maxavg pool - min losses - val 0.1639 - train 0.1702 - 15k pred/s

biGRU 2x 6d highway with embed in maxavg pooling - val 0.1517 - train 0.1527 - 4.4k pred/s

biGRU 3x 7d highway without embed in maxavg pooling - val 0.1507 - train 0.1511 - 8.5k pred/s

biGRU 2x 6d full highway maxavg pooling - val 0.1496 - train 0.1496 - 1k pred/s

In [16]:
model = get_rnn_model()

In [17]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 100)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 100, 300)     30000300    input_1[0][0]                    
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 100, 14)      12936       embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 100, 14)      924         bidirectional[0][0]              
__________________________________________________________________________________________________
bidirectio

In [20]:
batch_size = 2048
epochs = 5

callbacks = [
#     RocAuc,
    TQDMNotebookCallback(leave_inner=True, leave_outer=True),
    ModelCheckpoint(filepath='keras_model.hdf5', verbose=1, save_best_only=True),
]

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=callbacks, verbose=2)


Train on 4299650 samples, validate on 226298 samples


HBox(children=(IntProgress(value=0, description='Training', max=5), HTML(value='')))

Epoch 1/5


HBox(children=(IntProgress(value=0, description='Epoch 0', max=4299650), HTML(value='')))

 - 1213s - loss: 0.1512 - acc: 0.9437 - val_loss: 0.1517 - val_acc: 0.9435

Epoch 00001: val_loss improved from inf to 0.15171, saving model to keras_model.hdf5
Epoch 2/5


HBox(children=(IntProgress(value=0, description='Epoch 1', max=4299650), HTML(value='')))

 - 1198s - loss: 0.1508 - acc: 0.9438 - val_loss: 0.1512 - val_acc: 0.9435

Epoch 00002: val_loss improved from 0.15171 to 0.15117, saving model to keras_model.hdf5
Epoch 3/5


HBox(children=(IntProgress(value=0, description='Epoch 2', max=4299650), HTML(value='')))

 - 1192s - loss: 0.1505 - acc: 0.9439 - val_loss: 0.1511 - val_acc: 0.9436

Epoch 00003: val_loss improved from 0.15117 to 0.15108, saving model to keras_model.hdf5
Epoch 4/5


HBox(children=(IntProgress(value=0, description='Epoch 3', max=4299650), HTML(value='')))

 - 1203s - loss: 0.1502 - acc: 0.9440 - val_loss: 0.1501 - val_acc: 0.9439

Epoch 00004: val_loss improved from 0.15108 to 0.15013, saving model to keras_model.hdf5
Epoch 5/5


HBox(children=(IntProgress(value=0, description='Epoch 4', max=4299650), HTML(value='')))

 - 1199s - loss: 0.1499 - acc: 0.9440 - val_loss: 0.1502 - val_acc: 0.9440

Epoch 00005: val_loss did not improve from 0.15013



In [23]:
%%timeit
y_pred = model.predict(x_dev, batch_size=1024)

1.99 s ± 9.41 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
10000 / 1.89

5291.005291005291

In [20]:
import tensorflow as tf

In [24]:
# https://medium.com/tensorflow/training-and-serving-ml-models-with-tf-keras-fd975cc0fa27
tf.keras.backend.set_learning_phase(0)
tf_model = tf.keras.models.load_model('keras_model.hdf5')

In [25]:
try:
    existing_models = os.listdir("out/sentiment/")
    model_num = 1 if len(existing_models) == 0 else max(map(int, existing_models)) + 1
except FileNotFoundError:
    model_num = 1

In [26]:
export_path = f'out/sentiment/{model_num}'

with tf.keras.backend.get_session() as sess:
    tf.saved_model.simple_save(
        sess, 
        export_path, 
        inputs={'input_idxs': tf_model.input}, 
        outputs={t.name: t for t in tf_model.outputs}
    )

INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: out/sentiment/4/saved_model.pb


In [27]:
with open(export_path + '/vocab.json', 'w') as outfile:
    json.dump({k: v for k, v in tokenizer.word_index.items() if v < max_features}, outfile)