In [1]:
import numpy as np
np.random.seed(622)
import pandas as pd

import os
import re
import json
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import tensorflow as tf

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from tensorflow.keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import Callback
import sqlite3
from tensorflow.keras.callbacks import ModelCheckpoint
from keras_tqdm import TQDMNotebookCallback

from tqdm import tqdm_notebook as tqdm
import toolz

import warnings
warnings.filterwarnings('ignore')

EMBEDDING_FILE = 'embeddings.txt'


Using TensorFlow backend.


In [2]:
%%time
conn = sqlite3.connect('reviews.sqlite')
# all_data = pd.read_sql('select body, rating from reviews order by created_at asc limit 100000', conn)
# train = all_data.iloc[:80000]
# dev = all_data.iloc[80000:]
all_data = pd.read_sql('select body, rating from reviews limit 100000', conn)

CPU times: user 111 ms, sys: 59.5 ms, total: 171 ms
Wall time: 170 ms


In [3]:
train = all_data.iloc[:9000]
dev = all_data.iloc[-1000:]

In [4]:

token_pattern = r"[\w']+|[,\.\?;\-\(\)]"
regex = re.compile(token_pattern)

def tokenize(text):
    return regex.findall(text.lower())



In [5]:
tokenize("This is how we do it.  It's Friday night, and I feel all right, and the party's here on the west side.")

['this',
 'is',
 'how',
 'we',
 'do',
 'it',
 '.',
 "it's",
 'friday',
 'night',
 ',',
 'and',
 'i',
 'feel',
 'all',
 'right',
 ',',
 'and',
 'the',
 "party's",
 'here',
 'on',
 'the',
 'west',
 'side',
 '.']

In [6]:
%%time

def to_sentiment_class(rating):
    if rating < 4:
        return 0
    if rating == 4:
        return 1
    else:
        return 2

X_train = train["body"].fillna("fillna").values
y_train = to_categorical(train["rating"].apply(to_sentiment_class).values)
X_dev = dev["body"].fillna("fillna").values

max_features = 100000
maxlen = 100
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features, filters='', lower=False)
tokenizer.fit_on_texts([' '.join(tokenize(doc)) for doc in list(X_train) + list(X_dev)])
X_train = tokenizer.texts_to_sequences(X_train)
X_dev = tokenizer.texts_to_sequences(X_dev)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_dev = sequence.pad_sequences(X_dev, maxlen=maxlen)


CPU times: user 506 ms, sys: 4.61 ms, total: 510 ms
Wall time: 509 ms


In [7]:
y_train.sum(axis=0)

array([ 381.,  837., 7782.], dtype=float32)

In [8]:
sequence.pad_sequences(tokenizer.texts_to_sequences([' '.join(tokenize('We had a great time.'))]), maxlen=maxlen)

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0, 131,  27,   7,  13,  48,   1]], dtype=int32)

In [9]:
sequence.pad_sequences(tokenizer.texts_to_sequences(['Beware, these people are scammers!  They are only trying to take your money!']), maxlen=maxlen)

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        189, 351,  31,  31,  97, 498,   6, 347, 110]], dtype=int32)

In [10]:
%%time

def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')

embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) 
                        for o in tqdm(toolz.take(100000, open(EMBEDDING_FILE)), 
                                      total=1999996))


HBox(children=(IntProgress(value=0, max=1999996), HTML(value='')))


CPU times: user 6.1 s, sys: 163 ms, total: 6.26 s
Wall time: 6.07 s


In [11]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.zeros((nb_words + 1, embed_size))
for word, i in tqdm(word_index.items()):
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector


HBox(children=(IntProgress(value=0, max=9560), HTML(value='')))




In [12]:

class RocAucEvaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


In [13]:
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.95, random_state=622)
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)


In [14]:
embedding_matrix.shape

(9561, 300)

In [15]:
batch_size = 512
epochs = 1

def get_model():
    inp = Input(shape=(maxlen, ), dtype=tf.int64)
    x = Embedding(min([max_features, len(embedding_matrix)]), embed_size, 
                  weights=[embedding_matrix])(inp)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    x = Bidirectional(GRU(100, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(3, activation="softmax")(conc)
    
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


In [16]:
model = get_model()

In [17]:

callbacks = [
    RocAuc,
    TQDMNotebookCallback(leave_inner=True, leave_outer=True),
    ModelCheckpoint(filepath='keras_model.hdf5', verbose=1, save_best_only=True),
]

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                 callbacks=callbacks, verbose=2)


Train on 8550 samples, validate on 450 samples


HBox(children=(IntProgress(value=0, description='Training', max=1), HTML(value='')))

Epoch 1/1


HBox(children=(IntProgress(value=0, description='Epoch 0', max=8550), HTML(value='')))

 - 15s - loss: 0.3736 - acc: 0.8692 - val_loss: 0.3021 - val_acc: 0.9081

 ROC-AUC - epoch: 1 - score: 0.683392 


Epoch 00001: val_loss improved from inf to 0.30207, saving model to keras_model.hdf5



In [18]:
%%time
y_pred = model.predict(x_dev, batch_size=512)
print(y_pred.shape)

(1000, 3)
CPU times: user 672 ms, sys: 77.7 ms, total: 749 ms
Wall time: 486 ms


In [19]:
10000 / 4

2500.0

In [20]:
import tensorflow as tf

In [21]:
# https://medium.com/tensorflow/training-and-serving-ml-models-with-tf-keras-fd975cc0fa27
tf.keras.backend.set_learning_phase(0)
tf_model = tf.keras.models.load_model('keras_model.hdf5')

In [22]:
try:
    existing_models = os.listdir("out/sentiment/")
    model_num = 1 if len(existing_models) == 0 else max(map(int, existing_models)) + 1
except FileNotFoundError:
    model_num = 1

In [23]:
export_path = f'out/sentiment/{model_num}'

with tf.keras.backend.get_session() as sess:
    tf.saved_model.simple_save(
        sess, 
        export_path, 
        inputs={'input_idxs': tf_model.input}, 
        outputs={t.name: t for t in tf_model.outputs}
    )

INFO:tensorflow:Assets added to graph.
INFO:tensorflow:No assets to write.
INFO:tensorflow:SavedModel written to: out/sentiment/2/saved_model.pb
