In [None]:
# Mount Google Drive inside the Colab runtime. GloVe_PATH and DATA_PATH (configuration cell)
# point below this mount point, so this cell has to run before anything reads them.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

# --- model input ---------------------------------------------------------------
# Number of token ids per (padded) document.
MAX_LEN = 512
# This value is fixed since we are using pre-trained 200-d GloVe word embeddings.
EMBEDDING_DIM = 200

# --- input files (on the mounted Google Drive) -----------------------------------
GloVe_PATH = '/content/drive/MyDrive/NLP smtg/data/glove.6B.200d.txt'
DATA_PATH = dict(
    Yelp2015='/content/drive/MyDrive/NLP smtg/data/yelp_2015.csv',
)

# --- output locations (relative to the working directory) ------------------------
TFRECORD_DIR = './tfrecord'
MODEL_DIR = './saved_model'
BEST_MODEL_PATH = os.path.join(MODEL_DIR, 'best_model.ckpt')

# The tokenizer and the best checkpoint are both written below MODEL_DIR.
os.makedirs(MODEL_DIR, exist_ok=True)

In [None]:
import numpy as np
import tensorflow as tf
import os
import pandas as pd
from tqdm import tqdm
import shutil
import pickle
import argparse
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.utils import generic_utils
from tensorflow.python.ops import array_ops
from tensorflow.python.framework import tensor_shape
from tensorflow.keras.layers import Embedding, Dense, GRU, Input

In [None]:
def _bytes_features(value):
    '''Wrap one bytes value in a tf.train.Feature holding a single-element BytesList.'''
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

In [None]:
def _int64_features(value):
    '''Wrap one integer value in a tf.train.Feature holding a single-element Int64List.'''
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

In [None]:
class TrainDataset:
    '''
    Train / validation / test data pipeline backed by TFRecord files.

    When `args.create_tfrecord` is set, the source csv is shuffled and split, a tokenizer is fitted on the
    training texts, and the three splits plus the pickled tokenizer are written to TFRECORD_DIR / MODEL_DIR.
    Otherwise the files written by a previous run are reused.
    '''
    def __init__(self, args):
        '''
        :param args: namespace providing `batch_size` and `create_tfrecord`; when `create_tfrecord` is set
                     it must also provide `dataset`, `val_size`, `test_size`, `max_num_words` and, for the
                     'Custom' dataset, `train_data_path`.
        :raises AssertionError: if the source csv, one of the three tfrecord files or the tokenizer is missing.
        '''
        self.batch_size = args.batch_size
        self.train_record_path = os.path.join(TFRECORD_DIR, 'train.tfrecord')
        self.val_record_path = os.path.join(TFRECORD_DIR, 'val.tfrecord')
        self.test_record_path = os.path.join(TFRECORD_DIR, 'test.tfrecord')
        self.tokenizer_path = os.path.join(MODEL_DIR, 'tokenizer.bin')
        if args.create_tfrecord:
            print('reading data')
            if args.dataset == 'Custom':
                # A custom csv is used as is: column 0 is read as the label and column 1 as the text
                # (see create_tfrecord).
                assert os.path.isfile(args.train_data_path)
                val_df, test_df, train_df = \
                    self.split_csv(args.train_data_path, [args.val_size, args.test_size, 1 - args.val_size - args.test_size])
                train_data = train_df.to_numpy()
                val_data = val_df.to_numpy()
                test_data = test_df.to_numpy()
            else:
                # For the Yelp datasets only a single csv is provided, so it goes through the same
                # val / test / train split as a custom csv. Afterwards only the 'stars' and 'text'
                # columns are kept, with the stars shifted to start from 0.
                assert os.path.isfile(DATA_PATH[args.dataset])
                val_df, test_df, train_df = \
                    self.split_csv(DATA_PATH[args.dataset], [args.val_size, args.test_size, 1 - args.val_size - args.test_size])
                train_data = self.preprocess_yelp(train_df)
                val_data = self.preprocess_yelp(val_df)
                test_data = self.preprocess_yelp(test_df)
            del train_df, val_df, test_df
            print('     Done')
            print('start fitting tokenizer...')
            # The vocabulary is built from the training texts only.
            tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=args.max_num_words)
            tokenizer.fit_on_texts(texts=train_data[:, 1])
            print('     Done')
            # Start from an empty directory so that records of a previous dataset cannot survive.
            shutil.rmtree(TFRECORD_DIR, ignore_errors=True)
            os.makedirs(TFRECORD_DIR, exist_ok=False)
            self.create_tfrecord(train_data, self.train_record_path, tokenizer)
            self.create_tfrecord(val_data, self.val_record_path, tokenizer)
            self.create_tfrecord(test_data, self.test_record_path, tokenizer)
            with open(self.tokenizer_path, 'wb') as f:
                pickle.dump(tokenizer, f)
            del train_data, val_data, test_data, tokenizer
        self.record_path = {
            'train': self.train_record_path,
            'val': self.val_record_path,
            'test': self.test_record_path
        }
        assert os.path.isfile(self.train_record_path) \
               and os.path.isfile(self.val_record_path) \
               and os.path.isfile(self.test_record_path) \
               and os.path.isfile(self.tokenizer_path)


    def get_datasets(self, split='train'):
        '''
        :param split: one of 'train', 'val', 'test'.
        :return: a batched tf.data dataset yielding (sequence, label) pairs, where every sequence is the
                 int32 token-id vector stored by create_tfrecord reshaped to (8, 8, 8).
        '''
        assert split in self.record_path

        def decode_fn(example):
            feature = tf.io.parse_single_example(
                example,
                features={
                    'sequence': tf.io.FixedLenFeature([], dtype=tf.string),
                    'label': tf.io.FixedLenFeature([], dtype=tf.int64),
                }
            )
            # The sequence was serialised from an int32 array with tobytes() (see create_tfrecord).
            sequence = tf.io.decode_raw(feature['sequence'], tf.int32)
            return tf.reshape(sequence, (8, 8, 8)), feature['label']

        dataset = tf.data.TFRecordDataset([self.record_path[split]])
        dataset = dataset.map(decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        return dataset.batch(self.batch_size)

    def get_vocabulary(self):
        '''
        :return: the `word_index` mapping of the pickled tokenizer.
        '''
        # NOTE: pickle.load executes arbitrary code; only load tokenizer files written by this class.
        with open(self.tokenizer_path, 'rb') as f:
            tokenizer = pickle.load(f)
        return tokenizer.word_index


    def create_tfrecord(self, data, save_path, tokenizer):
        '''
        Tokenize, pad to MAX_LEN and serialise every sample of `data` into a TFRecord file.

        :param data: 2-D array whose column 0 holds the labels and column 1 the texts.
        :param save_path: path of the tfrecord file to write.
        :param tokenizer: fitted tokenizer used to turn the texts into id sequences.
        '''
        with tf.io.TFRecordWriter(save_path) as writer:
            label = data[:, 0].astype(np.int32)
            text = data[:, 1]
            text_ids = tokenizer.texts_to_sequences(text)
            text_pad = tf.keras.preprocessing.sequence.pad_sequences(text_ids, maxlen=MAX_LEN).astype(np.int32)
            for idx in range(label.shape[0]):
                feature = {
                    'sequence': _bytes_features(text_pad[idx].tobytes()),
                    'label': _int64_features(label[idx]),
                }
                msg = tf.train.Example(features=tf.train.Features(feature=feature)).SerializeToString()
                writer.write(msg)

    def split_csv(self, csv_path, sizes):
        '''
        Load a csv, drop rows containing NaN, shuffle with a fixed seed and cut it into consecutive parts.

        :param csv_path: path of the csv file.
        :param sizes: list or tuple of fractions that sum to 1, one per returned part.
        :return: list of DataFrames, in the order of `sizes`; together they contain every remaining row
                 exactly once.
        '''
        assert (isinstance(sizes, list) or isinstance(sizes, tuple)) and np.isclose(np.sum(sizes), 1)
        csv_data = pd.read_csv(csv_path, index_col=0)
        # Yelp dataset contains index column but most custom data doesn't.
        if csv_data.shape[1] == 1:
            csv_data = pd.read_csv(csv_path, index_col=None)
        s1 = csv_data.shape[0]
        csv_data = csv_data.dropna()
        if csv_data.shape[0] != s1:
            print('drop {} nan samples'.format(s1 - csv_data.shape[0]))
        csv_data = csv_data.sample(frac=1, random_state=0)
        num_rows = csv_data.shape[0]
        bounds = [int(s * num_rows) for s in np.cumsum(sizes)]
        # The cumulative sum can end slightly below 1 (e.g. 0.7 + 0.2 + 0.1 == 0.9999999999999999), which
        # would truncate the last boundary to num_rows - 1 and silently drop a sample; pin it instead.
        bounds[-1] = num_rows
        i0 = 0
        dfs = []
        for i1 in bounds:
            dfs.append(csv_data.iloc[i0:i1])
            i0 = i1
        return dfs

    def preprocess_yelp(self, df):
        '''
        :param df: DataFrame with a 'stars' and a 'text' column.
        :return: array of shape (len(df), 2) holding [label, text] per row.
        '''
        # the class index of yelp dataset start from 1, we need to make it start from 0
        label = df['stars'].to_numpy() - 1
        text = df['text'].to_numpy()
        data = np.concatenate([label[:, np.newaxis], text[:, np.newaxis]], axis=1)
        return data


class TestDataset:
    '''
    Inference-time input: every line of `args.data_path` is one text, converted with the pickled
    tokenizer into an int32 id sequence padded to MAX_LEN and reshaped to (8, 8, 8).
    '''
    def __init__(self, args):
        '''
        :param args: namespace providing `data_path`, the text file to read (one sample per line).
        '''
        assert os.path.isfile(args.data_path)
        self.tokenizer_path = os.path.join(MODEL_DIR, 'tokenizer.bin')
        with open(args.data_path) as source:
            text = source.readlines()
        with open(self.tokenizer_path, 'rb') as blob:
            tokenizer = pickle.load(blob)
        padded = tf.keras.preprocessing.sequence.pad_sequences(
            tokenizer.texts_to_sequences(text), maxlen=MAX_LEN)
        # One (8, 8, 8) block of token ids per input line.
        self.text_ids_pad = padded.astype(np.int32).reshape((-1, 8, 8, 8))

    def get_text_sequence(self):
        '''
        :return: the padded id array built in __init__.
        '''
        return self.text_ids_pad

In [None]:
class ModelTimeDistributed(tf.keras.layers.Wrapper):
    '''
    Simplified module of tf.keras.layers.TimeDistributed for tf.keras.Model,
    since in current Tensorflow version, the TimeDistributed Model has bug when computing output shape of tf.keras.Model or GRU module

    call() folds axis 1 of the input into the batch axis, applies the wrapped model once to the folded
    tensor and then restores axis 1 on the result.
    '''
    def __init__(self, layer, **kwargs):
        '''
        :param layer: the tf.keras.Model applied to every slice along axis 1.
        :raises ValueError: if `layer` is not a tf.keras.Model.
        '''
        if not isinstance(layer, tf.keras.Model):
            raise ValueError(
                'Please initialize `ModelTimeDistributed` layer with a '
                '`tf.keras.Model` instance. You passed: {input}'.format(
                    input=layer))
        super(ModelTimeDistributed, self).__init__(layer, **kwargs)

    def _get_shape_tuple(self, init_tuple, tensor, start_idx, int_shape=None):
        '''
        Build `init_tuple + shape-of-tensor-from-start_idx`, using static dimensions where they are known.

        :param init_tuple: leading dimensions to prepend.
        :param tensor: tensor whose trailing dimensions are appended.
        :param start_idx: first axis of `tensor` to take.
        :param int_shape: optional static shape to use instead of K.int_shape(tensor)[start_idx:].
        '''
        # replace all None in int_shape by K.shape
        if int_shape is None:
            int_shape = K.int_shape(tensor)[start_idx:]
        if not any(not s for s in int_shape):
            return init_tuple + tuple(int_shape)
        shape = K.shape(tensor)
        int_shape = list(int_shape)
        for i, s in enumerate(int_shape):
            if not s:
                int_shape[i] = shape[start_idx + i]
        return init_tuple + tuple(int_shape)

    def compute_output_shape(self, input_shape):
        '''
        :return: TensorShape [batch, timesteps] + the wrapped model's output shape without its batch axis.
        '''
        input_shape = tensor_shape.TensorShape(input_shape).as_list()
        child_input_shape = tensor_shape.TensorShape([input_shape[0]] +
                                                     input_shape[2:])
        timesteps = input_shape[1]
        # The wrapped model's own `output_shape` attribute is used here rather than its
        # compute_output_shape() (see the class docstring for the reason).
        return tensor_shape.TensorShape([child_input_shape[0], timesteps] +
                                        list(self.layer.output_shape[1:]))

    def call(self, inputs, training=None, mask=None):
        kwargs = {}
        # Forward `training` / `mask` only when the wrapped model's call() accepts them.
        if generic_utils.has_arg(self.layer.call, 'training'):
            kwargs['training'] = training

        input_shape = K.int_shape(inputs)
        input_length = input_shape[1]
        if not input_length:
            # Axis 1 is not statically known; fall back to the dynamic shape.
            input_length = array_ops.shape(inputs)[1]
        inner_input_shape = self._get_shape_tuple((-1,), inputs, 2)
        # Shape: (num_samples * timesteps, ...)
        inputs = array_ops.reshape(inputs, inner_input_shape)
        if generic_utils.has_arg(self.layer.call, 'mask') and mask is not None:
            inner_mask_shape = self._get_shape_tuple((-1,), mask, 2)
            kwargs['mask'] = K.reshape(mask, inner_mask_shape)

        y = self.layer(inputs, **kwargs)

        # Shape: (num_samples, timesteps, ...)
        output_shape = self.compute_output_shape(input_shape).as_list()
        output_shape = self._get_shape_tuple((-1, input_length), y, 1,
                                             output_shape[2:])
        y = array_ops.reshape(y, output_shape)

        return y

In [None]:
def create_model(args, vocab=None):
    '''
    Build the sliced RNN ('srnn') or the plain GRU baseline over inputs of shape (-1, 8, 8, MAX_LEN // 64).

    :param args: namespace providing `max_num_words`, `num_filters`, `num_class` and, optionally, `model`
                 ('srnn' selects the sliced RNN, any other value the plain GRU; 'srnn' is assumed when the
                 attribute is absent).
    :param vocab: word -> index mapping of the fitted tokenizer. When given, the embedding rows of the
                  words found in the GloVe file are initialised from the pre-trained vectors. When None,
                  the embedding keeps its default initialisation and the GloVe file is not needed
                  (e.g. because trained weights are loaded afterwards).
    :return: an uncompiled tf.keras.Model ending in a softmax over `num_class` classes.
    :raises FileNotFoundError: if `vocab` is given but GloVe_PATH does not exist.
    '''
    num_embeddings = args.max_num_words + 1
    model_name = getattr(args, 'model', 'srnn')
    chunk_len = MAX_LEN // 64
    # The embedding sees chunk_len-token slices in the sliced model, but the whole flattened
    # sequence in the plain one.
    input_length = chunk_len if model_name == 'srnn' else MAX_LEN

    if vocab is not None:
        print("Loading GloVe embeddings")
        if not os.path.isfile(GloVe_PATH):
            raise FileNotFoundError("Can not find the GloVe embeddings at {}. Please download "
                                    "glove.6B.200d.txt from https://nlp.stanford.edu/projects/glove/ "
                                    "and save it to that path".format(GloVe_PATH))

        # Only the words the embedding layer can address are worth parsing.
        wanted = {word: i for word, i in vocab.items() if i < args.max_num_words}

        # use pre-trained GloVe word embeddings to initialize the embedding layer;
        # words not found in the GloVe file keep their random initialisation.
        embedding_matrix = np.random.random((num_embeddings, EMBEDDING_DIM))
        # Stream the file instead of holding all of its lines and vectors in memory.
        with open(GloVe_PATH, encoding='utf-8') as f:
            for line in tqdm(f):
                values = line.split()
                if not values:
                    continue
                row = wanted.get(values[0])
                if row is not None:
                    embedding_matrix[row] = np.asarray(values[1:], dtype='float32')
        print('Finish Loading GloVe embeddings')

        embedding_layer = Embedding(num_embeddings,
                                    EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=input_length,
                                    trainable=True)
    else:
        embedding_layer = Embedding(num_embeddings,
                                    EMBEDDING_DIM,
                                    input_length=input_length,
                                    trainable=True)

    def new_gru():
        # Every recurrent layer of both architectures uses the same configuration.
        return tf.keras.layers.GRU(args.num_filters,
                                   return_sequences=False,
                                   activation=None,
                                   recurrent_activation='sigmoid')

    if model_name == 'srnn':
        # (-1, 8)
        input1 = tf.keras.layers.Input(shape=(chunk_len,), dtype=tf.int32)
        # (-1, 8, EMBEDDING_DIM)
        embed1 = embedding_layer(input1)
        # (-1, num_filters)
        gru1 = new_gru()(embed1)
        encoder1 = tf.keras.Model(input1, gru1)
        # (-1, 8, 8)
        input2 = tf.keras.layers.Input(shape=(8, chunk_len), dtype=tf.int32)
        # (-1, 8, num_filters)
        embed2 = ModelTimeDistributed(encoder1)(input2)
        # (-1, num_filters)
        gru2 = new_gru()(embed2)
        encoder2 = tf.keras.Model(input2, gru2)
        # (-1, 8, 8, 8)
        input3 = tf.keras.layers.Input(shape=(8, 8, chunk_len), dtype=tf.int32)
        # (-1, 8, num_filters)
        embed3 = ModelTimeDistributed(encoder2)(input3)
        # (-1, num_filters)
        gru3 = new_gru()(embed3)
        # (-1, num_class)
        pred = tf.keras.layers.Dense(args.num_class, activation='softmax')(gru3)
        model = tf.keras.Model(input3, pred)
    else:
        # (-1, 8, 8, 8)
        inputs = tf.keras.layers.Input(shape=(8, 8, chunk_len), dtype=tf.int32)
        # (-1, MAX_LEN)
        input_flatten = tf.reshape(inputs, (-1, MAX_LEN))
        # (-1, MAX_LEN, EMBEDDING_DIM)
        embed = embedding_layer(input_flatten)
        # (-1, num_filters)
        gru = new_gru()(embed)
        # (-1, num_class)
        pred = tf.keras.layers.Dense(args.num_class, activation='softmax')(gru)
        model = tf.keras.Model(inputs, pred)
    return model

In [None]:
# Ask TensorFlow's native (C++) logging to print errors only.
# NOTE(review): `tensorflow` is already imported by an earlier cell; this variable is presumably read
# when TensorFlow is first loaded, so it may have no effect here - consider setting it before the
# `import tensorflow` line and confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [None]:
# Command line options for inference. The model-shape options (--model, --max_num_words, --num_class,
# --num_filters) have to match the values the weights were trained with.
parser = argparse.ArgumentParser()
parser.add_argument('--data_path', type=str, help='path of the file containing the test sentences')
parser.add_argument('--output_path', type=str, help='path to the file to save the output prediction')
parser.add_argument('--model_path', type=str, help='')
# create_model() reads `args.model`; without this option the namespace has no such attribute.
parser.add_argument('--model', type=str, default='srnn', choices=['srnn', 'naive_rnn'],
                    help='architecture of the trained model')
parser.add_argument('--batch_size', type=int, default=2048)
parser.add_argument('--max_num_words', type=int, default=30000)
parser.add_argument('--num_class', type=int, default=5)
parser.add_argument('--num_filters', type=int, default=50, help='hidden size of the RNN')

_StoreAction(option_strings=['--num_filters'], dest='num_filters', nargs=None, const=None, default=50, type=<class 'int'>, choices=None, required=False, help='hidden size of the RNN', metavar=None)

In [None]:
def inference(args):
    '''
    Predict a class for every line of `args.data_path` and write one prediction per line to
    `args.output_path`.

    :param args: namespace providing `data_path`, `output_path`, the options create_model() needs and,
                 optionally, `model_path`: the weights to load (BEST_MODEL_PATH when unset or empty).
    '''
    text_sequence = TestDataset(args).get_text_sequence()
    # No vocabulary is passed: the embedding values come from the loaded weights.
    model = create_model(args, None)
    weights_path = getattr(args, 'model_path', None) or BEST_MODEL_PATH
    model.load_weights(weights_path)
    pred = model.predict(text_sequence)
    # Shift the class index by +1, the inverse of the -1 applied to the Yelp stars before training.
    # NOTE(review): labels of 'Custom' datasets are not shifted before training, so this +1 presumably
    # only suits Yelp-trained models - confirm.
    pred = np.argmax(pred, axis=-1) + 1
    with open(args.output_path, 'w') as f:
        f.writelines('{}\n'.format(p) for p in pred)

In [None]:
# Command line options for training. This rebinds `parser`, replacing the inference parser above.
parser = argparse.ArgumentParser()
parser.add_argument('--create_tfrecord', action='store_true',
                    help='''
                    Should be only called once for every dataset.
                    If set, the program would load data from csv specified by "--dataset", "--train_data_path" and
                    "--test_data_path", create tfrecord and save to "./tfrecord/train.tfrecord",
                    "./tfrecord/val.tfrecord" and "./tfrecord/test.tfrecord".
                    Otherwise, the program would ignore "--dataset", "--train_data_path" and "--test_data_path" and
                    load directly from "./tfrecord/train.tfrecord", "./tfrecord/val.tfrecord", "./tfrecord/test.tfrecord".
                    ''')
parser.add_argument('--model', type=str, default='srnn', choices=['srnn', 'naive_rnn'])
# The default has to be a dataset with an entry in DATA_PATH, otherwise a default run with
# --create_tfrecord fails with a KeyError; 'Yelp2015' is the only entry configured there.
parser.add_argument('--dataset', type=str, default='Yelp2015',
                    choices=['Yelp2013', 'Yelp2014', 'Yelp2015', 'Custom'], help='dataset to train on')
parser.add_argument('--train_data_path', type=str, default='',
                    help='if dataset is "Custom", please specify the path to the train csv')
parser.add_argument('--test_data_path', type=str, default='',
                    help='if dataset is "Custom", please specify the path to the test csv')
parser.add_argument('--val_size', type=float, default=0.1, help='validation size')
parser.add_argument('--test_size', type=float, default=0.1, help='test size if test_csv is not specified')
parser.add_argument('--num_filters', type=int, default=50, help='hidden size of the RNN')
parser.add_argument('--learning_rate', type=float, default=1e-3)
parser.add_argument('--batch_size', type=int, default=2048)
parser.add_argument('--max_num_words', type=int, default=30000)
parser.add_argument('--num_class', type=int, default=5)
parser.add_argument('--epochs', type=int, default=10)

_StoreAction(option_strings=['--epochs'], dest='epochs', nargs=None, const=None, default=10, type=<class 'int'>, choices=None, required=False, help=None, metavar=None)

In [None]:
def train(args):
    '''
    Train on the 'train' split, keep the weights with the best validation accuracy in BEST_MODEL_PATH,
    then reload those weights into a fresh model and print its evaluation on the 'test' split.

    :param args: namespace with the data options TrainDataset needs, the model options create_model needs,
                 plus `learning_rate` and `epochs`.
    '''
    def build_compiled(vocab):
        # Both the trained and the re-loaded model share the same optimizer / loss / metric setup.
        net = create_model(args, vocab)
        net.compile(optimizer=tf.keras.optimizers.Adam(args.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-08),
                    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                    metrics=['acc'])
        return net

    data = TrainDataset(args)
    train_ds = data.get_datasets('train')
    val_ds = data.get_datasets('val')
    test_ds = data.get_datasets('test')
    vocabulary = data.get_vocabulary()
    del data

    # Only the weights of the epoch with the highest validation accuracy are kept on disk.
    checkpoint = tf.keras.callbacks.ModelCheckpoint(BEST_MODEL_PATH, save_best_only=True, save_weights_only=True,
                                                    verbose=1, monitor='val_acc', mode='max')
    model = build_compiled(vocabulary)
    model.fit(train_ds, epochs=args.epochs, validation_data=val_ds, callbacks=[checkpoint])
    del model, train_ds, val_ds

    # No vocabulary here: the embedding values come from the checkpoint.
    best_model = build_compiled(None)
    best_model.load_weights(BEST_MODEL_PATH)
    print(best_model.evaluate(test_ds))

In [None]:
# Full run on Yelp 2015: rebuild the tfrecords (80/10/10 split), then train for 10 epochs.
args = parser.parse_args([
    '--dataset', 'Yelp2015',
    '--val_size', '0.1',
    '--test_size', '0.1',
    '--epochs', '10',
    '--create_tfrecord',
    '--batch_size', '2048',
])
train(args)

reading data
drop 1 nan samples
     Done
start fitting tokenizer...
     Done
Loading GloVe embeddings


100%|██████████| 400000/400000 [00:16<00:00, 24294.15it/s]


Finish Loading GloVe embeddings
Epoch 1/10
    351/Unknown - 3693s 10s/step - loss: 0.8615 - acc: 0.6393
Epoch 1: val_acc improved from -inf to 0.69065, saving model to ./saved_model/best_model.ckpt
Epoch 2/10
Epoch 2: val_acc improved from 0.69065 to 0.70297, saving model to ./saved_model/best_model.ckpt
Epoch 3/10
Epoch 3: val_acc improved from 0.70297 to 0.71209, saving model to ./saved_model/best_model.ckpt
Epoch 4/10
Epoch 4: val_acc improved from 0.71209 to 0.71723, saving model to ./saved_model/best_model.ckpt
Epoch 5/10
 43/351 [==>...........................] - ETA: 51:33 - loss: 0.6139 - acc: 0.7406