In [None]:
import pandas as pd
pd.options.display.max_columns = None
pd.options.mode.chained_assignment = None
import numpy as np

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
sns.set('talk')
sns.set_style('white')

import re
import emoji

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import Callback, ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import to_categorical

import transformers
from transformers import TFAutoModel, AutoTokenizer

from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import StratifiedShuffleSplit

## Helper Functions

In [None]:
import os

import builtins
import subprocess

# Bind to the real built-in explicitly. Using the bare name `print` here would
# capture the wrapper below when this cell is re-run, making it call itself
# recursively.
__print__ = builtins.print


def print(string):
    """Print `string` to the notebook output and echo it to the process stdout.

    The echo is run as a child process so the message also shows up in the
    Kaggle session logs.

    Args:
        string: Object to print; it is converted with `str()` for the echo.
    """
    # Pass the text as a single argv entry rather than interpolating it into a
    # shell command, so quotes, `$`, backticks etc. are emitted literally and
    # are never interpreted by a shell.
    subprocess.run(['echo', str(string)], check=False)
    __print__(string)

In [None]:
from datetime import datetime
from pytz import timezone, utc

SGT = timezone('Asia/Singapore')


class PrintSystemLogPerEpoch(Callback):
    """Keras callback that prints a Singapore-time timestamp around each epoch.

    At the start of an epoch it prints the start time; at the end it prints the
    end time together with the training and validation loss taken from `logs`.
    """

    @staticmethod
    def _now_sgt():
        """Return the current wall-clock time in the SGT timezone."""
        return datetime.now(SGT).time()

    def on_epoch_begin(self, epoch, logs=None):
        """Print the start time of the (1-based) epoch."""
        print(f'* [Epoch {epoch+1}] begins at {self._now_sgt()}')

    def on_epoch_end(self, epoch, logs=None):
        """Print the end time and losses of the (1-based) epoch.

        Missing metrics (e.g. no `val_loss` when training without validation
        data) are reported as `nan` instead of raising `KeyError`.
        """
        logs = logs or {}
        loss = logs.get('loss', float('nan'))
        val_loss = logs.get('val_loss', float('nan'))
        print(f'\n* [Epoch {epoch+1}] ends at {self._now_sgt()} | loss={loss:0.4f}, val_loss={val_loss:0.4f}')

In [None]:
# Fast encoding
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """Encode `texts` in chunks with a tokenizer exposing `encode_batch`.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    Args:
        texts: Sliceable sequence of strings. NumPy arrays / pandas Series
            (anything whose slices offer `.tolist()`) as well as plain lists
            and tuples are accepted.
        tokenizer: Object providing `enable_truncation`, `enable_padding` and
            `encode_batch`; truncation and padding are both set to `maxlen`.
        chunk_size: Number of texts passed to `encode_batch` per call.
        maxlen: Length used for both truncation and padding.

    Returns:
        `np.ndarray` built from the `.ids` of every encoding, in input order.
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []

    for start in tqdm(range(0, len(texts), chunk_size)):
        chunk = texts[start:start + chunk_size]
        # Array-likes are converted with .tolist() (yielding plain Python
        # strings); ordinary sequences are simply copied into a list.
        text_chunk = chunk.tolist() if hasattr(chunk, 'tolist') else list(chunk)
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend(enc.ids for enc in encs)

    return np.array(all_ids)

In [None]:
# Regular encoding
def regular_encode(texts, tokenizer, maxlen=512):
    """Tokenize `texts` into a fixed-width matrix of input ids.

    Every text is truncated and padded to exactly `maxlen` tokens.

    Args:
        texts: Iterable of strings (list, NumPy array, pandas Series, ...).
        tokenizer: Tokenizer providing `batch_encode_plus`.
        maxlen: Target sequence length for truncation and padding.

    Returns:
        `np.ndarray` built from the `input_ids` entry of the tokenizer output.
    """
    # Hand the tokenizer a plain list of Python strings regardless of the
    # container type the caller used.
    text_list = texts.tolist() if hasattr(texts, 'tolist') else list(texts)

    enc_di = tokenizer.batch_encode_plus(
        text_list,
        # Only the input ids are used below, so skip the other outputs.
        return_attention_mask=False,
        return_token_type_ids=False,
        # `padding='max_length'` is the replacement for the deprecated
        # `pad_to_max_length=True` flag.
        padding='max_length',
        max_length=maxlen,
        truncation=True,
    )

    return np.array(enc_di['input_ids'])

## TPU Configs

In [None]:
# Pick the distribution strategy that matches the hardware we are running on.
try:
    # The resolver needs no arguments when the TPU_NAME environment variable
    # is set, which is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU {}'.format(tpu.master()))
except ValueError:
    # A ValueError here is treated as "no TPU attached".
    tpu = None

if not tpu:
    # TensorFlow's default strategy; works on CPU and on a single GPU.
    strategy = tf.distribute.get_strategy()
else:
    # Connect to the TPU cluster and initialise it before building the strategy.
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)

print("REPLICAS: {}".format(strategy.num_replicas_in_sync))

In [None]:
# Short alias for tf.data's AUTOTUNE constant.
AUTO = tf.data.experimental.AUTOTUNE

# Data access
# NOTE(review): `KaggleDatasets` is not imported in any visible cell, so this line
# raises NameError on a fresh kernel. Presumably
# `from kaggle_datasets import KaggleDatasets` is missing — confirm.
# GCS_DS_PATH itself is not referenced by any visible cell.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

# Configuration
# EPOCHS and RANDOM_SEED are not referenced by any visible cell.
EPOCHS = 5
# Global batch size: 128 samples per replica times the number of replicas in sync.
BATCH_SIZE = 128 * strategy.num_replicas_in_sync
# Sequence length passed to regular_encode (maxlen) and build_model (max_len).
MAX_LEN = 64
RANDOM_SEED = 2020
# Checkpoint name handed to AutoTokenizer / TFAutoModel `from_pretrained`.
MODEL = 'jplu/tf-xlm-roberta-large'

# Number of units in the softmax output layer created by build_model.
NUM_CLASSES = 5

In [None]:
def build_model(transformer, max_len=512, learning_rate=2e-5):
    """Build and compile a classifier on top of a transformer encoder.

    Adapted from
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras

    The model takes a batch of `max_len` token ids, runs them through
    `transformer`, takes position 0 of the first transformer output and feeds
    it into a softmax `Dense` layer with `NUM_CLASSES` units.

    Args:
        transformer: Callable Keras-compatible encoder whose first output is
            indexed as `[:, 0, :]`.
        max_len: Number of token ids per input example.
        learning_rate: Learning rate for the Adam optimizer.

    Returns:
        A compiled `Model` (categorical cross-entropy loss, accuracy metric).
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    # Keep only the vector at the first sequence position.
    cls_token = sequence_output[:, 0, :]
    out = Dense(NUM_CLASSES, activation='softmax')(cls_token)

    model = Model(inputs=input_word_ids, outputs=out)
    # `learning_rate` is the supported argument name; `lr` is a deprecated alias.
    model.compile(
        Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

## Load tokenizer

In [None]:
# Load the pretrained tokenizer for the checkpoint named by MODEL;
# it is passed to regular_encode to tokenize the train and test reviews.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

## Load text data into memory

In [None]:
# Read the train and test CSV files into DataFrames.
# The `review` column of each frame is what gets tokenized further down.
data = pd.read_csv('../input/sentimenttranslated/train_translated.csv')
test = pd.read_csv('../input/sentimenttranslated/test_translated.csv')

In [None]:
# Tokenize every training review into an array of input ids,
# truncated/padded to MAX_LEN tokens by regular_encode.
full_data = regular_encode(data['review'].values, tokenizer, maxlen=MAX_LEN)
# Tokenize the test reviews the same way.
x_test = regular_encode(test['review'].values, tokenizer, maxlen=MAX_LEN)

## Build datasets objects

In [None]:
# Wrap the encoded training reviews in a tf.data pipeline, batched by BATCH_SIZE.
full_dataset = tf.data.Dataset.from_tensor_slices(full_data).batch(BATCH_SIZE)

# Same pipeline for the encoded test reviews.
test_dataset = tf.data.Dataset.from_tensor_slices(x_test).batch(BATCH_SIZE)

## Load model into the TPU

In [None]:
print('Loading pretrained model...')

# Create the pretrained encoder and the classifier inside the distribution
# strategy's scope so they are built under that strategy.
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
# Show the layer stack of the assembled model.
model.summary()

print('LOADED.')

## Predict

In [None]:
# Number of *full* batches in each split (floor division).
# The predict calls further down pass `n_steps + 1` so that the final, partial
# batch is included as well.
n_steps_train = full_data.shape[0] // BATCH_SIZE
n_steps_test = x_test.shape[0] // BATCH_SIZE

In [None]:
# Weights file restored into `model` before extracting embeddings.
checkpoint_filepath = 'model.h5'

print('Begin predicting...')
model.load_weights(checkpoint_filepath)

# Build a sub-model that shares `model`'s input but stops at its second-to-last
# layer, so predict() returns that layer's output rather than class probabilities.
# NOTE(review): presumably this layer is the position-0 slice that feeds the
# Dense head in build_model — confirm against model.summary().
layer_name = model.layers[-2].name
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)

# `n_steps_train + 1` covers the final partial batch. When the sample count is an
# exact multiple of BATCH_SIZE this asks for one batch more than the dataset holds.
train_output = intermediate_layer_model.predict(full_dataset, steps = n_steps_train+1)
np.save('train_embeddings.npy', train_output)
print('Saved train embeddings...')

# Same extraction for the test split.
test_output = intermediate_layer_model.predict(test_dataset, steps = n_steps_test+1)
np.save('test_embeddings.npy', test_output)
print('Saved test embeddings...')