In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import string

from tensorflow.keras import Model, Input
from tensorflow.keras.layers import Dense, LSTM, Dropout, Flatten

from transformers import RobertaTokenizer, TFRobertaModel

In [None]:
def custom_standardization(text):
    """Normalise one raw string before tokenisation.

    Steps, applied in this order:
      1. lowercase the text (original author's note: for an uncased encoder),
      2. delete every character listed in ``string.punctuation``,
      3. trim leading/trailing whitespace (including any exposed by step 2).

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # Mapping every punctuation character to None makes `translate` drop it.
    punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
    return text.lower().translate(punctuation_remover).strip()

def get_dataset(pandas_df, tokenizer, batch_size=32, seq_len=128):
    """
        Build a batched, prefetching Tensorflow dataset of token ids.

        Every value of the 'excerpt' column is cleaned with
        `custom_standardization`, then the whole list is tokenized in one call,
        truncated/padded to exactly `seq_len` tokens. Each dataset element is a
        dict holding only 'input_ids'; no attention mask and no targets are
        included.

        Args:
            pandas_df: frame providing an 'excerpt' column of strings.
            tokenizer: callable tokenizer accepting a list of strings plus
                `max_length`, `truncation`, `padding` and `return_tensors`,
                and returning a mapping with an 'input_ids' entry.
            batch_size: number of examples per batch.
            seq_len: fixed token length of every example.
                NOTE(review): this default (128) differs from `base_model`'s
                default (256); pass the same value to both.

        Returns:
            A `tf.data.Dataset` yielding batches of {'input_ids': ...}.
    """
    cleaned_excerpts = list(map(custom_standardization, pandas_df['excerpt']))

    # Tokenize all excerpts at once into fixed-length id tensors
    encoded = tokenizer(
        cleaned_excerpts,
        max_length=seq_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
    )

    # Only the token ids are passed on; the attention mask is deliberately left out.
    features = {'input_ids': encoded['input_ids']}

    return (
        tf.data.Dataset.from_tensor_slices(features)
        .batch(batch_size)
        .prefetch(tf.data.AUTOTUNE)
    )

In [None]:
def base_model(encoder, seq_len=256):
    """Assemble the regression network: encoder -> LSTM -> Dense head.

    The model takes a single input named 'input_ids' (no attention mask) and
    produces one linear output per example. Because the sequence output is
    flattened before the final Dense layer, that layer's input size depends on
    `seq_len`, so `seq_len` must match the value used when the weights to be
    loaded were saved.

    Args:
        encoder: callable transformer taking {'input_ids': ...} and returning
            an object with a `last_hidden_state` attribute
            (presumably shaped (batch, seq_len, hidden) — TODO confirm).
        seq_len: fixed number of token ids per example.

    Returns:
        An uncompiled Keras `Model`.
    """
    token_ids = Input(shape=(seq_len,), dtype=tf.int32, name='input_ids')

    # Per-token representations from the transformer.
    x = encoder({'input_ids': token_ids}).last_hidden_state

    # Recurrent layer keeps one output per time step (return_sequences=True).
    x = LSTM(32, return_sequences=True, name="lstm_layer")(x)
    x = Dropout(0.3, name="dropout_layer1")(x)

    # Dense is applied to the last axis, i.e. independently at each time step.
    x = Dense(16, name="dense_layer")(x)
    x = Dropout(0.5, name="dropout_layer2")(x)

    # Collapse the (time, features) axes into one vector per example.
    x = Flatten(name="flatten_layer")(x)

    prediction = Dense(1, activation="linear", name="output_layer")(x)

    return Model(inputs=[token_ids], outputs=prediction)

In [None]:
# Load the test set; later cells read its 'excerpt' and 'id' columns.
test_csv_path = '../input/commonlitreadabilityprize/test.csv'
df_test = pd.read_csv(test_csv_path)

# Show (rows, columns), then preview the first rows as the cell output.
print(df_test.shape)
df_test.head()

In [None]:
# Location of the pretrained RoBERTa files; passed to `from_pretrained` for
# both the tokenizer and the encoder in the inference cell.
BASE_MODEL = '../input/huggingface-roberta/roberta-base'

In [None]:
# Fixed token length; used for both tokenization and model construction so the
# dataset and the network input agree.
SEQ_LEN = 256
tokenizer = RobertaTokenizer.from_pretrained(BASE_MODEL)

dataset_test = get_dataset(df_test, tokenizer, batch_size=1, seq_len=SEQ_LEN)

# Accumulator for the fold-averaged prediction, one value per test row.
preds = np.zeros((df_test.shape[0],))

folds = 5

for fold in range(1, folds + 1):
    # A new encoder + head is built on every iteration. Reset Keras' global
    # state first so state left by the previous fold's model does not pile up
    # in memory across folds.
    tf.keras.backend.clear_session()

    model_path = f"../input/clrp-roberta-lstm/model_fold{fold}.h5"

    encoder = TFRobertaModel.from_pretrained(BASE_MODEL)
    model = base_model(encoder, SEQ_LEN)

    model.compile()

    # Overwrite the freshly initialised weights with this fold's saved weights.
    model.load_weights(model_path)

    # `pred` has a trailing axis of size 1 (single output unit); drop it and
    # add this fold's equal-weight share to the running average.
    pred = model.predict(dataset_test)
    preds += np.squeeze(pred, axis=-1) / folds
#     print(pred.shape)

In [None]:
# Build the submission frame from the fold-AVERAGED predictions (`preds`).
# `pred` holds only the last fold's raw output, so using it would discard the
# ensemble. `assign` returns a new frame, which avoids writing a column onto a
# slice of `df_test`.
df_pred = df_test[['id']].assign(target=preds)
df_pred

In [None]:
# Save the prediction frame as submission.csv, omitting the index column.
df_pred.to_csv("submission.csv", index=False)