In [None]:
import re
import os
import numpy as np
import pandas as pd
import random
import math
import tensorflow as tf
import logging
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from tensorflow.keras import backend as K
from transformers import RobertaTokenizer, TFRobertaModel, TFAutoModel, AutoTokenizer, AutoConfig

# Raise the TensorFlow logger threshold to ERROR so that lower-severity
# messages (INFO/WARNING) do not flood the notebook output.
tf.get_logger().setLevel(logging.ERROR)


In [None]:
import glob

# Glob pattern matching every entry in the directory of saved model weights.
tr_dir = '../input/commonlit/submission1/*'

# NOTE(review): glob makes no ordering guarantee, yet these paths are later
# indexed in step with MODEL (transformer_list[i] <-> MODEL[i]) -- confirm that
# the listing order really matches the model order.
transformer_list = list(glob.iglob(tr_dir))
transformer_list

In [None]:
# Offline copies of the Hugging Face checkpoints. The hub identifiers would be
# 'albert-base-v2', 'bert-base-uncased', 'roberta-base' and
# 'distilbert-base-uncased'; here each one is resolved under a local root.
root = '../input/huggingface-offline-transformers/offline-transformers/'
MODEL = [
    root + checkpoint
    for checkpoint in ('albert-base-v2', 'bert-base-uncased',
                       'roberta-base', 'distilbert-base-uncased')
]

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(MODEL[0])
# #transformer = TFAutoModel.from_pretrained(MODEL[0])

In [None]:
# Read the test set and keep a handle on the text column that is tokenized
# and fed to the models.
df = pd.read_csv(filepath_or_buffer='../input/commonlitreadabilityprize/test.csv')
x_test = df.loc[:, 'excerpt']

In [None]:
# Sequence length in tokens. Used as the default max_length of batch_encode
# (padding/truncation target) and as the width of the Keras input layers
# created by build_model.
MAX_LENGTH = 250

In [None]:
# Define function to encode text data in batches
def batch_encode(tokenizer, texts, batch_size=200, max_length=MAX_LENGTH):
    """
    Tokenize ``texts`` chunk by chunk and return fixed-length model inputs.

    Every text is truncated or padded to exactly ``max_length`` tokens, so
    both returned tensors have one row per input text, in input order.

    Input:
        - tokenizer:   Hugging Face tokenizer exposing ``batch_encode_plus``
        - texts:       Sequence of strings, one string per text
        - batch_size:  Integer number of texts passed to the tokenizer per call
        - max_length:  Integer number of tokens each text is truncated/padded to
    Output:
        - input_ids:       token ids for all texts as a tf.Tensor
        - attention_mask:  matching attention masks as a tf.Tensor
    """

    input_ids = []
    attention_mask = []

    for start in range(0, len(texts), batch_size):
        # Encode only the current chunk. Passing the full `texts` here would
        # re-encode the whole dataset once per chunk and duplicate every row
        # in the output whenever len(texts) > batch_size.
        batch = list(texts[start:start + batch_size])
        inputs = tokenizer.batch_encode_plus(batch,
                                             # honour the caller's argument, not the global
                                             max_length=max_length,
                                             # fixed-size padding: every row is max_length long
                                             padding='max_length',
                                             truncation=True,
                                             return_attention_mask=True,
                                             return_token_type_ids=False
                                             )
        input_ids.extend(inputs['input_ids'])
        attention_mask.extend(inputs['attention_mask'])

    return tf.convert_to_tensor(input_ids), tf.convert_to_tensor(attention_mask)

In [None]:
def build_model(Model, max_length=MAX_LENGTH):
    """
    Assemble a Keras model made of a frozen pre-trained transformer followed
    by a single linear output unit.

    Input:
      - Model:       checkpoint name or path handed to
                     TFAutoModel.from_pretrained to load the base transformer.
      - max_length:  integer length of the token-id and attention-mask inputs.

    Output:
      - model:       a tf.keras.Model (not compiled here) that takes
                     [input_ids, input_attention] and returns one
                     linear-activation value per input sequence.
    """

    # Load the base transformer and freeze each of its layers.
    backbone = TFAutoModel.from_pretrained(Model)
    for sublayer in backbone.layers:
        sublayer.trainable = False

    # Two int32 inputs of width max_length: token ids and attention mask.
    token_ids = tf.keras.layers.Input(shape=(max_length,),
                                      name='input_ids',
                                      dtype='int32')
    token_mask = tf.keras.layers.Input(shape=(max_length,),
                                       name='input_attention',
                                       dtype='int32')

    # Element 0 of the transformer output is used as the last hidden state
    # (presumably shaped (batch, sequence, hidden) -- confirm per checkpoint).
    hidden_states = backbone([token_ids, token_mask])[0]

    # Keep only position 0 of every sequence, i.e. the sequence-start token
    # ([CLS] for BERT-style tokenizers), which leaves one vector per sequence.
    first_token = hidden_states[:, 0, :]

    # Extra dropout / dense layers would be inserted here if needed.

    # Single unit with linear activation: an unbounded scalar per sequence.
    head = tf.keras.layers.Dense(1, activation='linear')
    prediction = head(first_token)

    return tf.keras.Model([token_ids, token_mask], prediction)

In [None]:
# Accumulator for the ensemble prediction (equal-weight mean over all models)
predictions = np.zeros(len(df))

# Weight files are paired with models purely by position
# (transformer_list[i] <-> MODEL[i]). Fail before any expensive work if there
# are not enough files, instead of hitting an IndexError part-way through.
if len(transformer_list) < len(MODEL):
    raise ValueError(
        f'Found {len(transformer_list)} weight file(s) in {tr_dir!r} '
        f'but {len(MODEL)} models are configured'
    )

for i, Model in enumerate(MODEL):
    print('\n')
    print('-'*50)
    print(f'Predicting with model {Model}')
    # Make the positional model/weights pairing visible in the output.
    print(f'Loading weights from {transformer_list[i]}')

    # Each checkpoint has its own tokenizer, so the test set is re-encoded per model
    tokenizer = AutoTokenizer.from_pretrained(Model)

    # Encode X_test
    X_test_ids, X_test_attention = batch_encode(tokenizer, x_test.tolist())

    tr_model = build_model(Model, max_length=MAX_LENGTH)

    tr_model.load_weights(transformer_list[i])

    # Predict; flatten the (n, 1) output to a vector of length n
    fold_predictions = tr_model.predict([X_test_ids, X_test_attention]).reshape(-1)

    # Add this model's share of the average to the global predictions
    predictions += fold_predictions / len(MODEL)

    # Release the model and reset Keras' global state before building the
    # next one, so memory does not grow with every model in the loop.
    del tr_model
    K.clear_session()

# Save submissions
df['target'] = predictions
df[['id', 'target']].to_csv('submission.csv', index = False)

In [None]:
# Show the test frame, which at this point includes the 'target' column
# written by the prediction cell.
df