## Overview
This notebook compares features extracted from the last hidden layer of 9 [pre-trained text models from Huggingface](https://huggingface.co/transformers/pretrained_models.html). Models are evaluated according to prediction accuracy in the [CommonLit Readability Prize competition](https://www.kaggle.com/c/commonlitreadabilityprize) training data. Model *roberta-large* appears to be the best model, followed by *camembert-base*.

## Selected models
The following models were picked from the [Huggingface pre-trained models page](https://huggingface.co/transformers/pretrained_models.html). Some were commented out because they produce an error of one type or another (e.g. notebook runs out of memory, or there's no tokenizer.)

In [None]:
# Hugging Face model identifiers to evaluate. Entries that are commented out
# are intentionally disabled (they are kept here as a record of what was tried).
MODEL_NAMES = [
    'bert-large-cased',
    'openai-gpt',
    #'gpt2-large',
    #'xlnet-large-cased',
    #'xlm-mlm-en-2048',
    'roberta-large',
    'distilbert-base-cased',
    #'ctrl',
    'camembert-base',
    'albert-large-v2',
    #'t5-large',
    'flaubert/flaubert_large_cased',
    'facebook/bart-large-cnn',
    'moussaKam/mbarthez',
    #'DialoGPT-large',
    #'facebook/m2m100_418M',
    'allenai/longformer-large-4096',
    #'lxmert-base-uncased',
    #'funnel-transformer/small',
    #'funnel-transformer/xlarge',
    'microsoft/layoutlm-large-uncased',
    'microsoft/deberta-xlarge-v2',
    'squeezebert/squeezebert-mnli-headless',
    'camembert/camembert-base-wikipedia-4gb',
    'chkla/roberta-argument',
]

The following function is used to load a model and tokenizer.

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel

# Fail fast when no GPU is present. An explicit raise is used instead of
# `assert`, because assertions are stripped when Python runs with -O and the
# notebook would then fail later with a less obvious error.
if not torch.cuda.is_available():
    raise RuntimeError('No CUDA!')
# Device on which every model and every tokenized input is placed.
device = torch.device('cuda')

def get_model_info(model_name: str):
    """Load the pre-trained model and its tokenizer for *model_name*.

    The tokenizer is loaded first, then the model, which is moved to the
    notebook-wide ``device``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    net = AutoModel.from_pretrained(model_name)
    net = net.to(device)
    return net, tok

## Last hidden layer
PyTorch models may consist of sub-models or sequences of modules. The *get_last_hidden_layer*() function makes a best guess as to which layer is the one that feeds the output layer.

In [None]:
from torch.nn import Sequential

def get_last_two_layers_in_sequence(seq: Sequential):
    """Return ``(second_to_last, last)`` elements of *seq*.

    A position that does not exist is reported as ``None``: an empty
    sequence gives ``(None, None)`` and a one-element sequence gives
    ``(None, last)``.
    """
    count = len(seq)
    final = seq[count - 1] if count >= 1 else None
    before_final = seq[count - 2] if count >= 2 else None
    return before_final, final

    
def get_last_two_layers(model):
    """Walk *model*'s direct sub-modules and return ``(previous, last)`` layers.

    Each child is resolved to its own ``(previous, last)`` pair: a
    ``Sequential`` child contributes its final two elements, any other child
    is searched recursively. A child that yields no last layer counts as the
    last layer itself. When a child has no "previous" layer, the last layer
    found so far (from earlier children) is used instead.

    Returns:
        ``(previous, last)``; both are ``None`` if *model* has no sub-modules.
    """
    penultimate, final = None, None
    for child in model._modules.values():
        if isinstance(child, Sequential):
            child_prev, child_last = get_last_two_layers_in_sequence(child)
        else:
            child_prev, child_last = get_last_two_layers(child)
        if child_last is None:
            # Nothing found inside the child: the child is the layer.
            child_last = child
        if child_prev is None:
            # Fall back to whatever was the last layer before this child.
            child_prev = final
        penultimate, final = child_prev, child_last
    return penultimate, final


def get_last_hidden_layer(model):
    """Return the layer judged to precede *model*'s final layer.

    This is the first element of the pair returned by
    ``get_last_two_layers`` and may be ``None``.
    """
    return get_last_two_layers(model)[0]

## Feature extraction
We'll now define a function that can extract features from an arbitrary model and layer, assuming the shape of the feature tensor is known.

In [None]:
import torch

def get_features(model, tokenizer, layer, out_shape, text: str):
    """Run *text* through *model* and return *layer*'s output, flattened.

    A temporary forward hook on *layer* copies the layer output into a buffer
    of shape *out_shape* (allocated with ``torch.zeros``).

    Args:
        model: Model to run; called with the tokenizer output as keyword args.
        tokenizer: Callable producing PyTorch tensors for *text*.
        layer: Module whose output is captured.
        out_shape: Expected shape of the layer output.
        text: Input text.

    Returns:
        A 1-D tensor holding the captured layer output.

    Raises:
        RuntimeError: Presumably raised by ``copy_`` when the layer output
            cannot be copied into a buffer of shape *out_shape* (confirm
            against the PyTorch version in use).
    """
    vector_buffer = torch.zeros(out_shape)

    def _local_hook(_, _input, _output):
        # In-place copy into the enclosing buffer; no rebinding is needed.
        vector_buffer.copy_(_output.data)

    tensors = tokenizer(text, return_tensors='pt')
    fh = layer.register_forward_hook(_local_hook)
    try:
        # Only activations are needed, so skip building the autograd graph.
        with torch.no_grad():
            model(**tensors.to(device))
    finally:
        # Always unregister, otherwise a failed call leaves the hook attached
        # and it would fire again on later forward passes.
        fh.remove()
    return torch.flatten(vector_buffer)

## Number of features
To find the shape of the feature tensor, we use a layer hook that captures the shape of the layer's output.

In [None]:
def get_features_shape_mn(model_name: str, test_text='hello', for_input=False):
    """Load *model_name* and report the tensor shape at its last hidden layer.

    Convenience wrapper: loads the model and tokenizer, picks the layer with
    ``get_last_hidden_layer`` and delegates to ``get_features_shape``.
    """
    net, tok = get_model_info(model_name)
    hidden = get_last_hidden_layer(net)
    return get_features_shape(
        net,
        tok,
        hidden,
        test_text=test_text,
        for_input=for_input,
    )


def get_features_shape(model, tokenizer, layer, test_text='hello', for_input=False):
    """Return the size of the tensor entering or leaving *layer*.

    *test_text* is tokenized and passed through *model* while a temporary
    forward hook on *layer* records the tensor size.

    Args:
        model: Model to run.
        tokenizer: Callable producing PyTorch tensors for *test_text*.
        layer: Module to observe.
        test_text: Probe text fed to the model.
        for_input: If true, record the size of the layer's first input
            instead of the size of its output.

    Returns:
        The recorded size, or ``None`` if the hook never fired.
    """
    t_dims = None

    def _local_hook(_, _input, _output):
        nonlocal t_dims
        t_dims = _input[0].size() if for_input else _output.size()
        # Returning None leaves the layer output untouched.

    tensors = tokenizer(test_text, return_tensors='pt')
    fh = layer.register_forward_hook(_local_hook)
    try:
        # Shape probing needs no gradients.
        with torch.no_grad():
            model(**tensors.to(device))
    finally:
        # Unregister even if the forward pass fails, so the hook cannot leak.
        fh.remove()
    return t_dims

The following table captures the number of features produced by the last hidden layer of each model, 
assuming the input contains only one token. If two tokens produce a different tensor shape, then the
*tokens_dependent* column will be *True*.

In [None]:
import sys
import gc
import pandas as pd

def flattened_size(size):
    """Return the product of all dimensions in *size* (1 when *size* is empty)."""
    total = 1
    for dim in size:
        total *= dim
    return total


# For every candidate model, record how many features its last hidden layer
# produces for a one-word input, and whether that count changes for a
# two-word input (tokens_dependent).
feature_counts = []
for model_name in MODEL_NAMES:
    model = tokenizer = layer = None
    try:
        model, tokenizer = get_model_info(model_name)
        layer = get_last_hidden_layer(model)
        shape1 = get_features_shape(model, tokenizer, layer, test_text='hello')
        num_features = flattened_size(shape1)
        shape2 = get_features_shape(model, tokenizer, layer, test_text='hello world')
        tokens_dependent = flattened_size(shape2) != num_features
        feature_counts.append((model_name, num_features, tokens_dependent))
    except Exception:
        # Best effort: report the failure and move on to the next model.
        # `except Exception` (not a bare except) lets Ctrl-C interrupt the loop.
        print('Model failed: %s |' % model_name, sys.exc_info()[0])
    finally:
        # Drop every reference to the model (including the captured layer)
        # on both success and failure so the memory can be reclaimed.
        model = tokenizer = layer = None
        gc.collect()
feature_count_frame = pd.DataFrame(feature_counts, columns=['model_name', 'num_features', 'tokens_dependent'])
feature_count_frame

## Transformation
The *transform_dataset*() function takes the competition data frame and produces a feature matrix in place of the excerpts.

In [None]:
import numpy as np
import pandas as pd

def transform_dataset(data_frame: pd.DataFrame, model, tokenizer, layer, features_shape):
    """Turn each row's ``excerpt`` into a feature vector.

    Every row of *data_frame* is passed through ``get_features``; the
    resulting vectors and the rows' ``target`` values are collected in row
    order.

    Returns:
        ``(x, y)`` as NumPy arrays: the feature vectors and the targets.
    """
    feature_rows = []
    targets = []
    for _, record in data_frame.iterrows():
        excerpt = record['excerpt']
        target = record['target']
        vec = get_features(model, tokenizer, layer, features_shape, excerpt)
        feature_rows.append(vec.detach().numpy())
        targets.append(target)
    return np.array(feature_rows), np.array(targets)

## Using k-NN to evaluate feature sets
If we were to use a linear regressor or a neural network to evaluate feature sets, we'd have to consider that larger feature sets are at a disadvantage, statistically. (The competition's training dataset isn't very large.) Plus regularization parameters depend on the number of features. Even with a random forest or a GBM, the number of trees would need to be adjusted according to the number of features. The k-NN algorithm does not have these issues.

Additionally, we can quickly do leave-one-out cross-validation with k-NN, and we're defining a custom function that does this, with the help of *sklearn*'s BallTree implementation:

In [None]:
from sklearn.neighbors import BallTree

def leave_one_out_knn_predictions(x, y: np.ndarray, k: int):
    """Predict each sample's target as the mean of its *k* nearest other samples.

    A ball tree is built over all rows of *x* and queried with those same
    rows for ``k + 1`` neighbours; the query row itself is then removed from
    its own neighbour list.

    Args:
        x: Feature matrix, one row per sample.
        y: Target values, indexable by the row indices of *x*.
        k: Number of neighbours to average.

    Returns:
        A 1-D array with one leave-one-out prediction per row of *x*.
    """
    ball_tree = BallTree(x, leaf_size=10)
    idx_matrix = ball_tree.query(x, k=k+1, return_distance=False)
    n_rows = idx_matrix.shape[0]
    # Do not assume the sample itself is always in column 0: with duplicate
    # rows another zero-distance sample may come first, and blindly dropping
    # column 0 would leave the sample's own target in its prediction.
    drop_mask = idx_matrix == np.arange(n_rows)[:, None]
    # If a sample is absent from its own k+1 neighbours (more than k exact
    # duplicates), discard the farthest neighbour instead.
    self_missing = ~drop_mask.any(axis=1)
    drop_mask[self_missing, -1] = True
    # Exactly one entry per row is dropped, so k remain; row order is kept.
    loo_idx_matrix = idx_matrix[~drop_mask].reshape(n_rows, k)
    return np.array([np.mean(y[idx_row]) for idx_row in loo_idx_matrix])

## Evaluation
Now we put everything together and run an evaluation routine. Models are skipped if the number of features depends on the number of tokens. The cross-validation RMSE of the remaining models' feature sets is shown in a bar chart below.

In [None]:
%%time

from sklearn.metrics import mean_squared_error


def evaluate_models(model_names):
    """Compute the leave-one-out k-NN RMSE of each model's features.

    Only models listed in both *model_names* and ``feature_count_frame`` are
    considered, in the order they appear in ``feature_count_frame``. Models
    flagged ``tokens_dependent`` are skipped.

    Args:
        model_names: Iterable of model identifiers to evaluate.

    Returns:
        A DataFrame with columns ``model_name`` and ``rmse``.
    """
    train_data = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
    # Honour the argument instead of silently evaluating every probed model.
    requested = set(model_names)
    results_matrix = []
    for _, row in feature_count_frame.iterrows():
        model_name = row['model_name']
        if model_name not in requested:
            continue
        if row['tokens_dependent']:
            print('Skipping %s' % model_name)
            continue
        print('Evaluating %s ...' % model_name)
        model, tokenizer = get_model_info(model_name)
        layer = get_last_hidden_layer(model)
        shape = get_features_shape(model, tokenizer, layer)
        x, y = transform_dataset(train_data, model, tokenizer, layer, shape)
        pred_y = leave_one_out_knn_predictions(x, y, k=10)
        # Take the square root explicitly: the `squared=False` argument is
        # deprecated/removed in recent scikit-learn releases.
        rmse = mean_squared_error(y, pred_y) ** 0.5
        results_matrix.append([model_name, rmse])
        # Release the model (and the layer that references part of it) and
        # the feature matrix before loading the next model.
        del model
        del tokenizer
        del layer
        del x
        del y
        gc.collect()
    return pd.DataFrame(results_matrix, columns=['model_name','rmse'])


# Evaluate every listed model and order the results from lowest to highest RMSE.
model_eval_frame = evaluate_models(MODEL_NAMES).sort_values('rmse')

In [None]:
import numpy as np
import plotly.express as px

def plot_bar_chart(data, x_var: str, y_var: str, title='', x_label='', y_label=''):
    """Display a Plotly bar chart of column *y_var* against column *x_var*.

    *x_label* and *y_label* replace the column names as axis labels.
    """
    axis_labels = {x_var: x_label, y_var: y_label}
    chart = px.bar(data, x=x_var, y=y_var, title=title, labels=axis_labels)
    chart.show()
    

# Bar chart of the k-NN RMSE obtained with each model's features.
plot_bar_chart(
    model_eval_frame,
    'model_name',
    'rmse',
    title='Text model comparison',
    x_label='Model name',
    y_label='k-NN RMSE',
)