# Context:
The extraordinary power of Transformer models comes from the fact that they can be used as feature extractors. I therefore wanted to find out which features can be extracted from a transformer model and, on top of that, which additional features can be derived from them. To illustrate this, I do the following with a simple pretrained BERT base model:
* Extract the following features
    1. Pooler output
    2. Hidden states of the transformer. (BERT base has 12 encoder layers; with `output_hidden_states=True` the model returns 13 hidden states: the embedding output plus one per layer)  
    
    
    
* Derive the following features from the hidden states:
    1. Mean of the last hidden state (from n<sup>th</sup> layer)
    2. Mean of the last but one hidden state (from n-1<sup>st</sup> layer)
    3. Mean of the last but two hidden state (from n-2<sup>nd</sup> layer)
    4. Hidden state corresponding to the CLS token of last hidden state (from n<sup>th</sup> layer)
    5. Hidden state corresponding to the CLS token of last but one hidden state (from n-1<sup>st</sup> layer)
    6. Hidden state corresponding to the CLS token of last but two hidden state (from n-2<sup>nd</sup> layer)
    
  
# Objectives:
1. To extract various features from a transformer model
2. To derive many more features from the extracted features
3. Evaluate the performance of the features on the given task

A pre-trained BERT model is used to demonstrate this pipeline

# Steps:
1. Read train and test data
2. Define Dataset and DataLoader
3. Define the model
4. Make sure the entire notebook can execute on CPU or CUDA
5. Iterate through the train_dataloader and extract features
6. Derive additional features from the extracted features
7. Build a regressor and evaluate the various features

In [None]:
import itertools
import json
from pathlib import Path

import numpy as np
import numpy.ma as ma
import pandas as pd
import torch
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel

In [None]:
# Directory holding the competition CSV files, relative to the working directory.
COMPETITION_DATA_PATH = Path('../input/commonlitreadabilityprize')
# Individual data files inside that directory.
TRAIN_DATA_PATH = COMPETITION_DATA_PATH.joinpath('train.csv')
TEST_DATA_PATH = COMPETITION_DATA_PATH.joinpath('test.csv')

In [None]:
# Number of excerpts passed to the model per DataLoader batch.
BATCH_SIZE = 32
# Seed for randomised steps (e.g. the optional debug sub-sampling of the training data).
RANDOM_STATE = 41

In [None]:
# Load the competition data. The training rows are ordered by the character
# length of the excerpt (shortest first) -- presumably so that the un-shuffled
# batches group texts of similar length and need little padding; confirm
# before changing the order.
train_data = (
    pd.read_csv(TRAIN_DATA_PATH)
    # Debug aid: enable the next line to work on a small sample.
    # Keep it disabled for the real submission.
    # .sample(n=40, random_state=RANDOM_STATE)
    .sort_values(by='excerpt', key=lambda excerpts: excerpts.str.len())
)
test_data = pd.read_csv(TEST_DATA_PATH)

print(f'Length of train data: {len(train_data)}')
print(f'Length of test data: {len(test_data)}')

# Dataset and DataLoader creation

In [None]:
class TrainDataset(Dataset):
    """Dataset pairing each text excerpt with its target value.

    Args:
        text_excerpts: Sized, indexable collection of excerpts.
        targets: Indexable collection of targets; item ``i`` belongs to
            excerpt ``i``.
    """

    def __init__(self, text_excerpts, targets):
        self.text_excerpts, self.targets = text_excerpts, targets

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.text_excerpts)

    def __getitem__(self, idx):
        """Return the excerpt and the target stored at ``idx`` as a dict."""
        excerpt, target = self.text_excerpts[idx], self.targets[idx]
        return {'text_excerpt': excerpt, 'target': target}
    
class PredictionDataset(Dataset):
    """Dataset of text excerpts without targets, used for prediction.

    Args:
        text_excerpts: Sized, indexable collection of excerpts.
    """

    def __init__(self, text_excerpts):
        self.text_excerpts = text_excerpts

    def __len__(self):
        """Return the number of excerpts."""
        return len(self.text_excerpts)

    def __getitem__(self, idx):
        """Return the excerpt stored at ``idx`` wrapped in a dict."""
        excerpt = self.text_excerpts[idx]
        return {'text_excerpt': excerpt}

In [None]:
# Wrap the raw data-frame columns in datasets.
train_dataset = TrainDataset(
    text_excerpts=train_data['excerpt'].tolist(),
    targets=train_data['target'].tolist(),
)
test_dataset = PredictionDataset(text_excerpts=test_data['excerpt'].tolist())

# shuffle=False keeps the batches in dataset order, so everything collected
# from a loader stays aligned with the rows it was built from.
loader_options = {'batch_size': BATCH_SIZE, 'shuffle': False}
train_dataloader = DataLoader(train_dataset, **loader_options)
test_dataloader = DataLoader(test_dataset, **loader_options)

# Model definition

In [None]:
%%capture
# Load the pretrained 'bert-base-uncased' tokenizer and model. The %%capture
# cell magic (which must stay on the first line of the cell) hides the output
# this cell produces.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The model is only used for inference: switch it to evaluation mode and move
# it to the selected device.
model = model.eval().to(device)

# Extract and Derive features
The mean of a hidden state must be computed over the real tokens only: the positions that correspond to PAD tokens are masked out first, otherwise the padding would distort the mean.

In [None]:
def derive_cls_hidden_state(hidden_state):
    """Return the hidden vector at sequence position 0 for every batch item.

    Position 0 is assumed to hold the [CLS] token.

    Args:
        hidden_state: 3-D array-like indexed as (batch, position, feature).

    Returns:
        Array indexed as (batch, feature).
    """
    states = np.asarray(hidden_state)
    cls_position = 0
    return states[:, cls_position, :]

def mask_and_compute_mean_hidden_state(hidden_state, attention_mask):
    """Average a hidden state over the positions the attention mask keeps.

    Positions whose ``attention_mask`` entry is 0 are excluded from the mean.

    Args:
        hidden_state: Array indexed as (batch, position, feature).
        attention_mask: Array indexed as (batch, position); 0 marks a position
            to ignore.

    Returns:
        Array indexed as (batch, feature) holding the per-item mean over the
        kept positions.
    """
    feature_count = hidden_state.shape[-1]
    # True marks an entry to exclude; a masked array needs that flag for
    # every feature of an excluded position.
    excluded = np.equal(np.expand_dims(attention_mask, axis=2), 0)
    excluded = np.repeat(excluded, feature_count, axis=2)
    kept_states = ma.array(hidden_state, mask=excluded)
    return kept_states.mean(axis=1).data

def extract_and_derive_features(dataloader):
    """Run every batch through the model and collect seven feature sets.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. For each of
    the last three hidden states the mask-aware mean and the position-0
    ([CLS]) vector are stored, next to the model's pooler output.

    Args:
        dataloader: Iterable of batches; each batch is a mapping whose
            ``'text_excerpt'`` entry holds the texts to encode.

    Returns:
        Dict mapping feature name to an array with one row per excerpt, in
        the order the dataloader yielded them.
    """
    # (mean key, CLS key) per layer, ordered from the last hidden state backwards.
    layer_keys = (
        ('mean_last_hidden_state', 'cls_last_hidden_state'),
        ('mean_last_but_one_hidden_state', 'cls_last_but_one_hidden_state'),
        ('mean_last_but_two_hidden_state', 'cls_last_but_two_hidden_state'),
    )
    # Key order: pooler output, then the three means, then the three CLS vectors.
    collected = {'pooler_output': []}
    collected.update((mean_key, []) for mean_key, _ in layer_keys)
    collected.update((cls_key, []) for _, cls_key in layer_keys)

    for batch in dataloader:
        with torch.no_grad():
            encoded = tokenizer(batch['text_excerpt'], return_tensors='pt', padding=True, truncation=True)
            encoded = encoded.to(device)
            outputs = model(**encoded, output_attentions=False, output_hidden_states=True)
            padding_mask = encoded['attention_mask'].detach().cpu().numpy()

            collected['pooler_output'].extend(outputs['pooler_output'].detach().cpu().numpy().tolist())

            # The slice is ordered oldest-to-newest, so the last layer comes last.
            last_but_two, last_but_one, last = (
                state.detach().cpu().numpy() for state in outputs['hidden_states'][-3:]
            )
            for (mean_key, cls_key), layer_state in zip(layer_keys, (last, last_but_one, last_but_two)):
                layer_mean = mask_and_compute_mean_hidden_state(layer_state, padding_mask)
                collected[mean_key].extend(layer_mean.tolist())
                collected[cls_key].extend(derive_cls_hidden_state(layer_state).tolist())

    return {name: np.asarray(rows) for name, rows in collected.items()}

In [None]:
%%time
# Extract the features for both data sets; the %%time cell magic (which must
# stay on the first line of the cell) reports how long the cell takes.
train_features = extract_and_derive_features(train_dataloader)
test_features = extract_and_derive_features(test_dataloader)

# Collect the targets by iterating the same un-shuffled loader, so they are in
# the same order as the rows of train_features.
train_targets = [batch['target'].detach().cpu().tolist() for batch in train_dataloader]
# Flatten the per-batch lists into one flat list of targets.
train_targets = list(itertools.chain(*train_targets))

# Fit regressor

In [None]:
# Compare the feature sets with 5-fold cross-validation of the same regressor.
# The folds are shuffled explicitly: an integer `cv` yields consecutive,
# un-shuffled folds, and the training rows are sorted by excerpt length, so
# un-shuffled folds would each cover a different length range and skew the
# reported error.
cv_splitter = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
for key, feature_matrix in train_features.items():
    regressor = SVR(C=10, kernel='rbf', gamma='auto')
    # Alternative model: regressor = Ridge(fit_intercept=True)
    scores = cross_val_score(regressor, feature_matrix, train_targets, cv=cv_splitter, scoring='neg_root_mean_squared_error')
    # The scorer returns negated RMSE values; report the magnitude.
    print(f'{key}: Average Root mean squared error: {np.abs(np.mean(scores))}')

In [None]:
# Fit the regressor left over from the evaluation loop on the chosen feature
# set, using all training rows.
best_feature = 'mean_last_hidden_state'
regressor = regressor.fit(train_features[best_feature], train_targets)

# Predict for the test excerpts and write the id/target pairs for submission.
test_predictions = regressor.predict(test_features[best_feature])
test_data = pd.read_csv(TEST_DATA_PATH).assign(target=test_predictions)
test_data.loc[:, ['id', 'target']].to_csv('submission.csv', index=False)