## Context

In this notebook, I compare embeddings extracted with the Longformer, ALBERT and DistilBERT models.

## Description

How embeddings were extracted

```
class GetTokens(transformers.FeatureExtractionPipeline):
    def postprocess(self, model_outputs):
        """ Change output: extract only cls tokens as a list. """
        last_hidden_state = model_outputs.last_hidden_state

        return last_hidden_state[0][0].tolist()
```

How the models were loaded

```
def get_extractor(model_name, device):
    """ Create the extractor for a feature extraction task."""
    
    [...]
    
    extractor = pipeline(
        task="feature-extraction",
        model=model_name,
        tokenizer=model_name,
        device=use_device,
        pipeline_class=GetTokens,
        **use_param
    )
    
    return extractor
```

How the models were compared

```
def get_scores_dict(features, labels, model, n_round=5):
    """ Get the scores dictionary (target / column as key) using a model. """
    result = {}
    
    X = features
    for target in labels.columns:
        y = labels[target]

        X_train, X_test, y_train, y_test = train_test_split(
                                                X, y,
                                                stratify=y,
                                                test_size=ts_param,
                                                random_state=rs_param)

        model.fit(X_train, y_train)

        score = model.score(X_test, y_test)

        [...]
```

## Sources

* [Part 3. Comparing extracted vs other created embeddings](https://www.kaggle.com/renokan/student-writing-comparing-embeddings-part-3)
* [Part 2. Comparing extracted vs created embeddings - only distilbert](https://www.kaggle.com/renokan/student-writing-comparing-embeddings-part-2)
* [Part 1. Comparing extracted embeddings (load from dataset)](https://www.kaggle.com/renokan/student-writing-comparing-embeddings-part-1)
* [Embeddings (CLS token) + LogisticRegression](https://www.kaggle.com/renokan/embeddings-cls-token-logisticregression)

# 1. Imports, definitions, settings and data loading

In [None]:
import gc
import numpy as np
import pandas as pd

import torch
import transformers
from transformers import pipeline

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from cuml.linear_model import LogisticRegression as GPU_LogisticRegression

import warnings
warnings.filterwarnings('ignore')

In [None]:
class GetTokens(transformers.FeatureExtractionPipeline):
    """Feature-extraction pipeline that returns a single vector per call."""

    def postprocess(self, model_outputs):
        """Return the first token's hidden state of the first sequence as a list.

        Takes ``last_hidden_state[0][0]`` (sequence 0, token position 0 - the
        position this notebook treats as the CLS token) and converts it to a
        plain Python list with ``.tolist()``.
        """
        first_token_state = model_outputs.last_hidden_state[0][0]
        return first_token_state.tolist()
    

def get_extractor(model_name, device):
    """Build a ``GetTokens`` feature-extraction pipeline for *model_name*.

    The same name is used for both the model and its tokenizer. *device* is
    forwarded unchanged as the pipeline's ``device`` argument, and the
    ``max_length`` / ``truncation`` keywords are handed to ``pipeline`` as
    extra keyword arguments.
    """
    return pipeline(
        task="feature-extraction",
        model=model_name,
        tokenizer=model_name,
        device=device,
        pipeline_class=GetTokens,
        max_length=512,
        truncation='only_first',
    )


def get_scores_dict(features, labels, model, n_round=5, *,
                    test_size=None, random_state=None):
    """Score *model* on every label column using a stratified hold-out split.

    For each column of *labels*, the rows of *features* are split with
    ``train_test_split`` (stratified on that column), *model* is fitted on the
    training part and ``model.score`` is evaluated on the held-out part.

    Args:
        features: Feature table passed to ``train_test_split`` as ``X``.
        labels: DataFrame whose columns are the individual targets.
        model: Estimator exposing ``fit(X, y)`` and ``score(X, y)``. The same
            instance is fitted again for every target.
        n_round: Number of decimals each score is rounded to.
        test_size: Forwarded to ``train_test_split``. ``None`` (default) falls
            back to the notebook-level ``ts_param``.
        random_state: Forwarded to ``train_test_split``. ``None`` (default)
            falls back to the notebook-level ``rs_param``.

    Returns:
        dict mapping each label column name to its rounded score.
    """
    # Resolve the split settings once. The notebook globals are only read when
    # the caller did not supply a value, so the function can also be used
    # without those globals being defined.
    split_size = ts_param if test_size is None else test_size
    split_seed = rs_param if random_state is None else random_state

    result = {}

    X = features
    for target in labels.columns:
        y = labels[target]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            stratify=y,
            test_size=split_size,
            random_state=split_seed,
        )

        model.fit(X_train, y_train)

        result[target] = round(model.score(X_test, y_test), n_round)

    return result

In [None]:
rs_param = 42   # random_state shared by the split and the CPU classifier
ts_param = 0.2  # test_size handed to train_test_split

# `use_device` is later passed as the pipeline's `device` argument:
# -1 selects the CPU, 0 is used when CUDA is available.
if torch.cuda.is_available():
    use_device = 0
    # GPU runtime: use the cuML implementation of logistic regression.
    # https://docs.rapids.ai/api/cuml/stable/api.html#logistic-regression
    use_model = GPU_LogisticRegression(max_iter=2000)
else:
    use_device = -1  # cpu
    # solver='liblinear' is deliberate: there is a bug with solver='lbfgs'
    # (AttributeError: 'str' object has no attribute 'decode')
    # when fitting the Logistic Regression model.
    use_model = LogisticRegression(
        solver='liblinear',
        random_state=rs_param,
    )

use_device

In [None]:
# Load the competition training table.
raw_data_path = "../input/feedback-prize-2021/train.csv"
raw_data = pd.read_csv(raw_data_path)

# Keep only the discourse text and its type, under shorter column names,
# while preserving the original row index.
discourse_df = pd.DataFrame(
    {'text': raw_data['discourse_text'],
     'type': raw_data['discourse_type']},
       index=raw_data.index
)

type_prefix = "tp"
type_column = "type"
# Saved before get_dummies so the original type column can be joined back.
type_origin = discourse_df[type_column]

# One-hot encode the type column into `tp_*` indicator columns
# (every category is kept, no NaN indicator column).
data = pd.get_dummies(discourse_df,
                      prefix=[type_prefix], columns=[type_column],
                      dummy_na=False, drop_first=False)

# Text + indicator columns + the original `type` column, side by side.
text_and_labels = data.join(type_origin)

In [None]:
# Preview the first rows of the combined text / indicator / type table.
text_and_labels.head()

# 2. Labels (target columns)

In [None]:
# Drop every object-dtype column; what remains is used as the set of targets
# (presumably just the one-hot `tp_*` columns, assuming `text` and `type`
# are stored with object dtype - verify on the pandas version in use).
labels = text_and_labels.select_dtypes(exclude='object')
labels.head()

In [None]:
# Column means expressed as percentages (two decimals) and rendered as "<value> %".
labels.mean().mul(100).round(2).map("{} %".format)

# 3. Creating embeddings (pipeline / transformers)

In [None]:
# Short name -> Hugging Face checkpoint id. The trailing comments are the
# author's recorded wall times for extracting embeddings with each model.
models_dict = {
    'longformer': 'allenai/longformer-base-4096',  # Wall time: 1h 31min 25s
    'deberta': 'microsoft/deberta-base',           # Wall time: 57min 3s
    'albert': 'albert-base-v2',                    # Wall time: 22min 3s
    'roberta': 'roberta-base',                     # Wall time: 19min 19s
    'distilbert': 'distilbert-base-uncased'        # Wall time: 17min 30s
}

# Collects one Series of scores per evaluated model, keyed by short name.
result_dict = {}

# NOTE: rebinds `data` (earlier the one-hot encoded frame) to the text column only.
data = text_and_labels['text']

## 3.1. Using a model N1

In [None]:
# Short name (key of models_dict) of the checkpoint evaluated in this section.
load_model = "longformer"

# Build the embedding pipeline for that checkpoint on the selected device.
extractor_features = get_extractor(
    models_dict.get(load_model),
    use_device
)

In [None]:
%%time
# Run all texts through the pipeline and tabulate the results
# (presumably one embedding list per text, i.e. one row each - confirm).
features = pd.DataFrame(
    extractor_features(data.tolist())
)

In [None]:
# Preview the first rows of the embedding table.
features.head()

In [None]:
# Column summary including the deep memory footprint of the embedding table.
features.info(memory_usage='deep')

In [None]:
%%time
# One score per label column for these embeddings, using the shared classifier.
scores = get_scores_dict(features, labels, use_model)
# Convert the {column: score} dict to a Series for display and later tabulation.
scores = pd.Series(scores)
scores

In [None]:
# Store a copy so the next section can rebind `scores` without affecting this entry.
result_dict[load_model] = scores.copy()

In [None]:
# Drop the references to the embeddings and the pipeline, then force a
# garbage-collection pass before the next model is loaded.
del features
del extractor_features
gc.collect()

## 3.2. Using a model N2

In [None]:
# Short name (key of models_dict) of the checkpoint evaluated in this section.
load_model = "albert"

# Build the embedding pipeline for that checkpoint on the selected device.
extractor_features = get_extractor(
    models_dict.get(load_model),
    use_device
)

In [None]:
%%time
# Run all texts through the pipeline and tabulate the results
# (presumably one embedding list per text, i.e. one row each - confirm).
features = pd.DataFrame(
    extractor_features(data.tolist())
)

In [None]:
# Preview the first rows of the embedding table.
features.head()

In [None]:
# Column summary including the deep memory footprint of the embedding table.
features.info(memory_usage='deep')

In [None]:
%%time
# One score per label column for these embeddings, using the shared classifier.
scores = get_scores_dict(features, labels, use_model)
# Convert the {column: score} dict to a Series for display and later tabulation.
scores = pd.Series(scores)
scores

In [None]:
# Store a copy so the next section can rebind `scores` without affecting this entry.
result_dict[load_model] = scores.copy()

In [None]:
# Drop the references to the embeddings and the pipeline, then force a
# garbage-collection pass before the next model is loaded.
del features
del extractor_features
gc.collect()

## 3.3. Using a model N3

In [None]:
# Short name (key of models_dict) of the checkpoint evaluated in this section.
load_model = "distilbert"

# Build the embedding pipeline for that checkpoint on the selected device.
extractor_features = get_extractor(
    models_dict.get(load_model),
    use_device
)

In [None]:
%%time
# Run all texts through the pipeline and tabulate the results
# (presumably one embedding list per text, i.e. one row each - confirm).
features = pd.DataFrame(
    extractor_features(data.tolist())
)

In [None]:
# Preview the first rows of the embedding table.
features.head()

In [None]:
# Column summary including the deep memory footprint of the embedding table.
features.info(memory_usage='deep')

In [None]:
%%time
# One score per label column for these embeddings, using the shared classifier.
scores = get_scores_dict(features, labels, use_model)
# Convert the {column: score} dict to a Series for display and later tabulation.
scores = pd.Series(scores)
scores

In [None]:
# Store a copy so later rebinding of `scores` does not affect this entry.
result_dict[load_model] = scores.copy()

In [None]:
# Drop the references to the embeddings and the pipeline, then force a
# garbage-collection pass.
del features
del extractor_features
gc.collect()

# 4. Comparing Embeddings

In [None]:
# One column per evaluated model (keys of result_dict), one row per label column.
compare_df = pd.DataFrame(result_dict, index=labels.columns)

compare_df

In [None]:
# CSS applied to the highlighted (row-wise maximum) cells in the tables below.
props_param = "color:white; font-weight:bold; background-color:darkblue;"

In [None]:
# Append a per-row `mean` column (average over the models) and highlight the
# largest value in each row.
compare_df.assign(mean=lambda x: x.mean(axis=1)) \
    .style.highlight_max(
        axis=1,
        props=props_param
)

In [None]:
# Subtract each row's mean from its scores so every cell shows a model's
# deviation from the row average, then highlight the largest value per row.
compare_df.sub(
    compare_df.mean(axis=1),
    axis=0).style.highlight_max(
                axis=1,
                props=props_param
)