# Idea
This notebook explores how to use a Hugging Face model for feature engineering. The idea is to extract the hidden state of a pre-trained model and then feed it as features into a simple classifier. 

# Setup
This competition is a little special because internet access is not allowed during inference. For this reason, pip packages that are not part of the standard Kaggle image have to be installed manually from local files.

In [None]:
from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification,AutoTokenizer
from fastai.imports import *
import os
import pyarrow
import pyarrow.dataset
from sklearn import preprocessing
from umap import UMAP
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier

Because internet access is not allowed, we install the pip package from the wheels provided in the input directory.

In [None]:
!pip install datasets -q --no-index --find-links=file:///kaggle/input/hf-datasets/wheels

In [None]:
import datasets
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
class CFG:
    """Central place for the local filesystem paths used by this notebook."""

    # Local path passed to `from_pretrained` for both the tokenizer and the model.
    model_nm = "../input/debertav3small"

    # Competition CSV files.
    train_path = "../input/us-patent-phrase-to-phrase-matching/train.csv"
    test_path = "../input/us-patent-phrase-to-phrase-matching/test.csv"
    sample_submission = "../input/us-patent-phrase-to-phrase-matching/sample_submission.csv"

# Reading the Data and the pre-trained model

In [None]:
# Load the tokenizer from the same local path that is later used for the model.
tokz = AutoTokenizer.from_pretrained(CFG.model_nm)

In [None]:
# Training and test tables, read from the local competition CSV files.
df = pd.read_csv(CFG.train_path)
df_test = pd.read_csv(CFG.test_path)

In [None]:
# Separator token of the tokenizer; used below to join the text fields.
sep = tokz.sep_token

# Pre-Processing

In [None]:
# Apply the same two derived columns to the training and the test frame.
for _frame in (df, df_test):
    # First character of the context code, kept as its own column.
    _frame['section'] = _frame['context'].str.get(0)
    # One text per row: context, anchor and target joined by the separator token.
    _frame['inputs'] = (
        _frame['context'] + sep + _frame['anchor'] + sep + _frame['target']
    )

Here we create a Dataset for the Hugging Face model. We rename the `score` column to `label` because models loaded from Hugging Face expect the target column to be named `label`.

In [None]:
# Wrap the training frame as a Hugging Face Dataset, renaming `score` to `label`.
ds = Dataset.from_pandas(df).rename_column('score', 'label')

Here we don't have to rename a column, as the evaluation Dataset has no target column.

In [None]:
# Wrap the test frame as a Hugging Face Dataset; no column is renamed here.
eval_ds = Dataset.from_pandas(df_test)

The tokenize function takes a row (or a batch of rows) and tokenizes its `inputs` column. Each tokenized text is a sequence of integer token ids, which is the only format that can be fed into the model.

In [None]:
def tok_func(x):
    """Tokenize the ``inputs`` field of ``x`` with the notebook-level tokenizer.

    Padding and truncation are both enabled, and the tokenizer's output is
    returned unchanged.
    """
    texts = x["inputs"]
    return tokz(texts, padding=True, truncation=True)

In [None]:
# Raw text columns that were merged into `inputs`.
inps = ("anchor", "target", "context")
# Columns dropped from both datasets once tokenization is done.
_drop_cols = inps + ('inputs', 'id', 'section')

# Both maps use `batched=True` with `batch_size=None`.
tok_ds = ds.map(
    tok_func,
    batched=True,
    batch_size=None,
    remove_columns=_drop_cols,
)
eval_tok_ds = eval_ds.map(
    tok_func,
    batched=True,
    batch_size=None,
    remove_columns=_drop_cols,
)

In the following cell we see that the map function created several new columns:
* The attention_mask column tells the model which tokens to attend to. It is needed because during tokenization all tokenized sentences are padded to the length of the longest tokenized sentence; the attention mask lets the model ignore the padding positions
* The input_ids column represents the tokenized sentence
* The label column holds the label for the tokenized sentence
* The token_type_ids column is not relevant for this notebook

In [None]:
# Display the tokenized training dataset.
tok_ds

In [None]:
# Plain aliases: `dds` / `eval_dds` refer to the same objects as
# `tok_ds` / `eval_tok_ds`; no copy is made.
dds = tok_ds
eval_dds=eval_tok_ds

# Feature Engineering

In [None]:
from transformers import AutoModel

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model via AutoModel from the local checkpoint path and move it to that device.
model = AutoModel.from_pretrained(CFG.model_nm).to(device)

This function is the main idea of the notebook. The output of a Hugging Face model exposes the attribute `.last_hidden_state`, from which we can extract the last hidden state of the model and use it as features for downstream models.

In [None]:
def extract_hidden_states(batch):
    """Return the first-position last hidden state for every row of ``batch``.

    Only the entries of ``batch`` whose key is one of the tokenizer's model
    input names are moved to ``device`` and passed to the model. The result
    is a dict with the single key ``"hidden_state"`` holding a NumPy array.
    """
    wanted = tokz.model_input_names
    inputs = {}
    for name, tensor in batch.items():
        if name in wanted:
            inputs[name] = tensor.to(device)

    # Forward pass without gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # Keep only position 0 of each sequence (the [CLS] token).
    cls_vectors = outputs.last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [None]:
# Make both datasets return the model input columns as torch tensors.
for _dataset in (dds, eval_dds):
    _dataset.set_format(
        "torch",
        columns=["input_ids", "attention_mask", "token_type_ids"],
    )

In [None]:
# Run the feature extractor over both datasets in batches; the results are
# read back below from the `hidden_state` column.
dds_hidden = dds.map(extract_hidden_states, batched=True)
eval_dds_hidden = eval_dds.map(extract_hidden_states, batched=True)

In [None]:
import numpy as np

# Features and targets for the downstream models.
X_train = np.array(dds_hidden["hidden_state"])
y_train = np.array(dds_hidden["label"])
# Test features; the test set has no label column.
X_test = np.array(eval_dds_hidden["hidden_state"])


Here we see that the original input was transformed into a 768 dimensional vector

In [None]:
# Shape of the feature vector of a single sample.
X_train[0].shape

In the next cell we visualize the effect of transforming the input into a 768-dimensional vector. For this we use the UMAP algorithm, which maps each 768-dimensional vector to a 2-dimensional point. Every sample keeps its original label after the mapping, so plotting the points per label gives us an idea of how well the feature extraction separates the classes.

In [None]:
# Bring every feature into the [0, 1] range before fitting UMAP.
X_scaled = MinMaxScaler().fit_transform(X_train)

# Fit a 2-component UMAP with cosine distance on the scaled features.
mapper = UMAP(n_components=2, metric="cosine").fit(X_scaled)

# Collect the 2D embedding together with the label of each sample.
df_emb = pd.DataFrame(mapper.embedding_, columns=["X", "Y"])
df_emb["label"] = y_train

# One panel per label value, each drawn with its own colour map.
fig, axes = plt.subplots(1, 5, figsize=(8,2))
axes = axes.flatten()
cmaps = ["Greys", "Blues", "Oranges", "Reds", "Purples"]
labels = np.sort(df_emb["label"].unique().astype(str))

for panel, label_name, color_map in zip(axes, labels, cmaps):
    # Rows belonging to the current label only.
    subset = df_emb.query(f"label == {label_name}")
    panel.hexbin(
        subset["X"],
        subset["Y"],
        cmap=color_map,
        gridsize=20,
        linewidths=(0,),
    )
    panel.set_title(label_name)
    # Hide the tick marks on both axes.
    panel.set_xticks([])
    panel.set_yticks([])

plt.tight_layout()
plt.show()

As you can see, the feature extraction does not separate the input very well. Many samples with a label of 0.25, 0.5 or 0.75 fall into the same region. You could try another model that might separate the input samples more clearly.

# Downstream model

As downstream models we use logistic regression, which can be trained quite fast on inputs with many features, and a random forest; their predictions are blended at the end.

In [None]:
# Encode the label values as integer class ids for the classifiers; the fitted
# encoder is reused later to decode the predictions.
# NOTE: `y_train` is overwritten here, so re-running this cell would fit the
# encoder on the already-encoded ids.
lbl_enc_train = preprocessing.LabelEncoder()
y_train = lbl_enc_train.fit_transform(y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# `max_iter` is raised to give the solver more iterations to converge
# (this makes convergence more likely, it does not guarantee it).
lr_clf = LogisticRegression(max_iter=3000)
# Fit on the extracted hidden-state features and the encoded labels.
lr_clf.fit(X_train, y_train)

In [None]:
# Predicted (encoded) class ids for the test features.
LR_prediction=lr_clf.predict(X_test)

In [None]:
# Display the logistic-regression predictions.
LR_prediction

In [None]:
# Second downstream model: a random forest with default settings, fitted on
# the same features and encoded labels.
# NOTE(review): no `random_state` is set, so results may differ between runs — confirm this is acceptable.
rf_classifier=RandomForestClassifier()
rf_classifier.fit(X_train, y_train)

In [None]:
# Predicted (encoded) class ids for the test features.
RF_prediction=rf_classifier.predict(X_test)

In [None]:
# Map the encoded class ids back to the original label values.
RF_prediction=lbl_enc_train.inverse_transform(RF_prediction)

In [None]:
# Map the encoded class ids back to the original label values.
LR_prediction=lbl_enc_train.inverse_transform(LR_prediction)

In [None]:
# Blend weights for the two downstream models: `w1` is applied to the
# logistic-regression predictions and `w2` to the random-forest predictions.
# The 2:1 ratio is kept, but the weights are normalised to sum to exactly 1
# so the blended score stays on the same scale as the labels
# (.66 + .33 only added up to 0.99).
w1 = 2 / 3
w2 = 1 - w1

In [None]:
# Weighted combination of the two models' decoded predictions.
FINAL_prediction=w1*LR_prediction+w2*RF_prediction

In [None]:
# Display the blended predictions.
FINAL_prediction

In [None]:
# Load the sample submission as a template for the output file.
sub=pd.read_csv(CFG.sample_submission)

In [None]:
# Fill in the blended predictions.
# Assumes the sample submission rows are in the same order as the test set — TODO confirm.
sub['score'] = FINAL_prediction

In [None]:
# Write the submission file without the index column.
sub.to_csv('submission.csv', index=False)