# Feature Engineering based on GPT-2 (768 dim embedding)

## 1. Imports and Hardware Accelerators

In [1]:
# Install the HuggingFace libraries used below: `transformers` is built from the
# head of its GitHub repository and `datasets` comes from PyPI.
# NOTE(review): neither install pins a version, so a re-run may pick up different
# releases than the ones that produced the saved outputs -- consider pinning.
!pip install -q git+https://github.com/huggingface/transformers.git
!pip install datasets

from datasets import load_dataset, Dataset
import os
import tensorflow as tf
from transformers import DataCollatorWithPadding, GPT2Tokenizer, TFGPT2Model
import numpy as np
from tqdm.notebook import tqdm
import pandas as pd
import json

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
    Preparing wheel metadata ... [?25l[?25hdone


In [2]:
# see GPU being used
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Sun Mar  6 11:03:45 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P0    29W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from psutil import virtual_memory

# total physical memory of the runtime, in (decimal) gigabytes
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the threshold this notebook uses to call a runtime "high-RAM"
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram
      else 'Not using a high-RAM runtime')

Your runtime has 27.3 gigabytes of available RAM

You are using a high-RAM runtime!


## 2. Data Imports

### 2.1. SNLI - Main Dataset - contains the training dataset and in-distribution evaluation datasets

In [4]:
# Load SNLI through the HuggingFace `datasets` loader; the bare expression on the
# last line displays the resulting DatasetDict (its splits, columns and row counts).
snli = load_dataset("snli")
snli

Reusing dataset snli (/root/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})

### 2.2. Hans Dataset - for out-of-distribution evaluation

In [5]:
# Load HANS through the HuggingFace `datasets` loader; the bare expression on the
# last line displays the resulting DatasetDict (its splits, columns and row counts).
hans = load_dataset("hans")
hans

Reusing dataset hans (/root/.cache/huggingface/datasets/hans/plain_text/1.0.0/1bbcb735c482acd54f2e118074b59cfd2bf5f7a5a285d4d540d1e632216672ac)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'heuristic', 'subcase', 'template'],
        num_rows: 30000
    })
})

### 2.3. NLI Diagnostics - for out-of-distribution evaluation

In [6]:
!wget -q -nc https://www.dropbox.com/s/ju7d95ifb072q9f/diagnostic-full.tsv
nli_diagnostics = pd.read_csv('/content/diagnostic-full.tsv', 
                              delimiter = '\t')

## 3. Initial Data Pre-processing

### 3.1. SNLI

In [7]:
# For every SNLI split, pull out the premise texts, the hypothesis texts and the
# labels as three separate sequences (same order in each split).
snli_columns = ("premise", "hypothesis", "label")

snli_train_premise, snli_train_hypothesis, snli_train_label = (
  snli["train"][column] for column in snli_columns)

snli_valid_premise, snli_valid_hypothesis, snli_valid_label = (
  snli["validation"][column] for column in snli_columns)

snli_test_premise, snli_test_hypothesis, snli_test_label = (
  snli["test"][column] for column in snli_columns)

In [8]:
# peek at the first five training premises, hypotheses and labels
snli_train_premise[:5], snli_train_hypothesis[:5], snli_train_label[:5]

(['A person on a horse jumps over a broken down airplane.',
  'A person on a horse jumps over a broken down airplane.',
  'A person on a horse jumps over a broken down airplane.',
  'Children smiling and waving at camera',
  'Children smiling and waving at camera'],
 ['A person is training his horse for a competition.',
  'A person is at a diner, ordering an omelette.',
  'A person is outdoors, on a horse.',
  'They are smiling at their parents',
  'There are children present'],
 [1, 2, 0, 1, 0])

### 3.2. Hans

In [9]:
# Flatten HANS into one list of sentences, in the order:
# train premises, train hypotheses, validation premises, validation hypotheses.
all_hans = []
for split_name in ("train", "validation"):
  for column in ("premise", "hypothesis"):
    all_hans += hans[split_name][column]

### 3.3. NLI Diagnostics

In [10]:
# one flat list: every diagnostic premise followed by every diagnostic hypothesis
all_nli_diag = [*nli_diagnostics["Premise"], *nli_diagnostics["Hypothesis"]]

### 3.4. Combine all data
NB: Combining all texts into a single list is acceptable here because GPT-2 is used only to compute a feature representation (embedding) for each sequence in the list; no training takes place at this stage.

In [11]:
# Gather every sentence from all three corpora into one flat list and wrap it in
# a single-column HuggingFace Dataset. The order of the sources below fixes the
# row order of the dataset (and therefore of the features computed from it).
text_sources = (
  snli_train_premise, snli_train_hypothesis,
  snli_valid_premise, snli_valid_hypothesis,
  snli_test_premise, snli_test_hypothesis,
  all_hans, all_nli_diag,
)
all_texts = [text for source in text_sources for text in source]

hf_dataset = Dataset.from_dict({"all_texts": all_texts})

## 4. Feature Engineering using GPT2 (768 dim hidden state)

### 4.1. Preparation

In [12]:
# Set up the GPT-2 tokenizer.
# - padding_side = 'left': pad tokens go in front of the text, so the last
#   position of every padded sequence is a real token (the one whose hidden
#   state is used as the sequence feature).
# - `padding` / `truncation` are deliberately NOT passed here: they are options
#   of the tokenizer *call* (and of the data collator), not of `from_pretrained`,
#   where they would be silently ignored.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2", padding_side = 'left')
tokenizer.pad_token = tokenizer.eos_token # GPT-2 has no pad token of its own; reuse 'eos'

In [14]:
# Data collator (https://huggingface.co/docs/transformers/main_classes/data_collator):
# a callable that assembles individual tokenized examples into a model-ready batch.
# Here every sequence is padded to a fixed length of 100 tokens and the batch is
# returned as TensorFlow tensors.
data_collator = DataCollatorWithPadding(
  tokenizer,
  padding = 'max_length',
  max_length = 100,
  return_tensors = 'tf',
)

In [15]:
# Load the pre-trained GPT-2 network (TensorFlow implementation)
model = TFGPT2Model.from_pretrained("gpt2")

# Keep the input embedding matrix in step with the tokenizer's vocabulary
# (HuggingFace: resizes the matrix if num_tokens != config.vocab_size)
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

# Tell the model which token id the tokenizer pads with ('eos')
model.config.pad_token_id = model.config.eos_token_id

All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


### 4.2. Prep data for input into GPT2

In [16]:
# Tokenize all texts, then expose them as a batched tf.data pipeline for the model.
#
# Truncation has to be requested here, at tokenization time: the data collator
# only pads (to 100 tokens), so a text longer than that would otherwise produce
# a batch whose rows have different lengths.
max_length = 100  # must equal the `max_length` the data collator pads to

tokenized_tf = hf_dataset.map(
  lambda x: tokenizer(x['all_texts'], truncation = True, max_length = max_length),
  batched = True)

tokenized_tf = tokenized_tf.to_tf_dataset(
  columns = ['input_ids', 'attention_mask'],
  shuffle = False, # no training going on; we just want GPT-2 features in input order
  batch_size = 300,
  collate_fn = data_collator
)

tokenized_tf

  0%|          | 0/1263 [00:00<?, ?ba/s]

<PrefetchDataset element_spec={'input_ids': TensorSpec(shape=(None, None), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, None), dtype=tf.int64, name=None)}>

### 4.3. Get Features: take the final hidden state at the last token position after a forward pass through the pre-trained GPT-2 network

In [None]:
# Test run over the whole pipeline: collect the padded sequence length of every
# batch and confirm that it is always 100 (the collator's max_length).
# Iterating also surfaces any batch that cannot be assembled at all.
seq_lengths = {int(batch['input_ids'].shape[1]) for batch in tokenized_tf}
assert seq_lengths == {100}, "unexpected padded lengths: {}".format(seq_lengths)
seq_lengths

In [18]:
# Run every batch through GPT-2 and keep, per sequence, the final hidden state
# at the last token position. Features are collected in RAM for
# SUPER_BATCH_SIZE batches at a time and then written to
# ./output/out1.csv, ./output/out2.csv, ... (one row per input text, in input order).

output_dir = './output'
dir = output_dir  # legacy alias (shadows the builtin `dir`); kept in case other cells still read it
os.makedirs(output_dir, exist_ok = True)  # unlike os.mkdir, does not raise when the cell is re-run

SUPER_BATCH_SIZE = 1000  # number of batches held in RAM before they are flushed to one CSV file


def _write_super_batch(chunks, file_num):
  """Stack per-batch feature arrays and save them as `<output_dir>/out<file_num>.csv`.

  Args:
    chunks: non-empty list of 2-D numpy arrays, one per batch, in batch order.
    file_num: 1-based number of the super-batch; used in the file name.

  Returns:
    The stacked 2-D array that was written to disk.
  """
  features = np.concatenate(chunks)  # a single copy per file instead of one per batch
  np.savetxt(os.path.join(output_dir, "out" + str(file_num) + ".csv"),
             features, delimiter = ",")
  return features


pending = []   # feature arrays of the current, not yet written, super-batch
file_num = 0   # number of CSV files written so far
out = None     # features of the most recently written super-batch

for idx, batch in tqdm(enumerate(tokenized_tf), total = len(tokenized_tf)):

  # slice on the tensor *before* converting to numpy, so only the last-position
  # features are kept around -- not a view onto the full hidden-state array
  pending.append(model(**batch).last_hidden_state[:, -1, :].numpy())

  # super-batch complete: write it out to file -- to clear RAM
  if len(pending) == SUPER_BATCH_SIZE:
    file_num += 1
    out = _write_super_batch(pending, file_num)
    pending = []

# write out the final, partially filled super-batch (if any). Using a running
# file counter guarantees it never reuses the number of a file already written.
if pending:
  file_num += 1
  out = _write_super_batch(pending, file_num)

  0%|          | 0/4209 [00:00<?, ?it/s]