## Embedding Visualization with TSNE

The last-layer hidden state of the first token (`[CLS]`, i.e. `<s>` for RoBERTa models) is used as the sentence embedding.

In [1]:
# Mount Google Drive; the project directory used below lives under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive/'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive/


In [2]:
!pip install transformers
!pip3 install pickle5
!pip install --upgrade --user pandas==1.3
!pip install pyyaml==5.4.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 14.5 MB/s 
Collecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 70.7 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 63.2 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pickle5
  Downloading pickle5-0.0.11.tar.gz (132 kB)
[K     |████████████████████████████████| 132 kB 13.3 MB/s 
[?25hB

In [3]:
import numpy as np
import pandas as pd
import pickle5 as pickle
import plotly.express as px

from pathlib import Path
from sklearn.manifold import TSNE
from transformers import AutoConfig, AutoModelForMaskedLM, AutoTokenizer, RobertaModel, AutoModel, TextClassificationPipeline, AutoModelForSequenceClassification
import torch
from tqdm import tqdm
from tqdm.autonotebook import trange
import torch.nn.functional as F

In [4]:
# Root of the project on the mounted Google Drive.
PROJECT_DIR = Path('/content/drive/MyDrive/few-shot-mr/')

# Directory names (inside MR_new/) of the model checkpoints that are compared.
MODEL_NAMES = [
    'MR-RL-20-NORMAL',
    'MR-RL-S20-0.9_la-0.6_SupCon_only_cls',
    'MR-RL-S20-C25_0.4_m0.7_la-9_ga0.1_SoftTripleLoss_all',
]

MODELS = [PROJECT_DIR / f'MR_new/{name}' for name in MODEL_NAMES]

# The dataset pickle is read from this checkpoint's directory.
BASE_MODEL = PROJECT_DIR / 'MR_new/MR-RL-20-NORMAL'

In [7]:
# Load the pickled DataFrame stored in the base model's directory.
# NOTE(review): unpickling runs code embedded in the file; only load files you trust.
with BASE_MODEL.joinpath("MR_all.pkl").open('rb') as pickle_file:
  df = pickle.load(pickle_file)

# Discard rows whose sentence is empty, a bare newline, or the "* \n" placeholder.
is_blank_sentence = df["sentence"].isin(["* \n", "\n", ""])
df = df[~is_blank_sentence]
df.shape

(10662, 2)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def batch_to_device(batch, target_device: torch.device):
    """
    Send a PyTorch batch to a device (CPU/GPU).

    Every value stored in ``batch`` is replaced, in place, by the result of
    calling its ``.to(target_device)`` method.

    Args:
        batch: mutable mapping whose values expose ``.to(device)``
            (for example a dict of tensors).
        target_device: device the values are moved to.

    Returns:
        The same ``batch`` object, now holding the moved values.
    """
    # The annotation names the ``torch.device`` type instead of the
    # module-level ``device`` instance, so defining this function no longer
    # requires that variable to exist first.
    for key in batch:
        batch[key] = batch[key].to(target_device)
    return batch


def prepare_embeddings(examples, model, tokenizer, device, batch_size=4):
  """
  Compute one embedding per example from the first token of the last hidden state.

  Args:
    examples: sequence of sentences; it is sliced into chunks of ``batch_size``.
    model: model whose output exposes ``last_hidden_state``; it is moved to
      ``device`` and switched to evaluation mode.
    tokenizer: callable that turns a list of sentences into a dict of tensors.
    device: device on which the forward passes run.
    batch_size: number of sentences per forward pass.

  Returns:
    A list holding one numpy vector per example, in input order.
  """
  embeddings = []
  model = model.to(device)
  model.eval()  # inference only: make sure dropout is disabled
  with torch.no_grad():  # no autograd graph is needed; this saves memory
    for start_index in trange(0, len(examples), batch_size, desc="Batches"):
      sentences_batch = examples[start_index:start_index+batch_size]
      # NOTE(review): every batch is padded to 512 tokens; padding to the longest
      # sentence of the batch would likely be faster - confirm before changing.
      features = tokenizer(sentences_batch, padding='max_length', max_length=512, truncation=True, return_tensors='pt')
      features = batch_to_device(features, device)
      # Keep the batch dimension (no squeeze): a batch holding a single
      # sentence must still contribute exactly one row, not one scalar per
      # hidden unit.
      first_token_states = model(**features).last_hidden_state[:, 0, :]
      embeddings.extend(first_token_states.cpu().numpy())
  return embeddings


def prepare_predictions(examples, model, tokenizer, device, batch_size=4):
  """
  Predict a class index for every example.

  Args:
    examples: sequence of sentences; it is sliced into chunks of ``batch_size``.
    model: model whose output exposes ``logits``; it is moved to ``device``
      and switched to evaluation mode.
    tokenizer: callable that turns a list of sentences into a dict of tensors.
    device: device on which the forward passes run.
    batch_size: number of sentences per forward pass.

  Returns:
    A list with one predicted class index per example, in input order.
  """
  predictions = []
  model = model.to(device)
  model.eval()  # inference only: make sure dropout is disabled
  with torch.no_grad():  # no autograd graph is needed; this saves memory
    for start_index in trange(0, len(examples), batch_size, desc="Batches"):
      sentences_batch = examples[start_index:start_index+batch_size]
      features = tokenizer(sentences_batch, padding='max_length', max_length=512, truncation=True, return_tensors='pt')
      features = batch_to_device(features, device)
      outputs = model(**features)
      # The softmax axis is now explicit (calling softmax without ``dim`` is
      # deprecated); it is the same axis that the argmax below reduces over.
      probas = F.softmax(outputs.logits, dim=1)
      predictions.extend(probas.argmax(dim=1).cpu().numpy())
  return predictions


### Prepare embeddings

In [None]:
examples = df['sentence'].to_list()

# np.save does not create missing directories, so make sure the target exists.
embeddings_dir = PROJECT_DIR.joinpath('embeddings/20_MR_new')
embeddings_dir.mkdir(parents=True, exist_ok=True)

for model_path in MODELS:
  tokenizer = AutoTokenizer.from_pretrained(model_path)
  # Only last_hidden_state is read by prepare_embeddings, so the model is loaded
  # with its stored config instead of asking it to return every layer's states.
  model = AutoModel.from_pretrained(model_path)
  embeddings = prepare_embeddings(examples, model, tokenizer, device, batch_size=4)
  assert len(embeddings) == len(examples), (
      f"expected {len(examples)} embeddings, got {len(embeddings)}"
  )

  # The file name uses the lower-cased text after the last "-" of the model path;
  # the visualization cell rebuilds the same name to load the file.
  model_type = str(model_path).split("-")[-1].lower()
  np.save(embeddings_dir.joinpath(f'embeddings_{model_type}_all.npy'), np.asarray(embeddings))

Some weights of the model checkpoint at /content/drive/MyDrive/wcci2022/MR_new/MR-RL-20-NORMAL were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/wcci2022/MR_new/MR-RL-20-NORMAL and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be ab

Batches:   0%|          | 0/2666 [00:00<?, ?it/s]

Some weights of the model checkpoint at /content/drive/MyDrive/wcci2022/MR_new/MR-RL-S20-0.9_la-0.6_SupCon_only_cls were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/wcci2022/MR_new/MR-RL-S20-0.9_la-0.6_SupCon_only_cls and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN

Batches:   0%|          | 0/2666 [00:00<?, ?it/s]

Some weights of the model checkpoint at /content/drive/MyDrive/wcci2022/MR_new/MR-RL-S20-C25_0.4_m0.7_la-9_ga0.1_SoftTripleLoss_all were not used when initializing RobertaModel: ['classifier.dense.bias', 'classifier.out_proj.bias', 'ntloss.fc', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at /content/drive/MyDrive/wcci2022/MR_new/MR-RL-S20-C25_0.4_m0.7_la-9_ga0.1_SoftTripleLoss_all and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.

Batches:   0%|          | 0/2666 [00:00<?, ?it/s]

### t-SNE visualization

In [5]:
def visualize_tsne(embeddings, label, sentence, model_type, random_state=42):
  """
  Project embeddings to 2-D with t-SNE, show the scatter plot and save it as HTML.

  Args:
    embeddings: array-like with one embedding per row.
    label: one label per row; used to color the points.
    sentence: one text per row; shown when hovering over a point.
    model_type: identifier inserted into the name of the saved HTML file.
    random_state: seed passed to t-SNE so that repeated runs yield the same
      layout; pass ``None`` for an unseeded run.

  The figure is written to
  ``PROJECT_DIR/embeddings/20_MR_new/cls-{model_type}-embedding.html``.
  """
  X_embedded = TSNE(n_components=2, random_state=random_state).fit_transform(embeddings)

  # np.asarray drops any pandas index, so labels and texts are attached by
  # position; a Series with a non-contiguous index would otherwise be aligned
  # by index label and leave NaN rows.
  df_embeddings = pd.DataFrame(X_embedded, columns=['x', 'y']).assign(
      label=np.asarray(label),
      text=np.asarray(sentence),
  )
  fig = px.scatter(
      df_embeddings, x='x', y='y',
      color='label', labels={'color': 'label'},
      hover_data=['text'], title = 'Embedding Visualization',
      width=1000,
      height=1000,)
  fig.show()

  save_path = PROJECT_DIR.joinpath(f"embeddings/20_MR_new/cls-{model_type}-embedding.html")
  # Make sure the output directory exists before writing the file.
  save_path.parent.mkdir(parents=True, exist_ok=True)
  print(f"Saving file in {save_path}")
  fig.write_html(str(save_path))

In [None]:
for model_path in MODELS:
  print(model_path)
  # Same naming rule as the export cell: lower-cased text after the last "-".
  model_type = str(model_path).split("-")[-1].lower()
  embeddings = np.load(PROJECT_DIR.joinpath(f'embeddings/20_MR_new/embeddings_{model_type}_all.npy'))
  # The saved rows must line up one-to-one with the rows of df; otherwise labels
  # and hover texts would silently be attached to the wrong points.
  assert len(embeddings) == len(df), (
      f"{model_type}: {len(embeddings)} embeddings but {len(df)} rows in df"
  )
  visualize_tsne(embeddings, label=df['label'].values, sentence=df['sentence'].values, model_type=model_type)