# Text Classification - Augmented Finetuned Embeddings

----



## $\color{blue}{Sections:}$
* Preamble
* Admin - importing libraries
* Load - load our pickled dataframes with pandas
* Embeddings - create the embeddings
* Save - save the embeddings on dataframes and docs

## $\color{blue}{Preamble:}$

This notebook is specifically for the augmented versions of the training set.

This notebook creates embeddings from a finetuned version of ['thenlper/gte-base'](https://huggingface.co/thenlper/gte-base) and updates the dataframes and docs with them.

## $\color{blue}{Admin:}$


In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive, then switch the working directory to
# the Drive root so the relative paths used later in the notebook resolve there.
drive.mount("/content/drive")
%cd '/content/drive/MyDrive/'


Mounted at /content/drive
/content/drive/MyDrive


In [None]:
%%capture
!pip install sentence-transformers huggingface_hub

In [None]:
%%capture
!pip install dill
!pip install langchain

In [None]:
import torch
import numpy as np
# Select the compute device: CUDA when it is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


## $\color{blue}{Load:}$

In [None]:
import pandas as pd
path = "class/datasets/" # modify path
# Read the three pickled splits in train / dev / test order.
df_train, df_dev, df_test = (
    pd.read_pickle(path + pickle_name)
    for pickle_name in (
        "df_train_augmentation.1",
        "df_dev_augmentation.1",
        "df_test_augmentation.1",
    )
)

## $\color{blue}{Embeddings:}$

In [None]:
import os
from getpass import getpass
from huggingface_hub import login

# Ask for the Hugging Face token via getpass so the value is typed at run time
# instead of being written into the notebook source.
token = getpass("Please enter your Hugging Face token: ")

Please enter your Hugging Face token: ··········


In [None]:
# Log in to the Hugging Face Hub only when a non-empty token was entered.
if not token:
    print("Continuing without Hugging Face login")
else:
    print("HuggingFace token has been successfully entered.")
    login(token=token)

HuggingFace token has been successfully entered.


In [None]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Average the token vectors along dim 1, ignoring masked-out positions.

    Args:
        last_hidden_states: Per-token hidden states; assumed to be shaped
            (batch, seq_len, hidden) -- TODO confirm against the model output.
        attention_mask: Mask aligned with the first two dims of
            ``last_hidden_states``; nonzero entries mark tokens to keep.

    Returns:
        The sum of the kept token vectors divided by the number of kept
        tokens, with dim 1 reduced away.
    """
    # Positions where the mask is zero contribute nothing to the sum.
    padding_positions = ~attention_mask.unsqueeze(-1).bool()
    kept_states = last_hidden_states.masked_fill(padding_positions, 0.0)
    # Per-row count of kept tokens, with a trailing axis so it broadcasts.
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return kept_states.sum(dim=1) / token_counts

# Single example sentence used to smoke-test the embedding pipeline. An explicit
# list replaces the tuple that the original trailing comma created.
input_texts = ["what is the capital of China?"]

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
model = AutoModel.from_pretrained("thenlper/gte-base")

# Overwrite the base weights with the finetuned checkpoint. map_location="cpu"
# lets a checkpoint written from a GPU session load on a CPU-only runtime; the
# model is still on the CPU at this point in the notebook.
# NOTE(review): torch.load unpickles the file -- only load checkpoints you trust.
finetuned_state = torch.load(
    'class/models/direct_ft_augmented_embedding_model.pt',
    map_location="cpu",
)
model.load_state_dict(finetuned_state)
model.eval()  # inference only: keep dropout disabled

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# Embedding extraction needs no gradients, so skip building the autograd graph.
with torch.no_grad():
    outputs = model(**batch_dict)
    embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

    # (Optionally) normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)

# Keep a CPU copy with the size-1 batch dimension dropped.
embeddings = embeddings.cpu().squeeze()

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/219M [00:00<?, ?B/s]

In [None]:
# Move the model onto the selected device; the embedding loops below send
# their tokenized inputs to the same device.
model = model.to(device)

In [None]:
# Sanity check on the example embedding: show its Python type and its size.
print(type(embeddings))
embeddings.size()

<class 'torch.Tensor'>


torch.Size([768])

In [None]:
# Preview only the first few components instead of dumping the whole vector.
embeddings[:10]

tensor([ 1.5256e-02, -8.4819e-03, -2.8939e-02,  6.0337e-02, -1.9855e-02,
         2.6115e-02,  4.7499e-02,  4.8150e-02,  2.0088e-02, -2.1950e-02,
        -3.7548e-02, -3.7851e-02, -4.1342e-02,  2.7627e-02,  1.8121e-03,
         3.0122e-02,  6.3803e-02,  3.6288e-03, -3.1450e-02,  2.7614e-02,
        -1.5874e-02, -4.0164e-04,  1.6758e-03,  3.7804e-02,  6.1533e-02,
         3.2934e-02, -3.4332e-02, -3.1549e-03, -7.4693e-02, -2.3001e-02,
        -2.6533e-02,  1.4006e-02,  2.9143e-02,  2.6086e-02,  8.8917e-03,
        -1.2343e-02, -3.2415e-02,  3.5483e-03,  2.0994e-03, -1.7454e-03,
        -2.0006e-03, -2.8238e-02, -2.0821e-02,  1.9302e-03, -9.9600e-03,
        -3.5052e-04, -1.1172e-02,  3.4983e-02, -5.5392e-02,  1.8625e-02,
        -6.2504e-02,  3.1712e-02, -1.6220e-02,  3.6161e-02,  1.5239e-03,
         6.3138e-02,  1.9728e-02, -8.6995e-02, -4.6716e-02, -5.0675e-02,
         6.0784e-02,  4.2556e-02,  5.8336e-02, -8.3014e-03,  1.9886e-02,
         1.8685e-02, -1.2038e-02,  5.3014e-02, -5.8

### $\color{red}{Train:}$

In [None]:
from tqdm import tqdm
train_sentences = list(df_train['content'])
train_embeddings = []
model.eval()  # inference only: keep dropout disabled
# Embedding extraction needs no gradients; no_grad avoids building an autograd
# graph for every sentence, which saves GPU memory and time.
with torch.no_grad():
    for sent in tqdm(train_sentences):
        batch_dict = tokenizer(sent, max_length=512, padding=True, truncation=True, return_tensors='pt')
        # Send every tokenizer output tensor to the device the model was moved to.
        batch_dict = {key: value.to(device) for key, value in batch_dict.items()}
        output = model(**batch_dict)
        embedding = average_pool(output.last_hidden_state, batch_dict['attention_mask'])
        embedding = F.normalize(embedding, p=2, dim=1)
        # Keep a CPU copy with the size-1 batch dimension dropped.
        embedding = embedding.cpu().squeeze()
        train_embeddings.append(embedding)

100%|██████████| 20474/20474 [04:19<00:00, 78.78it/s]


### $\color{red}{Dev:}$

In [None]:
from tqdm import tqdm
dev_sentences = list(df_dev['content'])
dev_embeddings = []
model.eval()  # inference only: keep dropout disabled
# Embedding extraction needs no gradients; no_grad avoids building an autograd
# graph for every sentence, which saves GPU memory and time.
with torch.no_grad():
    for sent in tqdm(dev_sentences):
        batch_dict = tokenizer(sent, max_length=512, padding=True, truncation=True, return_tensors='pt')
        # Send every tokenizer output tensor to the device the model was moved to.
        batch_dict = {key: value.to(device) for key, value in batch_dict.items()}
        output = model(**batch_dict)
        embedding = average_pool(output.last_hidden_state, batch_dict['attention_mask'])
        embedding = F.normalize(embedding, p=2, dim=1)
        # Keep a CPU copy with the size-1 batch dimension dropped.
        embedding = embedding.cpu().squeeze()
        dev_embeddings.append(embedding)

100%|██████████| 746/746 [00:09<00:00, 79.28it/s]


### $\color{red}{Test:}$

In [None]:
from tqdm import tqdm
test_sentences = list(df_test['content'])
test_embeddings = []
model.eval()  # inference only: keep dropout disabled
# Embedding extraction needs no gradients; no_grad avoids building an autograd
# graph for every sentence, which saves GPU memory and time.
with torch.no_grad():
    for sent in tqdm(test_sentences):
        batch_dict = tokenizer(sent, max_length=512, padding=True, truncation=True, return_tensors='pt')
        # Send every tokenizer output tensor to the device the model was moved to.
        batch_dict = {key: value.to(device) for key, value in batch_dict.items()}
        output = model(**batch_dict)
        embedding = average_pool(output.last_hidden_state, batch_dict['attention_mask'])
        embedding = F.normalize(embedding, p=2, dim=1)
        # Keep a CPU copy with the size-1 batch dimension dropped.
        embedding = embedding.cpu().squeeze()
        test_embeddings.append(embedding)

100%|██████████| 500/500 [00:06<00:00, 79.59it/s]


## $\color{blue}{Save:}$

### $\color{red}{Save-DataFrames:}$

In [None]:
# Convert each split's list of tensors into a list of NumPy arrays,
# handling the splits in train / dev / test order.
train_embeddings_np, dev_embeddings_np, test_embeddings_np = (
    [vector.numpy() for vector in split_embeddings]
    for split_embeddings in (train_embeddings, dev_embeddings, test_embeddings)
)


In [None]:
# Attach each split's finetuned embeddings to its dataframe as a new column.
for split_frame, split_vectors in (
    (df_train, train_embeddings_np),
    (df_dev, dev_embeddings_np),
    (df_test, test_embeddings_np),
):
    split_frame['direct_ft_augmented_embedding'] = split_vectors


In [None]:
# Preview the first rows to confirm the new embedding column was added.
df_train.head()

Unnamed: 0,master,book_idx,chapter_idx,content,vanilla_embedding.1,direct_ft_augmented_embedding
0,Ulysses,0,0,"Halted, he peered down the dark winding stairs...","[-0.01852537, -0.021713095, 0.041504614, -0.00...","[0.008948476, -0.0041401153, 0.06354155, -0.00..."
1,Ulysses,0,0,"Then, catching sight of Stephen Dedalus, he be...","[-0.019168912, -0.0048065097, -0.012622914, -0...","[0.013568789, -0.0069108065, 0.04533954, -0.00..."
2,Ulysses,0,0,"Stephen Dedalus, displeased and sleepy, leaned...","[-0.025832051, -0.0060330997, -0.013755375, 0....","[-0.0016611578, -0.013049294, 0.03868145, 0.01..."
3,Ulysses,0,0,he said sternly. He added in a preacher’s to...,"[-0.008437265, -0.011068143, 0.029162964, 0.00...","[0.0033423274, -0.027423449, 0.05888734, 0.001..."
4,Ulysses,0,0,He peered sideways up and gave a long slow whi...,"[-0.016204245, 0.015205742, 0.023865266, -0.01...","[0.0075438805, -0.027853461, 0.06111119, -0.00..."


In [None]:
# Write each updated dataframe to its own pickle under `path`.
for split_frame, pickle_name in (
    (df_train, "df_train_augmentation_ft"),
    (df_dev, "df_dev_augmentation_ft"),
    (df_test, "df_test_augmentation_ft"),
):
    split_frame.to_pickle(path + pickle_name)