# Text Classification - Vanilla Embeddings

----



## $\color{blue}{Sections:}$
* Preamble
* Admin - importing libraries
* Load - loading our data with pandas
* Embeddings - create the embeddings
* Save - save the embeddings on dataframes and docs

## $\color{blue}{Preamble:}$

This notebook is specifically for the augmented versions of the training set (the `*_augmentation.1` dataframes).

This notebook creates embeddings with ['thenlper/gte-base'](https://huggingface.co/thenlper/gte-base) and updates the dataframes and docs with them.

## $\color{blue}{Admin:}$


In [1]:
from google.colab import drive

In [2]:
# Mount Google Drive inside the Colab VM at /content/drive.
drive.mount("/content/drive")
# Switch the working directory to the Drive root; the relative dataset path
# used later in the notebook is resolved against this directory.
%cd '/content/drive/MyDrive/'


Mounted at /content/drive
/content/drive/MyDrive


In [None]:
%%capture
!pip install sentence-transformers huggingface_hub

In [4]:
%%capture
!pip install dill
!pip install langchain

## $\color{blue}{Load:}$

In [None]:
import pandas as pd
path = "class/datasets/" # modify path
# Load the three pickled splits in train / dev / test order.
# NOTE: unpickling runs arbitrary code from the file -- only load pickles you trust.
df_train, df_dev, df_test = (
    pd.read_pickle(path + split_file)
    for split_file in (
        "df_train_augmentation.1",
        "df_dev_augmentation.1",
        "df_test_augmentation.1",
    )
)

## $\color{blue}{Embeddings:}$

In [None]:
import os
from getpass import getpass
from huggingface_hub import login

# Ask for the Hugging Face access token interactively so it is never
# hardcoded in the notebook; an empty entry is allowed.
token = getpass(prompt="Please enter your Hugging Face token: ")

In [None]:
# Use the token for Hugging Face login
# Log in to the Hugging Face Hub only when a token was supplied;
# an empty entry skips authentication and the notebook carries on.
if not token:
    print("Continuing without Hugging Face login")
else:
    print("HuggingFace token has been successfully entered.")
    login(token=token)

In [None]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool token vectors along dim 1, ignoring masked-out positions.

    Args:
        last_hidden_states: per-token hidden states; expected to be
            (batch, seq_len, hidden) as returned by the model.
        attention_mask: mask with one entry per token, non-zero for real
            tokens and zero for padding; expected to be (batch, seq_len).

    Returns:
        The sum of the unmasked token vectors divided by the number of
        unmasked tokens, one vector per row.

    Note:
        A row whose mask is entirely zero divides by a zero token count.
    """
    # Broadcast the mask over the hidden axis and flip it to mark padding.
    padding = ~attention_mask.bool().unsqueeze(-1)
    # Zero the padded positions so they do not contribute to the sum.
    summed = last_hidden_states.masked_fill(padding, 0.0).sum(dim=1)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return summed / token_counts

# Smoke test: embed a single sentence to confirm the model loads and to
# inspect what one embedding looks like.
# Explicit one-element list; the original built a 1-tuple via a stray trailing comma.
input_texts = ["what is the capital of China?"]

tokenizer = AutoTokenizer.from_pretrained("thenlper/gte-base")
model = AutoModel.from_pretrained("thenlper/gte-base")
model.eval()  # inference only: make sure dropout is disabled

# Tokenize the input texts
batch_dict = tokenizer(input_texts, max_length=512, padding=True, truncation=True, return_tensors='pt')

# No gradients are needed for embedding extraction; skipping autograd
# bookkeeping saves memory and time on every forward pass.
with torch.no_grad():
    outputs = model(**batch_dict)
embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# (Optionally) normalize embeddings
embeddings = F.normalize(embeddings, p=2, dim=1)
# Move to CPU and drop the size-1 batch axis so a single vector remains.
embeddings = embeddings.cpu().detach().squeeze()

In [None]:
# Sanity check: report the object type, then show its dimensions as the cell output.
print(type(embeddings))
embeddings.shape

In [None]:
# Display the embedding computed for the sample sentence.
embeddings

### $\color{red}{Train:}$

In [None]:
from tqdm import tqdm
train_sentences = list(df_train['content'])
train_embeddings = []
# Inference only: without no_grad every forward pass records an autograd
# graph that is immediately thrown away, costing memory and time.
with torch.no_grad():
  for sent in tqdm(train_sentences):
    # One sentence per forward pass, truncated to at most 512 tokens.
    batch_dict = tokenizer(sent, max_length=512, padding=True, truncation=True, return_tensors='pt')
    output = model(**batch_dict)
    embedding = average_pool(output.last_hidden_state, batch_dict['attention_mask'])
    embedding = F.normalize(embedding, p=2, dim=1)  # scale to unit L2 norm
    # Move to CPU and drop the size-1 batch axis so one vector is stored per sentence.
    embedding = embedding.cpu().detach().squeeze()
    train_embeddings.append(embedding)

100%|██████████| 20474/20474 [1:29:55<00:00,  3.79it/s]


### $\color{red}{Dev:}$

In [None]:
from tqdm import tqdm
dev_sentences = list(df_dev['content'])
dev_embeddings = []
# Inference only: without no_grad every forward pass records an autograd
# graph that is immediately thrown away, costing memory and time.
with torch.no_grad():
  for sent in tqdm(dev_sentences):
    # One sentence per forward pass, truncated to at most 512 tokens.
    batch_dict = tokenizer(sent, max_length=512, padding=True, truncation=True, return_tensors='pt')
    output = model(**batch_dict)
    embedding = average_pool(output.last_hidden_state, batch_dict['attention_mask'])
    embedding = F.normalize(embedding, p=2, dim=1)  # scale to unit L2 norm
    # Move to CPU and drop the size-1 batch axis so one vector is stored per sentence.
    embedding = embedding.cpu().detach().squeeze()
    dev_embeddings.append(embedding)

100%|██████████| 746/746 [03:09<00:00,  3.94it/s]


### $\color{red}{Test:}$

In [None]:
from tqdm import tqdm
test_sentences = list(df_test['content'])
test_embeddings = []
# Inference only: without no_grad every forward pass records an autograd
# graph that is immediately thrown away, costing memory and time.
with torch.no_grad():
  for sent in tqdm(test_sentences):
    # One sentence per forward pass, truncated to at most 512 tokens.
    batch_dict = tokenizer(sent, max_length=512, padding=True, truncation=True, return_tensors='pt')
    output = model(**batch_dict)
    embedding = average_pool(output.last_hidden_state, batch_dict['attention_mask'])
    embedding = F.normalize(embedding, p=2, dim=1)  # scale to unit L2 norm
    # Move to CPU and drop the size-1 batch axis so one vector is stored per sentence.
    embedding = embedding.cpu().detach().squeeze()
    test_embeddings.append(embedding)

100%|██████████| 500/500 [02:11<00:00,  3.81it/s]


## $\color{blue}{Save:}$

### $\color{red}{Save-DataFrames:}$

In [None]:
# Turn every per-sentence tensor into a NumPy array, one list per split,
# before the vectors are stored on the dataframes.
train_embeddings_np = [vec.numpy() for vec in train_embeddings]
dev_embeddings_np = [vec.numpy() for vec in dev_embeddings]
test_embeddings_np = [vec.numpy() for vec in test_embeddings]


In [None]:
# Store each split's embeddings as a new column on its own dataframe.
for frame, vectors in (
    (df_train, train_embeddings_np),
    (df_dev, dev_embeddings_np),
    (df_test, test_embeddings_np),
):
  frame['vanilla_embedding.1'] = vectors


In [None]:
# Preview the first five rows to confirm the embedding column was added.
df_train.head(n=5)

Unnamed: 0,master,book_idx,chapter_idx,content,vanilla_embedding.1
0,Ulysses,0,0,"Halted, he peered down the dark winding stairs...","[-0.01852537, -0.021713095, 0.041504614, -0.00..."
1,Ulysses,0,0,"Then, catching sight of Stephen Dedalus, he be...","[-0.019168912, -0.0048065097, -0.012622914, -0..."
2,Ulysses,0,0,"Stephen Dedalus, displeased and sleepy, leaned...","[-0.025832051, -0.0060330997, -0.013755375, 0...."
3,Ulysses,0,0,he said sternly. He added in a preacher’s to...,"[-0.008437265, -0.011068143, 0.029162964, 0.00..."
4,Ulysses,0,0,He peered sideways up and gave a long slow whi...,"[-0.016204245, 0.015205742, 0.023865266, -0.01..."


In [None]:
# Write each updated dataframe back to the pickle file it was loaded from,
# overwriting the previous version.
for frame, filename in (
    (df_train, "df_train_augmentation.1"),
    (df_dev, "df_dev_augmentation.1"),
    (df_test, "df_test_augmentation.1"),
):
  frame.to_pickle(path + filename)