In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m63.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m104.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m94.6 MB/s[0m eta [36m0:00:

In [2]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.3 MB[0m [31m5.8 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.3 MB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [3]:
import pandas as pd
from transformers import T5Tokenizer, T5Model
import torch

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
cd /content/drive/MyDrive/text_classification/word_vectors/t5

/content/drive/MyDrive/text_classification/word_vectors/t5


In [7]:
# Read excel file
df = pd.read_excel('./data.xlsx')

# Initialize T5 tokenizer and model
tokenizer = T5Tokenizer.from_pretrained('t5-base')
model = T5Model.from_pretrained('t5-base')

# Create a list to store all the word vectors
all_vectors = []

# For each row in the DataFrame
for index, row in df.iterrows():
    # Input text
    text = row['review']

    # Encode the input
    inputs = tokenizer.encode_plus(text, return_tensors='pt', padding=True, truncation=True)

    # Provide dummy decoder inputs
    inputs['decoder_input_ids'] = torch.tensor([[0]])

    # Pass the input through the model and get the word vectors
    outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

    # Append the tensor to all_vectors
    all_vectors.append(last_hidden_states)

# Concatenate the tensors along the batch dimension
all_vectors = torch.cat(all_vectors, dim=0)

# Save to file
torch.save(all_vectors, 't5_vectors.pt')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [10]:
# get a word vector
word = "宗教"
word_index = tokenizer.convert_tokens_to_ids(word)
word_vector = all_vectors[word_index]

In [11]:
word_vector

tensor([[-8.1013e-02,  3.0991e-03,  7.0416e-02, -1.1984e-01,  1.0911e-02,
          5.4737e-02,  9.7534e-04, -4.6635e-02, -3.0373e-02, -1.9431e-02,
         -3.1763e-02, -2.3609e-02, -9.9756e-01, -4.2645e-02,  4.5994e-02,
          4.0119e-02, -1.3130e-02, -9.3535e-02,  9.0953e-02,  9.6026e-03,
          4.0389e-02, -6.0000e-02, -6.8650e-02,  3.6024e-02, -3.9379e-03,
          5.9535e-02, -1.0701e-01,  8.1566e-01, -2.0432e-06, -7.9362e-03,
          3.3575e-02,  1.2464e+00, -1.1482e-03,  4.0915e-03, -1.4089e-01,
          2.3354e-02,  5.1622e-02, -1.0110e-01, -8.8161e-02, -6.5211e-04,
          4.4628e-02,  4.1752e-02,  6.0915e-02,  3.9070e-02,  2.3365e-01,
          7.8357e-02, -2.0556e-02, -9.3222e-02, -1.8359e-02, -5.4452e-02,
         -4.0602e-02, -8.1542e-02, -1.7528e-01,  5.6859e-02,  5.8969e+00,
         -1.7546e-02, -9.6429e-02,  1.4797e-01, -3.5602e-03,  7.4621e-02,
          9.4105e-02,  5.7047e-02,  6.7089e-02,  1.8988e-02,  7.2183e-02,
          6.8254e-01,  3.5832e-02, -4.

In [12]:
word_vector.size(-1)

768