In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
cd /content/drive/MyDrive/text_classification/word_vectors/ernie

/content/drive/MyDrive/text_classification/word_vectors/ernie


In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m28.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m47.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m56.2 MB/s[0m eta [36m0:00:0

In [7]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel

In [8]:
tokenizer = AutoTokenizer.from_pretrained('nghuyong/ernie-1.0')
model = AutoModel.from_pretrained('nghuyong/ernie-1.0')

data = pd.read_excel('./data.xlsx')
sentences = data['review'].tolist()

word_vectors = []

for sentence in sentences:
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding=True)

    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    word_vector = outputs.last_hidden_state.squeeze(0).numpy()
    word_vectors.append(word_vector)

# Concatenate all word vectors
word_vectors = np.concatenate(word_vectors)

# Save the word vectors to a file
np.savetxt('word_vectors.txt', word_vectors)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [10]:
# Get the word vector for a specific word
word = '宗教'
vocab = tokenizer.get_vocab()
if word in vocab:
    word_index = vocab[word]
    word_vector = word_vectors[word_index]

In [11]:
word_vector

array([[ 0.0773509 , -0.04025023,  0.35480556, ..., -0.28021708,
        -0.7773344 ,  0.16867734],
       [ 0.8146401 , -0.07301649,  0.69244546, ...,  0.41400278,
         0.7778681 , -2.1493318 ],
       [-0.3957155 ,  0.5515925 ,  0.50673115, ..., -0.22774471,
         0.615883  , -2.2146833 ],
       ...,
       [ 0.1858219 , -0.1026479 ,  0.20203154, ..., -0.40363574,
         0.5757687 , -0.46530357],
       [-0.2398001 ,  0.1756868 ,  0.98601365, ..., -0.07775233,
         0.497204  , -1.8423371 ],
       [ 0.07735107, -0.04024997,  0.35480556, ..., -0.28021675,
        -0.77733403,  0.16867483]], dtype=float32)

In [12]:
# get dimension
word_vector.shape[0]

29