In [3]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.0 MB/s[0m eta [36m0:00:

In [4]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model
import torch

In [5]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be reached through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
cd /content/drive/MyDrive/text_classification/word_vectors

/content/drive/MyDrive/text_classification/word_vectors


In [10]:
# Embed every review in data.xlsx with GPT-2 and save the last hidden states as
# one tensor of shape (num_reviews, max_length, hidden_size) in gpt2_vectors.pt.

# Number of reviews sent through the model per forward pass.
BATCH_SIZE = 8

# Read excel file; it must contain a 'review' column.
df = pd.read_excel('./data.xlsx')

# Initialize GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2Model.from_pretrained('gpt2')
model.eval()  # inference only: keep dropout off so the vectors are deterministic

# Tokenize each review exactly once. Truncate to the model's position limit so
# an overly long review cannot index past the position-embedding table.
token_ids = [
    tokenizer(text, truncation=True, max_length=model.config.n_positions)['input_ids']
    for text in df['review']
]
if not token_ids:
    raise ValueError("data.xlsx has no rows in the 'review' column")

# Every row is right-padded to the longest tokenized review.
max_length = max(len(ids) for ids in token_ids)

# Build the padded id matrix together with an attention mask (1 = real token,
# 0 = padding). The EOS id is only a filler value: masked positions are ignored
# by attention and their outputs are zeroed below.
input_ids = torch.full(
    (len(token_ids), max_length), tokenizer.eos_token_id, dtype=torch.long
)
attention_mask = torch.zeros((len(token_ids), max_length), dtype=torch.long)
for row, ids in enumerate(token_ids):
    input_ids[row, :len(ids)] = torch.tensor(ids, dtype=torch.long)
    attention_mask[row, :len(ids)] = 1

# Create a list to store the hidden states of each batch
all_vectors = []

# no_grad: without it every stored output keeps its autograd graph alive, which
# makes memory grow with each review and leaves the saved tensor tied to a graph.
with torch.no_grad():
    for start in range(0, len(token_ids), BATCH_SIZE):
        batch_ids = input_ids[start:start + BATCH_SIZE]
        batch_mask = attention_mask[start:start + BATCH_SIZE]

        outputs = model(input_ids=batch_ids, attention_mask=batch_mask)
        last_hidden_states = outputs.last_hidden_state

        # Zero the vectors at padded positions so they carry no spurious signal.
        keep = batch_mask.unsqueeze(-1).to(last_hidden_states.dtype)
        all_vectors.append(last_hidden_states * keep)

# Concatenate the batches along dimension 0 (one row per review, in df order)
all_vectors_tensor = torch.cat(all_vectors, dim=0)

# Save to file
torch.save(all_vectors_tensor, 'gpt2_vectors.pt')

In [11]:
# Look up a contextual vector for a single word using the tokenizer and model
# loaded earlier in the notebook.
word = "宗教"

# encode() with return_tensors='pt' yields the token ids as a batch of one.
encoded_input = tokenizer.encode(word, return_tensors='pt')

# Inference only, so run the forward pass without building an autograd graph.
with torch.no_grad():
    outputs = model(encoded_input)
last_hidden_state = outputs.last_hidden_state

# Batch item 0, token position 0.
# NOTE(review): if the tokenizer splits this word into several tokens, only the
# first one is represented here - confirm that is the intended "word vector".
word_vector = last_hidden_state[0][0]

In [12]:
# Print the vector computed in the previous cell to inspect its values.
print(word_vector)

tensor([-1.9958e-01, -6.1397e-02, -3.9569e-01,  8.8818e-03,  5.1586e-02,
        -2.4651e-01,  3.8516e+00, -6.1524e-02, -2.2814e-01,  5.6127e-02,
         4.2728e-01,  1.3186e-01,  1.0884e-01,  7.4197e-02, -2.9171e-01,
         2.3068e-02,  3.1887e-03,  4.3155e-02, -6.3804e-02, -5.3721e-01,
         1.1262e-01, -9.4910e-02, -2.1613e-01,  6.3404e-02,  5.5840e-02,
         1.4192e-01, -1.6311e-01, -2.4839e-01, -8.7441e-02, -3.7314e-01,
         2.9608e-02,  9.4477e-04, -8.7810e-02, -4.5631e-01, -8.8451e-02,
         3.4351e-01,  2.1863e+01,  7.8878e-02,  8.2113e-02, -6.6090e-02,
         1.5352e-02, -8.6923e-02,  5.9910e-02, -3.3587e-01, -5.5899e-02,
         2.0056e-01, -1.7704e-01,  2.3824e-02,  6.1148e-02, -2.5813e-01,
        -2.5291e-02,  7.7428e-02,  1.8166e-02, -7.5684e-02,  1.0221e-01,
         1.9594e-01,  1.1243e-01, -1.2608e-01, -1.4457e-01, -6.6094e-02,
        -6.1421e-02, -7.1418e-02, -8.3673e-02, -7.1453e-02, -1.0403e+00,
        -1.4443e-01, -6.0125e-02, -3.6301e-02,  1.4

In [14]:
# Show the hidden size recorded in the loaded model's configuration.
model.config.hidden_size

768