In [1]:
# 首先，我们需要对数据进行预处理，包括分词、构建词汇表和生成训练数据。

import torch
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import numpy as np

# 分词
def tokenize(text):
    return text.lower().split()

# 构建词汇表
def build_vocab(tokenized_text):
    word_counts = Counter(tokenized_text)
    vocab = {word: idx for idx, (word, _) in enumerate(word_counts.most_common())}
    return vocab

In [2]:
# 生成训练数据
class TextDataset(Dataset):
    def __init__(self, text, vocab):
        self.text = text
        self.vocab = vocab

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        return self.text[idx], self.vocab[self.text[idx]]

text = "I have a cat. She likes to play with her toys. My cat is very cute."
tokenized_text = tokenize(text)
vocab = build_vocab(tokenized_text)
dataset = TextDataset(tokenized_text, vocab)
dataloader = DataLoader(dataset, batch_size=2, shuffle=True)

    

In [7]:
for batch in dataloader:
    X, inputs = batch
    print(X, inputs)

('play', 'very') tensor([ 7, 14])
('cat', 'likes') tensor([12,  5])
('cute.', 'a') tensor([15,  2])
('cat.', 'her') tensor([3, 9])
('i', 'have') tensor([0, 1])
('is', 'my') tensor([13, 11])
('toys.', 'to') tensor([10,  6])
('she', 'with') tensor([4, 8])


In [2]:
# 3. 搭建ELMo模型

# 接下来，我们将使用PyTorch搭建ELMo模型。模型包括一个词嵌入层、一个双向LSTM层和一个线性输出层。

import torch.nn as nn

class ELMo(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers):
        super(ELMo, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers, bidirectional=True)
        self.linear = nn.Linear(hidden_dim * 2, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.linear(x)
        return x

vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 128
num_layers = 2
model = ELMo(vocab_size, embedding_dim, hidden_dim, num_layers)

   

# 4. 训练模型

# 现在我们可以开始训练模型。我们将使用交叉熵损失函数和Adam优化器。

import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 20
for epoch in range(num_epochs):
    for batch in dataloader:
        _, inputs = batch
        inputs = torch.tensor(inputs).long()  # 将输入数据转换为张量
        targets = torch.tensor(inputs).long()  # 将目标数据转换为张量
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {loss.item()}")


# 5. 预测

# 训练完成后，我们可以使用模型进行预测。这里我们将预测一个简单的句子：“My cat likes to play.”

def predict(model, sentence, vocab):
    tokenized_sentence = tokenize(sentence)
    input_ids = [vocab[word] for word in tokenized_sentence]
    inputs = torch.tensor(input_ids).unsqueeze(1)
    outputs = model(inputs)
    predictions = torch.argmax(outputs, dim=-1)
    pred = [tokenized_text[x] for x in list(predictions.numpy().reshape(-1))]

    return [word for word, _ in vocab.items() if word in pred ]

sentence = "My cat likes to play"
predictions = predict(model, sentence, vocab)
print("Predictions:", predictions)


# 6. 总结

# 这篇文章主要介绍了如何使用PyTorch搭建ELMo模型，包括模型的原理、数据准备、模型搭建、训练和预测。我们提供了完整的代码实现，确保代码可运行且无错误。希望本文能帮助您理解ELMo模型并在自己的项目中应用，更多模型的运用技巧请持续关注。

  inputs = torch.tensor(inputs).long()  # 将输入数据转换为张量
  targets = torch.tensor(inputs).long()  # 将目标数据转换为张量


Epoch 1/20, Loss: 2.791206121444702
Epoch 2/20, Loss: 2.774533748626709
Epoch 3/20, Loss: 2.6169509887695312
Epoch 4/20, Loss: 2.4948501586914062
Epoch 5/20, Loss: 2.4127554893493652
Epoch 6/20, Loss: 1.857100248336792
Epoch 7/20, Loss: 1.6170330047607422
Epoch 8/20, Loss: 1.3536661863327026
Epoch 9/20, Loss: 0.8292950391769409
Epoch 10/20, Loss: 0.2775568962097168
Epoch 11/20, Loss: 0.14102372527122498
Epoch 12/20, Loss: 0.08528187870979309
Epoch 13/20, Loss: 0.09879791736602783
Epoch 14/20, Loss: 0.021245460957288742
Epoch 15/20, Loss: 0.036651767790317535
Epoch 16/20, Loss: 0.02133701741695404
Epoch 17/20, Loss: 0.04726295918226242
Epoch 18/20, Loss: 0.021418007090687752
Epoch 19/20, Loss: 0.01557416282594204
Epoch 20/20, Loss: 0.02507207915186882
Predictions: ['likes', 'to', 'play', 'my', 'cat']
