In [14]:
import random

import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data import Field, LabelField, BucketIterator
from torchtext.datasets import IMDB
from torchtext.vocab import GloVe

# Fix the random seeds so that repeated runs are reproducible.
SEED = 1234
# The legacy torchtext iterators shuffle batches using Python's global `random`
# state, so it has to be seeded in addition to torch.
random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Define the fields: reviews are spaCy-tokenized and lowercased, labels are floats
# (float targets are what BCEWithLogitsLoss expects).
TEXT = Field(tokenize='spacy', lower=True, tokenizer_language='en_core_web_sm')
LABEL = LabelField(dtype=torch.float)


# Download (on first use) and load the IMDb dataset
train_data, test_data = IMDB.splits(TEXT, LABEL)

# Build the vocabularies from the training split only and attach the pretrained
# 100-dimensional GloVe vectors to the text vocabulary
TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=100))
LABEL.build_vocab(train_data)

# Select the device: GPU when available, otherwise CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')





In [15]:
# LSTM sentiment classifier
class LSTM(nn.Module):
    """Embedding -> dropout -> single-layer LSTM -> linear head.

    Args:
        input_dim: Number of rows in the embedding table (vocabulary size).
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the LSTM hidden state.
        output_dim: Number of outputs produced by the linear head.
        dropout: Dropout probability applied to the embedded tokens.
        pad_idx: Optional index of the padding token. When given, that
            embedding row is initialised to zeros and receives no gradient.
            ``None`` (default) keeps the original behaviour.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, dropout,
                 pad_idx=None):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, embedding_dim, padding_idx=pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_lengths=None):
        """Compute one output row per sequence in the batch.

        Args:
            text: LongTensor of token indices laid out as [seq_len, batch]
                (the LSTM is created with the default ``batch_first=False``).
            text_lengths: Optional true (unpadded) length of every sequence in
                the batch. When given, the batch is packed so the LSTM stops at
                each sequence's last real token instead of running over the
                padding. When omitted, the full padded tensor is processed,
                exactly as before.

        Returns:
            Tensor of shape [batch, output_dim] (raw, un-activated outputs).
        """
        embedded = self.dropout(self.embedding(text))
        if text_lengths is None:
            _, (hidden, _) = self.rnn(embedded)
        else:
            # pack_padded_sequence wants the lengths on the CPU; enforce_sorted=False
            # lets callers pass batches in any order (results come back unsorted).
            lengths = torch.as_tensor(text_lengths).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                embedded, lengths, enforce_sorted=False
            )
            _, (hidden, _) = self.rnn(packed)
        # hidden is [num_layers, batch, hidden_dim]; take the last layer's state.
        return self.fc(hidden[-1])

In [17]:
# Model hyperparameters
INPUT_DIM = len(TEXT.vocab)  # one embedding row per vocabulary entry
EMBEDDING_DIM = 100  # has to match the width of the pretrained vectors copied below
HIDDEN_DIM = 256
OUTPUT_DIM = 1  # a single logit per review, fed to BCEWithLogitsLoss
DROPOUT = 0.5

# Build the model and move it to the selected device
model = LSTM(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
).to(device)

# Overwrite the randomly initialised embedding table with the pretrained vectors
# stored on the vocabulary
pretrained_embeddings = TEXT.vocab.vectors
with torch.no_grad():
    model.embedding.weight.copy_(pretrained_embeddings)

# Loss function and optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters())

# Batch iterators over the training and test splits
BATCH_SIZE = 64
train_iterator, test_iterator = BucketIterator.splits(
    (train_data, test_data), batch_size=BATCH_SIZE, device=device
)

# Train the model for one epoch
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Module called as ``model(batch.text)``; its output is squeezed
            along dimension 1 before being passed to ``criterion``.
        iterator: Sized iterable of batches exposing ``.text`` and ``.label``.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable ``criterion(predictions, labels)`` returning a
            scalar loss tensor.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    batch_losses = []
    for batch in iterator:
        # Gradients accumulate by default, so clear them before every batch.
        optimizer.zero_grad()
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label)
        batch_loss.backward()
        optimizer.step()
        batch_losses.append(batch_loss.item())
    return sum(batch_losses) / len(iterator)

# Evaluate the model
def evaluate(model, iterator, criterion):
    """Return the mean batch loss of ``model`` over ``iterator`` without training.

    The model is switched to eval mode and every forward pass runs under
    ``torch.no_grad()``, so no gradients are recorded and no weights change.

    Args:
        model: Module called as ``model(batch.text)``; its output is squeezed
            along dimension 1 before being passed to ``criterion``.
        iterator: Sized iterable of batches exposing ``.text`` and ``.label``.
        criterion: Callable ``criterion(predictions, labels)`` returning a
            scalar loss tensor.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    with torch.no_grad():
        total_loss = sum(
            criterion(model(batch.text).squeeze(1), batch.label).item()
            for batch in iterator
        )
    return total_loss / len(iterator)

# Train the model and report the loss on both splits after every epoch
N_EPOCHS = 30
for epoch in range(N_EPOCHS):
    # One full pass over the training batches; this is where the weights are updated
    train_loss = train(model, train_iterator, optimizer, criterion)
    # Loss on the test split after this epoch; evaluate() does not update the model
    test_loss = evaluate(model, test_iterator, criterion)
    print(f'Epoch: {epoch+1}')
    print(f'\tTrain Loss: {train_loss:.3f} | Test Loss: {test_loss:.3f}')

# Save the model: only the parameters (state_dict) as they are after the final epoch
torch.save(model.state_dict(), 'lstm_model_imdb.pth')
print("Model saved.")


Epoch: 1
	Train Loss: 0.693 | Test Loss: 0.689
Epoch: 2
	Train Loss: 0.694 | Test Loss: 0.694
Epoch: 3
	Train Loss: 0.693 | Test Loss: 0.691
Epoch: 4
	Train Loss: 0.693 | Test Loss: 0.688
Epoch: 5
	Train Loss: 0.692 | Test Loss: 0.663
Epoch: 6
	Train Loss: 0.692 | Test Loss: 0.661
Epoch: 7
	Train Loss: 0.691 | Test Loss: 0.657
Epoch: 8
	Train Loss: 0.690 | Test Loss: 0.624
Epoch: 9
	Train Loss: 0.663 | Test Loss: 0.605
Epoch: 10
	Train Loss: 0.688 | Test Loss: 0.607
Epoch: 11
	Train Loss: 0.687 | Test Loss: 0.585
Epoch: 12
	Train Loss: 0.648 | Test Loss: 0.565
Epoch: 13
	Train Loss: 0.582 | Test Loss: 0.692
Epoch: 14
	Train Loss: 0.564 | Test Loss: 0.608
Epoch: 15
	Train Loss: 0.556 | Test Loss: 0.558
Epoch: 16
	Train Loss: 0.563 | Test Loss: 0.513
Epoch: 17
	Train Loss: 0.540 | Test Loss: 0.519
Epoch: 18
	Train Loss: 0.558 | Test Loss: 0.547
Epoch: 19
	Train Loss: 0.539 | Test Loss: 0.514
Epoch: 20
	Train Loss: 0.530 | Test Loss: 0.534
Epoch: 21
	Train Loss: 0.536 | Test Loss: 0.528
E

In [19]:
# Load the trained model.
# The vocabulary built in the setup cell is reused as-is: INPUT_DIM was derived
# from it, and rebuilding it here would only reload the GloVe vectors.
loaded_model = LSTM(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, DROPOUT).to(device)
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written on a GPU machine can still be loaded on a CPU-only one.
loaded_model.load_state_dict(torch.load('lstm_model_imdb.pth', map_location=device))
# Inference mode: disables dropout
loaded_model.eval()

# Predict the sentiment of a user-supplied review
def predict_sentiment(model, sentence):
    """Score a raw review string with ``model``.

    Args:
        model: Trained classifier taking a [seq_len, 1] LongTensor of token
            indices and returning a single logit.
        sentence: Review text.

    Returns:
        The sigmoid of the model's logit as a Python float in [0, 1]; the
        notebook treats values >= 0.5 as a positive review.

    Raises:
        ValueError: If ``sentence`` yields no tokens (e.g. an empty string),
            since the LSTM cannot process a zero-length sequence.
    """
    model.eval()
    # Field.preprocess applies the same pipeline as the training data
    # (tokenization plus the field's lowercasing); TEXT.tokenize alone skips the
    # lowercasing, which turns capitalised words into <unk>.
    tokens = TEXT.preprocess(sentence)
    if not tokens:
        raise ValueError("sentence must contain at least one token")
    indexed = [TEXT.vocab.stoi[t] for t in tokens]
    # Add a batch dimension of size 1: [seq_len] -> [seq_len, 1]
    tensor = torch.LongTensor(indexed).to(device).unsqueeze(1)
    # No gradients are needed for inference
    with torch.no_grad():
        prediction = torch.sigmoid(model(tensor))
    return prediction.item()

# Ask the user for a review
user_review = input("请输入一个影评：")

# Score the review with the reloaded model
prediction = predict_sentiment(loaded_model, user_review)

# Report the predicted label: scores of 0.5 and above count as positive
print("Positive" if prediction >= 0.5 else "Negative")



Negative
