<a href="https://colab.research.google.com/github/ruheyun/python_pytorch/blob/main/lstm_acllmdb_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# The login call comes first so that the dataset downloads in the next cell
# can reach private data sources (presumably it prompts for Kaggle credentials).
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the two datasets this notebook needs: the IMDB review corpus and the
# GloVe 6B 100-d vectors. Each call's return value (presumably the local
# directory of the downloaded dataset) is kept in a module-level variable.
ruheyun_lstm_acllmdb_path = kagglehub.dataset_download('ruheyun/lstm-acllmdb')
ruheyun_glove_6b_100d_path = kagglehub.dataset_download('ruheyun/glove-6b-100d')

print('Data source import complete.')


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import os
import re
import string
import time
import snowballstemmer
from nltk.corpus import stopwords
import nltk
from itertools import chain
from sklearn.metrics import accuracy_score
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
def readIMDB(path, seg='train'):
    """Load one split of an IMDB-style review corpus from disk.

    The expected layout is ``<path>/<seg>/pos/*`` and ``<path>/<seg>/neg/*``,
    one review per file, UTF-8 encoded.

    Args:
        path: Root directory of the corpus.
        seg: Name of the split sub-directory (``'train'`` by default).

    Returns:
        A list of ``[review_text, label]`` pairs, all positive reviews
        (label 1) first, then all negative reviews (label 0). The raw file
        content is returned unchanged (no HTML or newline stripping).

    Raises:
        OSError: If a label directory or a review file cannot be read.
    """
    label_ids = (('pos', 1), ('neg', 0))
    data = []
    for label, label_id in label_ids:
        label_dir = os.path.join(path, seg, label)
        # os.listdir returns entries in arbitrary, filesystem-dependent order;
        # sorting makes the sample order (and e.g. data[0]) reproducible.
        for file in sorted(os.listdir(label_dir)):
            with open(os.path.join(label_dir, file), 'r', encoding='utf8') as rf:
                data.append([rf.read(), label_id])
    return data

In [None]:
# !pwd

In [None]:
# %cd /kaggle/input/lstm-acllmdb

In [None]:
# !ls

In [None]:
# %cd /kaggle/working

In [None]:
# Build the corpus root from the directory returned by kagglehub instead of a
# hard-coded /kaggle/input/... path, so the notebook also runs where the
# dataset is downloaded elsewhere (e.g. Colab).
# Assumes the dataset keeps its top-level 'aclImdb' folder -- TODO confirm.
root = os.path.join(ruheyun_lstm_acllmdb_path, 'aclImdb')
train_data = readIMDB(root)
test_data = readIMDB(root, 'test')

In [None]:
# Peek at the first training sample: a [review_text, label] pair from readIMDB.
train_data[0]

["This was one of those wonderful rare moments in T.V. that I wished I'd captured forever on VHS. Won't it ever air again? <br /><br />It was so creative and I remember it was aired once a week and the wait for the next episode was excruciating. I want to see it all again. I want to buy it. I want what I can't have. Not even on EBAY. <br /><br />So, having ranted enough it was, by far, one of the best series the 80's put out. It should be considered a classic but is lost in space. At least this website and Wikipedia mention it. Sob.<br /><br />It was utterly appealing, funny, flirtatious, and original. Maybe not like Sherlock Holmes original, I actually think Quintin is far more attractive and has a better chance with his leading lady than the stiff and chalky Holmes ever could.",
 1]

In [None]:
# Cache for the stop-word set so it is read from NLTK only once instead of
# once per review.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE['english']


def _expand_thousands(match):
    """Rewrite a matched '<number>k' shorthand as the full integer, e.g. 2.5k -> 2500."""
    return str(int(round(float(match.group(1)) * 1000)))


def clean_text(text):
    """Normalise a raw review and split it into a list of tokens.

    Steps: lower-case, blank out unsupported characters, expand ``<n>k``
    shorthands and common English contractions, space out or drop
    punctuation, then remove stop words and tokens shorter than 3 characters.

    Args:
        text: Raw review string.

    Returns:
        A list of cleaned, unstemmed tokens (possibly empty).
    """
    # Convert words to lower case
    text = text.lower()

    # Replace every character that is not handled below with a space.
    # The hyphen is escaped: an unescaped "+-=" is a character *range* that
    # also lets ; and < through.
    text = re.sub(r"[^a-z0-9^,!.\/'+\-=:]", " ", text)

    # Expand "5k" / "2.5k" to "5000" / "2500". Done before "." is stripped so
    # decimals are still intact; the trailing \b stops it from rewriting a
    # number followed by a word that merely starts with "k".
    text = re.sub(r"\b(\d+(?:\.\d+)?)\s*k\b", _expand_thousands, text)

    # Expand contractions.
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)

    # Separate or blank out the remaining punctuation.
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r":", " : ", text)

    # Re-join abbreviations that the punctuation handling split apart.
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    # "1980s" -> "1980". (The previous pattern r"\0s" is a NUL escape and
    # could never match.)
    text = re.sub(r"0s\b", "0", text)
    # Keep the surrounding spaces so neighbouring words are not glued on.
    text = re.sub(r" 9 11 ", " 911 ", text)
    # \b keeps these from matching inside longer words ("the - mail", "raj k...").
    text = re.sub(r"\be - mail\b", "email", text)
    text = re.sub(r"\bj k\b", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    # Remove punctuation: the translation table maps every punctuation
    # character to None, i.e. deletes it.
    translator = str.maketrans('', '', string.punctuation)
    text = text.translate(translator)

    # Remove stop words and very short tokens.
    stops = _english_stopwords()
    tokens = [w for w in text.split() if w not in stops and len(w) >= 3]

    # The previous version also computed Snowball stems here but returned the
    # unstemmed tokens; that unused pass is dropped, the return value is the same.
    return tokens

In [None]:
# def tokenizer(text):
#     return [tok.lower() for tok in text.split()]

In [None]:
# Tokenise every review with clean_text; the labels in the (review, label)
# pairs are not needed at this stage.
train_tokenized = [clean_text(text) for text, _ in train_data]
test_tokenized = [clean_text(text) for text, _ in test_data]

In [None]:
# Sanity check: show the tokens of one cleaned training review and their count.
print(train_tokenized[20])
print(len(train_tokenized[20]))

['second', 'two', 'filmed', 'hamlets', 'nineties', 'first', 'franco', 'zeffirelli', 'starring', 'mel', 'gibson', '1990', 'zeffirelli', 'version', 'like', 'laurence', 'olivier', '1948', 'based', 'upon', 'abridged', 'version', 'play', 'much', 'shakespeare', 'original', 'text', 'cut', 'never', 'seen', 'tony', 'richardson', '1969', 'version', 'ran', 'less', 'two', 'hours', 'shorter', 'even', 'zeffirelli', 'presume', 'also', 'abridged', 'kenneth', 'branagh', 'attempting', 'something', 'much', 'ambitious', 'film', 'based', 'complete', 'text', 'play', 'running', 'time', 'around', 'four', 'hours', 'henry', 'branagh', 'claimed', 'olivier', 'crown', 'cinema', 'leading', 'shakespearean', 'confirming', 'claim', 'brilliant', 'much', 'ado', 'nothing', 'rare', 'example', 'great', 'film', 'based', 'shakespeare', 'comedy', 'hamlet', 'third', 'shakespeare', 'film', 'director', 'also', 'acted', 'iago', 'oliver', 'parker', '1995', 'othello', 'one', 'might', 'expect', 'different', 'much', 'ado', 'earlier',

In [None]:
# Vocabulary = every distinct token that occurs in the training reviews.
vocab = set(chain.from_iterable(train_tokenized))
vocab_size = len(vocab)

In [None]:
# Number of distinct training tokens (excludes the extra '<unk>' slot added later).
print(vocab_size)

74916


In [None]:
# !mkdir /kaggle/working/glove-6b-100d

In [None]:
# !rm /kaggle/working/glove-6b-100d/wv.6B.100d.txt

In [None]:
# Input file: the GloVe 6B 100-d text vectors inside the dataset downloaded by
# kagglehub. gensim's `datapath` helper resolves names inside gensim's own
# bundled test data, so a plain os.path.join is used for a real file path.
# Assumes the file sits at the top level of the dataset -- TODO confirm.
glove_file = os.path.join(ruheyun_glove_6b_100d_path, 'glove.6B.100d.txt')

# GloVe text files lack the "<count> <dim>" header line of the word2vec text
# format. no_header=True lets gensim read them directly, so the separate
# glove2word2vec conversion step is not needed.
wvmodel = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

In [None]:
# print(wvmodel['computer'])  # print the word vector for 'computer'
# print(wvmodel.most_similar('computer'))  # find the most similar words

In [None]:
# Token <-> index lookup tables. Index 0 is reserved for '<unk>' (it is also
# the default PAD value of pad_samples); real words get 1..vocab_size.
# The vocabulary is sorted first: iterating a set of strings yields a different
# order in each interpreter session, which would make the indices (and any
# saved embedding matrix or model) irreproducible between runs.
word_to_idx = {word: i + 1 for i, word in enumerate(sorted(vocab))}
word_to_idx['<unk>'] = 0
# Derive the inverse from the forward table so the two can never disagree.
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

In [None]:
def encode_samples(tokenized_samples, vocab, token_to_idx=None):
    """Map each token of each sample to its integer index.

    Args:
        tokenized_samples: Iterable of token lists.
        vocab: Unused; kept so existing calls keep working.
        token_to_idx: Optional token -> index mapping. When omitted, the
            module-level ``word_to_idx`` table is used, as before.

    Returns:
        A list with one list of indices per sample. Tokens missing from the
        mapping are encoded as 0.
    """
    if token_to_idx is None:
        # Fall back to the notebook-level lookup table.
        token_to_idx = word_to_idx
    return [[token_to_idx.get(token, 0) for token in sample]
            for sample in tokenized_samples]

In [None]:
def pad_samples(features, maxlen=500, PAD=0):
    """Truncate or right-pad every sample to exactly ``maxlen`` items.

    Args:
        features: Iterable of index sequences.
        maxlen: Target length of every returned sample.
        PAD: Value appended to samples shorter than ``maxlen``.

    Returns:
        A new list of new lists, each of length ``maxlen``. The input
        sequences are left untouched (the previous implementation appended
        the padding to the caller's own lists).
    """
    padded_features = []
    for feature in features:
        clipped = list(feature[:maxlen])
        # A non-positive multiplier yields an empty list, so samples that are
        # already long enough get no padding.
        clipped.extend([PAD] * (maxlen - len(clipped)))
        padded_features.append(clipped)
    return padded_features

In [None]:
# Encode tokens as indices, pad/truncate each review to pad_samples' default
# length (500) and wrap the result in tensors. Labels come straight from the
# [review, label] pairs produced by readIMDB.
train_features = torch.tensor(pad_samples(encode_samples(train_tokenized, vocab)))
train_labels = torch.tensor([score for _, score in train_data])
test_features = torch.tensor(pad_samples(encode_samples(test_tokenized, vocab)))
test_labels = torch.tensor([score for _, score in test_data])

In [None]:
class SentimentNet(nn.Module):
    """LSTM text classifier on top of frozen pretrained embeddings.

    Args:
        vocab_size: Accepted for interface compatibility; the embedding table
            size is taken from ``weight``.
        embed_size: Width of one embedding vector (LSTM input size).
        num_hiddens: LSTM hidden size per direction.
        num_layers: Number of stacked LSTM layers.
        bidirectional: Whether the LSTM runs in both directions.
        weight: Pretrained embedding matrix; loaded frozen.
        labels: Number of output classes.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, vocab_size, embed_size, num_hiddens, num_layers,
                 bidirectional, weight, labels, **kwargs):
        super().__init__(**kwargs)
        self.num_hiddens = num_hiddens
        self.num_layers = num_layers
        self.bidirectional = bidirectional

        # Embeddings are not trained (freeze=True).
        self.embedding = nn.Embedding.from_pretrained(weight, freeze=True)
        self.encoder = nn.LSTM(
            input_size=embed_size,
            hidden_size=num_hiddens,
            num_layers=num_layers,
            bidirectional=bidirectional,
            dropout=0,
        )

        # forward() concatenates the first and last time step, and each step
        # carries num_hiddens features per direction.
        num_directions = 2 if bidirectional else 1
        self.decoder = nn.Linear(2 * num_directions * num_hiddens, labels)

    def forward(self, inputs):
        """Return class scores for a batch of index sequences.

        ``inputs`` is expected batch-first; it is swapped to sequence-first
        because the LSTM is built without ``batch_first``.
        """
        embedded = self.embedding(inputs)
        states, _ = self.encoder(embedded.permute(1, 0, 2))
        first_step, last_step = states[0], states[-1]
        return self.decoder(torch.cat((first_step, last_step), dim=1))

In [None]:
# Hyper-parameters for the model and the training loop.
num_epochs = 5  # NOTE: reassigned to 20 in a later cell, before training starts
embed_size = 100  # width of one embedding vector (matches the 100-d GloVe file)
num_hiddens = 100  # LSTM hidden size per direction
num_layers = 2  # stacked LSTM layers
bidirectional = True
batch_size = 64
labels = 2  # output classes: 0 = negative, 1 = positive
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [None]:
# Embedding matrix with one row per index of word_to_idx (vocab_size words
# plus the '<unk>' slot at index 0). Rows start as zeros.
weight = torch.zeros(vocab_size+1, embed_size)

In [None]:
# for i in range(len(wvmodel.index2word)):
#     try:
#         index = word_to_idx[wvmodel.index2word[i]]
#     except:
#         continue
#     weight[index, :] = torch.from_numpy(wvmodel.get_vector(
#         idx_to_word[word_to_idx[wvmodel.index2word[i]]]))

In [None]:
# Copy the pretrained vector of every GloVe word that is also in our
# vocabulary into its row of the embedding matrix. Rows of words without a
# pretrained vector keep their initial value.
for token in wvmodel.index_to_key:
    row = word_to_idx.get(token)
    if row is not None:
        # .copy() hands torch its own array instead of gensim's internal one.
        weight[row] = torch.from_numpy(wvmodel.get_vector(token).copy())

In [None]:
# Build the classifier on the selected device. vocab_size+1 accounts for the
# extra '<unk>' row at index 0 of the embedding matrix.
net = SentimentNet(vocab_size=(vocab_size+1), embed_size=embed_size,
                   num_hiddens=num_hiddens, num_layers=num_layers,
                   bidirectional=bidirectional, weight=weight,
                   labels=labels).to(device)
# net.to(device)
# Cross-entropy over the two class scores; Adam with its default settings.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters())

In [None]:
# Pair the feature and label tensors into datasets.
train_set = torch.utils.data.TensorDataset(train_features, train_labels)
test_set = torch.utils.data.TensorDataset(test_features, test_labels)

# Mini-batch iterators: training batches are reshuffled every epoch, test
# batches keep their order.
train_iter = torch.utils.data.DataLoader(train_set, batch_size=batch_size,
                                         shuffle=True)
test_iter = torch.utils.data.DataLoader(test_set, batch_size=batch_size,
                                        shuffle=False)

In [None]:
# Overrides the num_epochs value set in the hyper-parameter cell above.
num_epochs = 20

In [None]:
# Train for num_epochs epochs; after each epoch evaluate on the test set and
# print the mean per-batch loss / accuracy for both splits plus the wall time.
for epoch in range(num_epochs):
    start = time.time()
    # Running sums of per-batch values, kept as plain Python floats.
    train_loss, test_losses = 0.0, 0.0
    train_acc, test_acc = 0.0, 0.0
    n, m = 0, 0  # number of train / test batches seen

    net.train()
    for feature, label in train_iter:
        n += 1
        feature, label = feature.to(device), label.to(device)
        optimizer.zero_grad()
        score = net(feature)
        loss = loss_function(score, label)
        loss.backward()
        optimizer.step()
        train_acc += accuracy_score(label.cpu(),
                                    torch.argmax(score.detach(), dim=1).cpu())
        # .item() extracts the number only. Summing the loss tensor itself
        # would keep every batch's autograd graph alive for the whole epoch.
        train_loss += loss.item()

    net.eval()
    with torch.no_grad():
        for test_feature, test_label in test_iter:
            m += 1
            test_feature, test_label = test_feature.to(device), test_label.to(device)
            test_score = net(test_feature)
            test_loss = loss_function(test_score, test_label)
            test_acc += accuracy_score(test_label.cpu(),
                                       torch.argmax(test_score, dim=1).cpu())
            test_losses += test_loss.item()

    end = time.time()
    runtime = end - start
    # max(..., 1) avoids a division by zero if a loader yields no batches.
    print('epoch: %d, train loss: %.4f, train acc: %.2f, test loss: %.4f, test acc: %.2f, time: %.2f' %
          (epoch, train_loss / max(n, 1), train_acc / max(n, 1),
           test_losses / max(m, 1), test_acc / max(m, 1), runtime))

epoch: 0, train loss: 0.6131, train acc: 0.66, test loss: 0.6108, test acc: 0.68, time: 29.75
epoch: 1, train loss: 0.4919, train acc: 0.77, test loss: 0.3949, test acc: 0.83, time: 29.55
epoch: 2, train loss: 0.3709, train acc: 0.84, test loss: 0.3322, test acc: 0.86, time: 29.73
epoch: 3, train loss: 0.3258, train acc: 0.86, test loss: 0.3067, test acc: 0.87, time: 29.86
epoch: 4, train loss: 0.3025, train acc: 0.88, test loss: 0.2991, test acc: 0.87, time: 30.07
epoch: 5, train loss: 0.2824, train acc: 0.88, test loss: 0.3031, test acc: 0.87, time: 29.82
epoch: 6, train loss: 0.2605, train acc: 0.89, test loss: 0.2951, test acc: 0.88, time: 29.69
epoch: 7, train loss: 0.2345, train acc: 0.91, test loss: 0.3006, test acc: 0.88, time: 29.69
epoch: 8, train loss: 0.2128, train acc: 0.92, test loss: 0.3306, test acc: 0.87, time: 29.69
epoch: 9, train loss: 0.1871, train acc: 0.93, test loss: 0.3285, test acc: 0.87, time: 29.94
epoch: 10, train loss: 0.1571, train acc: 0.94, test loss: 0