In [1]:
import sys
import torch
import torch.nn as nn
import re
import pandas as pd
import time
from collections import Counter, OrderedDict
from torchtext.vocab import vocab
from torch.utils.data import Dataset, DataLoader
from datasets import load_metric

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# **General Settings**

torch.backends.cudnn.deterministic = True
RANDOM_SEED = 123
torch.manual_seed(RANDOM_SEED)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NUM_EPOCHS = 100
BATCH_SIZE = 1000

In [3]:
# 1. load Yelp user review dataset

df = pd.read_csv('./yelp_review_first_130K_with_sentiment.csv')

# **Split Dataset into Train/Validation/Test**
# Train 100K, Valid 10K, Test 20K

train_texts = df.iloc[:100000]['text'].values
train_labels = df.iloc[:100000]['sentiment'].values

valid_texts = df.iloc[100000:110000]['text'].values
valid_labels = df.iloc[100000:110000]['sentiment'].values

test_texts = df.iloc[110000:]['text'].values
test_labels = df.iloc[110000:]['sentiment'].values

In [4]:
# 2. find unique tokens (words)
token_counts = Counter()

def tokenizer(text):
    text = re.sub('<[^>]*>', '', text)
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text.lower())
    text = re.sub('[\W]+', ' ', text.lower()) +        ' '.join(emoticons).replace('-', '')
    tokenized = text.split()
    return tokenized

for line in train_texts:
    tokens = tokenizer(line)
    token_counts.update(tokens)

print('Vocab-size:', len(token_counts))

Vocab-size: 73058


In [5]:
# 3. encoding each unique token into integers

sorted_by_freq_tuples = sorted(token_counts.items(), key=lambda x:x[1], reverse=True)
ordered_dict = OrderedDict(sorted_by_freq_tuples)

vocab = vocab(ordered_dict)
vocab.insert_token('<pad>', 0)
vocab.insert_token('<unk>', 1)
vocab.set_default_index(1)

In [6]:
print([vocab[token] for token in ['this', 'is', 'an', 'example']])

[18, 10, 60, 2203]


In [7]:
# Define Yelp Dataset

class YelpDataset(Dataset):
    def __init__(self, texts, labels):
        """
        'texts' here should be a list/array of texts, labels here should be a list/array of labels.
        """
        self.reviews = texts # sentiments
        self.labels = labels # labels

    def __getitem__(self, idx):
        return self.reviews[idx], self.labels[idx]

    def __len__(self):
        return len(self.labels)

In [8]:
# Step 3.a: define functions for transformation

# to transform each review in the dataset
text_pipeline = lambda x: [vocab[token] for token in tokenizer(x)]

In [9]:
# Step 3.b: wrap the encode and transformation function

def collate_batch(batch):
    label_list, text_list, lengths = [], [], []
    for _text, _label in batch:
        label_list.append(_label)
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        lengths.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.float) # label
    lengths = torch.tensor(lengths)
    padded_text_list = nn.utils.rnn.pad_sequence(text_list, batch_first=True)
    return padded_text_list.to(DEVICE), label_list.to(DEVICE), lengths.to(DEVICE)


In [10]:
# train_dataset, valid_dataset, and test_dataset
train_dataset = YelpDataset(train_texts, train_labels)
valid_dataset = YelpDataset(valid_texts, valid_labels)
test_dataset = YelpDataset(test_texts, test_labels)

In [11]:
# take a small batch
dataloader = DataLoader(train_dataset, batch_size=4, shuffle=True, collate_fn=collate_batch)

# check if text is padded to same lengths
text_batch, label_batch, length_batch = next(iter(dataloader))
print(text_batch)
print(label_batch)
print(length_batch)
print(text_batch.shape)

tensor([[   18,    10,  2969,     5,   583,   327,     3,     2,    29,    10,
            88,    84,     2, 17551,    21,    45,     2,   352,    10,   144,
            42,     2,   368,     4,    92,    23,   177,     8,    26,    60,
           327,    98,    45,    19,    63,   161,     6,    56,   104,   137,
           369,     2,   355,    12,     2,   547,  1996,   528,   149,   165,
            48,    45,     8,    26,    27,     5,   299,  2064,    48,     2,
            29,    10,    63,    35,    21,   373,    27,   198,     5,   299,
           127,    12,    13,  1087,    21,    45,    19,    71,   668,   247,
             3,    47,    25,     6,    56,    98,     8,     2,   695,    31,
            33,   383,     2,   236,   552,     3,     2,    46,    10,    33,
            38,    83,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,  

In [12]:
# prepare the dataloaders for train, valid, and test
batch_size = BATCH_SIZE
train_dl = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_batch)
valid_dl = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)
test_dl = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_batch)

In [13]:
## embedding layers for sentence encoding
# an example
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3, padding_idx=0)
# num_embedding corresponds to the unique integer values that the model will receive as input (n+2, 2 includes 'pad' and 'unk')
# embedding_dim is the size of embedding features

# a batch of 2 samples of 4 indices each
text_encoded_input = torch.LongTensor([[1,2,4,5],[4,3,2,0]])
print(embedding(text_encoded_input))

tensor([[[-0.3792,  0.4689,  0.7525],
         [-0.6422, -0.8128,  0.1794],
         [ 0.4324, -1.4235, -2.1338],
         [ 1.0524, -0.3885, -0.9343]],

        [[ 0.4324, -1.4235, -2.1338],
         [ 1.8951,  0.4954,  0.2692],
         [-0.6422, -0.8128,  0.1794],
         [ 0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)


In [14]:
### Build an RNN model for sentiment analysis task

class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size,fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc1 = nn.Linear(rnn_hidden_size, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        out, (hidden, cell) = self.rnn(out)
        out = hidden[-1,:,:]
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [15]:
vocab_size = len(vocab)
embed_dim = 20
rnn_hidden_size = 64
fc_hidden_size = 64
torch.manual_seed(0)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size)
model = model.to(DEVICE)
model

RNN(
  (embedding): Embedding(73060, 20, padding_idx=0)
  (rnn): LSTM(20, 64, batch_first=True)
  (fc1): Linear(in_features=64, out_features=64, bias=True)
  (relu): ReLU()
  (fc2): Linear(in_features=64, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [16]:
## train function

def train(dataloader):
    model.train()
    total_acc, total_loss = 0, 0
    for text_batch, label_batch, lengths in dataloader:
        optimizer.zero_grad()
        pred = model(text_batch, lengths)[:, 0]
        loss = loss_fn(pred, label_batch)
        loss.backward()
        optimizer.step()
        total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
        total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)
 
def evaluate(dataloader):
    model.eval()
    total_acc, total_loss = 0, 0
    with torch.no_grad():
        for text_batch, label_batch, lengths in dataloader:
            pred = model(text_batch, lengths)[:, 0]
            loss = loss_fn(pred, label_batch)
            total_acc += ((pred>=0.5).float() == label_batch).float().sum().item()
            total_loss += loss.item()*label_batch.size(0)
    return total_acc/len(dataloader.dataset), total_loss/len(dataloader.dataset)

In [17]:
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


start_time = time.time()
# Training
for epoch in range(NUM_EPOCHS):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = train(valid_dl)
    print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
          f'Accuracy: {acc_train:.4f} | '
          f'Val_accuracy: {acc_valid:.4f} | '
          f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    if (epoch+1)%10 == 0: # test every 10 epochs
        # Test
        acc_test, _ = evaluate(test_dl)
        print(f'Test_accuracy: {acc_test:.4f}') 
# Test
acc_test, _ = evaluate(test_dl)
print(f'Final Test_accuracy: {acc_test:.4f}') 

Epoch: 0001/0100 | Accuracy: 0.7858 | Val_accuracy: 0.7899 | Time elapsed: 1.09 min
Epoch: 0002/0100 | Accuracy: 0.7912 | Val_accuracy: 0.8116 | Time elapsed: 2.04 min
Epoch: 0003/0100 | Accuracy: 0.8741 | Val_accuracy: 0.9014 | Time elapsed: 2.98 min
Epoch: 0004/0100 | Accuracy: 0.9195 | Val_accuracy: 0.9279 | Time elapsed: 3.94 min
Epoch: 0005/0100 | Accuracy: 0.9357 | Val_accuracy: 0.9390 | Time elapsed: 4.90 min
Epoch: 0006/0100 | Accuracy: 0.9451 | Val_accuracy: 0.9458 | Time elapsed: 5.87 min
Epoch: 0007/0100 | Accuracy: 0.9510 | Val_accuracy: 0.9530 | Time elapsed: 6.83 min
Epoch: 0008/0100 | Accuracy: 0.9563 | Val_accuracy: 0.9564 | Time elapsed: 7.83 min
Epoch: 0009/0100 | Accuracy: 0.9605 | Val_accuracy: 0.9614 | Time elapsed: 8.80 min
Epoch: 0010/0100 | Accuracy: 0.9631 | Val_accuracy: 0.9628 | Time elapsed: 9.75 min
Test_accuracy: 0.9473
Epoch: 0011/0100 | Accuracy: 0.9661 | Val_accuracy: 0.9665 | Time elapsed: 10.86 min
Epoch: 0012/0100 | Accuracy: 0.9694 | Val_accuracy: 0

In [18]:
# Bidirectional RNN
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, 
                                      embed_dim, 
                                      padding_idx=0) 
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, 
                           batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(rnn_hidden_size*2, fc_hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text, lengths):
        out = self.embedding(text)
        out = nn.utils.rnn.pack_padded_sequence(out, lengths.cpu().numpy(), enforce_sorted=False, batch_first=True)
        _, (hidden, cell) = self.rnn(out)
        out = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [19]:
torch.manual_seed(0)
model = RNN(vocab_size, embed_dim, rnn_hidden_size, fc_hidden_size) 
model = model.to(DEVICE)

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.002)


start_time = time.time()
# Training
for epoch in range(NUM_EPOCHS):
    acc_train, loss_train = train(train_dl)
    acc_valid, loss_valid = train(valid_dl)
    print(f'Epoch: {epoch+1:04d}/{NUM_EPOCHS:04d} | '
          f'Accuracy: {acc_train:.4f} | '
          f'Val_accuracy: {acc_valid:.4f} | '
          f'Time elapsed: {(time.time() - start_time)/60:.2f} min')
    if (epoch+1)%10 == 0: # test every 10 epochs
        # Test
        acc_test, _ = evaluate(test_dl)
        print(f'Test_accuracy: {acc_test:.4f}') 
# Test
acc_test, _ = evaluate(test_dl)
print(f'Final Test_accuracy: {acc_test:.4f}') 

Epoch: 0001/0100 | Accuracy: 0.7963 | Val_accuracy: 0.8614 | Time elapsed: 1.11 min
Epoch: 0002/0100 | Accuracy: 0.9084 | Val_accuracy: 0.9283 | Time elapsed: 2.17 min
Epoch: 0003/0100 | Accuracy: 0.9398 | Val_accuracy: 0.9446 | Time elapsed: 3.36 min
Epoch: 0004/0100 | Accuracy: 0.9525 | Val_accuracy: 0.9583 | Time elapsed: 4.47 min
Epoch: 0005/0100 | Accuracy: 0.9613 | Val_accuracy: 0.9636 | Time elapsed: 5.61 min
Epoch: 0006/0100 | Accuracy: 0.9666 | Val_accuracy: 0.9690 | Time elapsed: 6.69 min
Epoch: 0007/0100 | Accuracy: 0.9712 | Val_accuracy: 0.9710 | Time elapsed: 7.78 min
Epoch: 0008/0100 | Accuracy: 0.9749 | Val_accuracy: 0.9782 | Time elapsed: 8.86 min
Epoch: 0009/0100 | Accuracy: 0.9800 | Val_accuracy: 0.9820 | Time elapsed: 9.96 min
Epoch: 0010/0100 | Accuracy: 0.9823 | Val_accuracy: 0.9820 | Time elapsed: 11.10 min
Test_accuracy: 0.9559
Epoch: 0011/0100 | Accuracy: 0.9858 | Val_accuracy: 0.9868 | Time elapsed: 12.39 min
Epoch: 0012/0100 | Accuracy: 0.9879 | Val_accuracy: 