Remarks 

* BERT tokenizer had 768 word vectors, which can be a bit much. But the APIs for the tokenizer is very easy to use.
    *  Embeddings are also a bit sensitive to puncutations and stopwords. 
    * You can try tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") and print(tokenizer.tokenize("This is an example of the bert tokenizer"))
    * You should see that it tries to parse all elements in the sentence. Depends
* We can use Word2Vec but it may take a while to download the pretrained weights. The implementations are a bit harder 
    * Also word2vec is sensitive to unseen words. it will throw an exception when you try to vectorize unseen words. Therefore, we need to have a function to process them.
    * For example, in the paper I believe it used "UNK" or simply "pass" / torch.zeros()

In [43]:
import pickle 
import sys
import pandas as pd 
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import torch 
import torch.nn as nn

from sklearn.model_selection import train_test_split

# loading file 
with open('../components/review_embedding.pkl', 'rb') as handle:
    x = pickle.load(handle)

In [57]:
df = pd.DataFrame(x)
df = pd.concat([df.loc[df.overall>=4].sample(frac=0.03,random_state=22),df.loc[df.overall<4].sample(frac=0.1,random_state=22)]).reset_index()

x_train, x_test, y_train, y_test = train_test_split(df['embedding'], df['overall'], test_size= 0.33, random_state=22)

x_test, holdout_x_test, y_test, holdout_y_test = train_test_split(x_test, y_test, test_size= 0.2, random_state=22)

dataframes = [x_train, y_train, x_test, y_test, holdout_x_test, holdout_y_test]

for i in dataframes:
    i.reset_index(drop=True, inplace=True)

In [45]:
class ReviewDataSet(Dataset):
    def __init__(self, reviews, ratings):
        self.reviews = reviews 
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        reviews = self.reviews[index]
        ratings = self.ratings[index]
        
        # pytorch starts at 0 just like python 
        return reviews, int(ratings)-1

batch_size = 2
train_set = ReviewDataSet(reviews=x_train, ratings=y_train)
valid_set = ReviewDataSet(reviews=x_test, ratings=y_test)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=0)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=0)

In [24]:
for ratings, labels in train_loader:
    print('Size of reviews:', ratings.size())  
    print('Type of reviews:', ratings.dtype)   # float32
    print('Size of label:', ratings.size())  # batch_size
    print('Type of label:', ratings.dtype)   # int64(long)
    break

Size of reviews: torch.Size([2, 768])
Type of reviews: torch.float32
Size of label: torch.Size([2, 768])
Type of label: torch.float32


In [27]:
class Classifier(nn.Module):
    def __init__(self):
    # def __init__(self, dw, h1, A):
        super(Classifier, self).__init__()
        # self.Wa = nn.Parameter(torch.Tensor(dw, h1))
        # self.ba = nn.Parameter(torch.Tensor(h1))

        ## Initialize weights with a uniform distribution between -0.01 and 0.01
        # init.uniform_(self.Wa, -0.01, 0.01)
        ## Initialize bias with zeros 
        # init.uniform_(self.ba, -0.01, 0.01) # zeros 

        self.fc_layers = nn.Sequential(
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(256, 5),
            nn.Softmax(dim=1)
        )

    # def aspect_layers(self, x, ):
    #     # X = f(X*W + b) where f() is the activation function according to the research paper 
    #     Xu_a = torch.matmul(word_vectors, self.Wa) + self.ba
    #     Xu_a = nn.ReLU(Xu_a)  
    #     return Xu_a

    def forward(self, x):
        pred_ratings = self.fc_layers(x)
        return pred_ratings

# Aspect-Sentiment Transformation Layer Parameters
# h1 = 3  # Hidden dimension
# dw = 768  # embedding dimension
# A = 3  # Number of aspects

n_epochs = 30
lr = 0.0001
break_counter = 0
criterion = nn.CrossEntropyLoss()
best = float('inf')  # Initialize with a large value
path = './model.pth'

device = "cuda" 
model = Classifier().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(n_epochs): 

    # training mode
    model.train()
    train_loss = []
    train_accs = []

    for batch in tqdm(train_loader):
        reviews, rating = batch 

        logits = model(reviews.to(device))
        loss = criterion(logits, rating.to(device))

        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

        pred = torch.argmax(logits, dim=1)
        acc = sum(pred==rating.to(device)).item()

        # acc = (logits.argmax(dim=-1) == rating.float().to(device)).float().mean()

        # Record the loss and accuracy.
        train_loss.append(loss.item())
        train_accs.append(acc)
    
    # The average loss and accuracy of the training set is the average of the recorded values.
    train_loss = sum(train_loss) / len(train_loss)
    train_acc = sum(train_accs) / len(train_accs)

    print(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")

    # ---------- Validation ----------
    # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.
    model.eval()

    # These are used to record information in validation.
    valid_loss = []
    valid_accs = []

    # Iterate the validation set by batches.
    for batch in tqdm(valid_loader):

        reviews, rating = batch 

        with torch.no_grad():
            logits = model(reviews.to(device))

        # We can still compute the loss (but not the gradient).
        loss = criterion(logits, rating.to(device))

        # Compute the accuracy for current batch.
        pred = torch.argmax(logits, dim=1)
        acc = sum(pred==rating.to(device)).item()

        # Record the loss and accuracy.
        valid_loss.append(loss.item())
        valid_accs.append(acc)

    # The average loss and accuracy for entire validation set is the average of the recorded values.
    valid_loss = sum(valid_loss) / len(valid_loss)
    valid_acc = sum(valid_accs) / len(valid_accs)
    
    print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")

    if valid_loss < best:
        torch.save(model.state_dict(), path)
        best = valid_loss
        break_counter = 0
    else:
        break_counter += 1
        print(f'did not increase valid_acc. break counter now at {break_counter}')


100%|██████████| 3619/3619 [00:06<00:00, 529.58it/s]


[ Train | 001/030 ] loss = 1.51354, acc = 0.74523


100%|██████████| 1783/1783 [00:01<00:00, 1262.37it/s]


[ Valid | 001/030 ] loss = 1.46815, acc = 0.86540


100%|██████████| 3619/3619 [00:07<00:00, 470.88it/s]


[ Train | 002/030 ] loss = 1.46602, acc = 0.86433


100%|██████████| 1783/1783 [00:01<00:00, 1438.37it/s]


[ Valid | 002/030 ] loss = 1.43267, acc = 0.93326


100%|██████████| 3619/3619 [00:07<00:00, 505.83it/s]


[ Train | 003/030 ] loss = 1.43762, acc = 0.92401


100%|██████████| 1783/1783 [00:01<00:00, 1381.96it/s]


[ Valid | 003/030 ] loss = 1.42007, acc = 0.96635


100%|██████████| 3619/3619 [00:06<00:00, 521.96it/s]


[ Train | 004/030 ] loss = 1.42971, acc = 0.94004


100%|██████████| 1783/1783 [00:01<00:00, 1470.54it/s]


[ Valid | 004/030 ] loss = 1.41265, acc = 0.98598


100%|██████████| 3619/3619 [00:07<00:00, 461.02it/s]


[ Train | 005/030 ] loss = 1.41976, acc = 0.96297


100%|██████████| 1783/1783 [00:01<00:00, 1438.71it/s]


[ Valid | 005/030 ] loss = 1.40828, acc = 0.98542


100%|██████████| 3619/3619 [00:07<00:00, 490.74it/s]


[ Train | 006/030 ] loss = 1.41552, acc = 0.96214


100%|██████████| 1783/1783 [00:01<00:00, 1170.10it/s]


[ Valid | 006/030 ] loss = 1.40626, acc = 0.98542


 76%|███████▌  | 2759/3619 [00:05<00:01, 476.82it/s]


KeyboardInterrupt: 

Lets try out on the test set 

In [69]:
# holdout_x_test[0]
holdout_y_test[1]

4.0

In [78]:
# Some modules like Dropout or BatchNorm affect if the model is in training mode.
model.eval()
# unsqueeze turns 1d tensor into 2d
embedding = torch.unsqueeze(torch.tensor(holdout_x_test[1]), 0).to(device)
# adding 1 because we initially -1 from all ratings
torch.argmax(model(embedding), dim=1).cpu().numpy()+1 

array([3], dtype=int64)