Remarks 

* BERT tokenizer had 768 word vectors, which can be a bit much. But the APIs for the tokenizer is very easy to use.
    *  Embeddings are also a bit sensitive to puncutations and stopwords. 
    * You can try tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") and print(tokenizer.tokenize("This is an example of the bert tokenizer"))
    * You should see that it tries to parse all elements in the sentence. 
* We can use Word2Vec but it may take a while to download the pretrained weights. The implementations are a bit harder 
    * Also word2vec is sensitive to unseen words. it will throw an exception when you try to vectorize unseen words. Therefore, we need to have a function to process them.
    * For example, in the paper I believe it used "UNK" or simply "pass" / torch.zeros()

In [126]:
import pickle 
import sys
import pandas as pd 
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import torch 
import torch.nn as nn
import torch.nn.functional as F

from sklearn.model_selection import train_test_split

# loading file 
with open('../components/review_embedding.pkl', 'rb') as handle:
    x = pickle.load(handle)

In [184]:
df = pd.DataFrame(x)
df = pd.concat([df.loc[df.overall>=4].sample(frac=0.08,random_state=22),df.loc[df.overall<4].sample(frac=0.3,random_state=22)]).reset_index()

x_train, x_test, y_train, y_test = train_test_split(df['embedding'], df['overall'], test_size= 0.33, random_state=22)

# x_test, holdout_x_test, y_test, holdout_y_test = train_test_split(x_test, y_test, test_size= 0.2, random_state=22)

# dataframes = [x_train, y_train, x_test, y_test, holdout_x_test, holdout_y_test]
dataframes = [x_train, y_train, x_test, y_test]

for i in dataframes:
    i.reset_index(drop=True, inplace=True)

In [189]:
df.overall.value_counts()

5.0    9487
3.0    8528
4.0    4335
1.0    4311
2.0    4018
Name: overall, dtype: int64

In [190]:
class ReviewDataSet(Dataset):
    def __init__(self, reviews, ratings):
        self.reviews = reviews 
        self.ratings = ratings

    def __len__(self):
        return len(self.ratings)
    
    def __getitem__(self, index):
        reviews = self.reviews[index]
        ratings = self.ratings[index]
        
        # pytorch starts at 0 just like python 
        return reviews, int(ratings)-1

batch_size = 128
train_set = ReviewDataSet(reviews=x_train, ratings=y_train)
valid_set = ReviewDataSet(reviews=x_test, ratings=y_test)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=0)
valid_loader = DataLoader(valid_set, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=0)

In [191]:
for ratings, labels in train_loader:
    print('Size of reviews:', ratings.size())  
    print('Type of reviews:', ratings.dtype)   # float32
    print('Size of label:', ratings.size())  # batch_size
    print('Type of label:', ratings.dtype)   # int64(long)
    break

Size of reviews: torch.Size([128, 768])
Type of reviews: torch.float32
Size of label: torch.Size([128, 768])
Type of label: torch.float32


In [208]:
class Classifier(nn.Module):
    # def __init__(self):
    def __init__(self, dw, h1, A):
        super(Classifier, self).__init__()
        self.h1 = h1
        self.Wa = nn.Parameter(torch.Tensor(dw, self.h1),requires_grad = True)
        self.ba = nn.Parameter(torch.Tensor(self.h1),requires_grad = True)

        ## Initialize weights with a uniform distribution between -0.01 and 0.01
        nn.init.uniform_(self.Wa, -0.01, 0.01)

        ## Initialize bias with zeros 
        nn.init.zeros_(self.ba) # zeros  
        
        self.fc_layers = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(self.h1, 256),
            nn.LeakyReLU(),
            nn.Dropout(p=0.2),
            nn.Linear(256, 5),

            # Probabilistic Output (link function)
            nn.Softmax(dim=1)
        )

    # utils 
    def torch_device(self):
        return "cuda" if torch.cuda.is_available() else "cpu"

    ## Multi-head word attention layer - not working 
    # def attention_layer(self, x):
    #      multihead_attn = nn.MultiheadAttention(768, self.h1).to("cuda")
    #      x, _ = multihead_attn(x,x,x)
    #      return x

    def aspect_layers(self, x):
        # X = f(X*W + b) where f() is the activation function according to the research paper 
        Xu_a = torch.matmul(x, self.Wa) + self.ba
        Xu_a = F.leaky_relu(Xu_a)
        return Xu_a

    def forward(self, x):
        x = self.aspect_layers(x)
        # x = self.attention_layer(x)

        pred_ratings = self.fc_layers(x)
        return pred_ratings
    

In [212]:
# Aspect-Sentiment Transformation Layer Parameters
h1 = 50  # Hidden dimension
dw = 768  # embedding dimension
A = 3  # Number of aspects

n_epochs = 5
lr = 0.001
break_counter = 0
criterion = nn.CrossEntropyLoss()
best = float('inf')  # Initialize with a large value
path = './model.pth'

device = "cuda" 
model = Classifier(dw, h1, A).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

for epoch in range(n_epochs): 

    # training mode
    model.train()
    train_loss = []
    train_accs = []

    for batch in tqdm(train_loader):
        reviews, rating = batch 

        logits = model(reviews.to(device))
        loss = criterion(logits, rating.to(device))

        optimizer.zero_grad()
        loss.backward()

        optimizer.step()

        pred = torch.argmax(logits, dim=1)
        acc = sum(pred==rating.to(device)).item()

        # acc = (logits.argmax(dim=-1) == rating.float().to(device)).float().mean()

        # Record the loss and accuracy.
        train_loss.append(loss.item())
        train_accs.append(acc)
    
    # The average loss and accuracy of the training set is the average of the recorded values.
    train_loss = sum(train_loss) / len(train_loss)
    train_acc = sum(train_accs) / len(train_accs)

    print(f"[ Train | {epoch + 1:03d}/{n_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")

    # ---------- Validation ----------
    # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.
    model.eval()

    # These are used to record information in validation.
    valid_loss = []
    valid_accs = []

    # Iterate the validation set by batches.
    for batch in tqdm(valid_loader):

        reviews, rating = batch 

        with torch.no_grad():
            logits = model(reviews.to(device))

        # We can still compute the loss (but not the gradient).
        loss = criterion(logits, rating.to(device))

        # Compute the accuracy for current batch.
        pred = torch.argmax(logits, dim=1)
        acc = sum(pred==rating.to(device)).item()

        # Record the loss and accuracy.
        valid_loss.append(loss.item())
        valid_accs.append(acc)

    # The average loss and accuracy for entire validation set is the average of the recorded values.
    valid_loss = sum(valid_loss) / len(valid_loss)
    valid_acc = sum(valid_accs) / len(valid_accs)
    
    print(f"[ Valid | {epoch + 1:03d}/{n_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")

    if valid_loss < best:
        torch.save(model.state_dict(), path)
        best = valid_loss
        break_counter = 0
    else:
        break_counter += 1
        print(f'did not increase valid_acc. break counter now at {break_counter}')


100%|██████████| 161/161 [00:02<00:00, 80.44it/s] 


[ Train | 001/005 ] loss = 1.50604, acc = 48.44720


100%|██████████| 80/80 [00:00<00:00, 145.16it/s]


[ Valid | 001/005 ] loss = 1.45728, acc = 57.63750


100%|██████████| 161/161 [00:01<00:00, 133.48it/s]


[ Train | 002/005 ] loss = 1.43187, acc = 59.16149


100%|██████████| 80/80 [00:00<00:00, 189.80it/s]


[ Valid | 002/005 ] loss = 1.42187, acc = 60.16250


100%|██████████| 161/161 [00:01<00:00, 155.04it/s]


[ Train | 003/005 ] loss = 1.41145, acc = 61.65217


100%|██████████| 80/80 [00:00<00:00, 212.58it/s]


[ Valid | 003/005 ] loss = 1.41151, acc = 60.93750


100%|██████████| 161/161 [00:01<00:00, 153.21it/s]


[ Train | 004/005 ] loss = 1.40609, acc = 62.60870


100%|██████████| 80/80 [00:00<00:00, 216.40it/s]


[ Valid | 004/005 ] loss = 1.40454, acc = 61.16250


100%|██████████| 161/161 [00:01<00:00, 157.82it/s]


[ Train | 005/005 ] loss = 1.39949, acc = 63.31677


100%|██████████| 80/80 [00:00<00:00, 229.08it/s]

[ Valid | 005/005 ] loss = 1.41456, acc = 60.31250
did not increase valid_acc. break counter now at 1





Lets try out on the test set 

In [78]:
# # Some modules like Dropout or BatchNorm affect if the model is in training mode.
# model.eval()
# # unsqueeze turns 1d tensor into 2d
# embedding = torch.unsqueeze(torch.tensor(holdout_x_test[1]), 0).to(device)
# # adding 1 because we initially -1 from all ratings
# torch.argmax(model(embedding), dim=1).cpu().numpy()+1 

array([3], dtype=int64)