In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Task 1: Sentence Transformer Implementation

In [2]:
class SentenceTransformerModel(nn.Module):
    def __init__(self, transformer_name="distilbert-base-uncased"):
        super(SentenceTransformerModel, self).__init__()
        self.tokenizer = AutoTokenizer.from_pretrained(transformer_name)
        self.transformer = AutoModel.from_pretrained(transformer_name)
        
    def encode(self, sentences):
        # Tokenize and encode sentences
        inputs = self.tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
        outputs = self.transformer(**inputs)
        
        # Using the embedding of the [CLS] token for sentence representation
        # Other approach could be obtaining the embeddings by averaging the last hidden states across the sequence
        embeddings = outputs.last_hidden_state[:, 0, :]  # [CLS] token is at index 0
        
        return embeddings


In [3]:
# Testing the model with sample sentences
model = SentenceTransformerModel()
test_sentences = ["I love machine learning!", "Transformers are powerful models."]
embeddings = model.encode(test_sentences)
print("Embeddings:", embeddings)
# Each sentence is represented by a vector of size 768
print("Length:", embeddings.shape)

Embeddings: tensor([[-0.0577,  0.0290, -0.0846,  ..., -0.0329,  0.3543,  0.3237],
        [-0.1678, -0.3774,  0.0050,  ..., -0.2765,  0.3918,  0.0882]],
       grad_fn=<SliceBackward0>)
Length: torch.Size([2, 768])


# Task 2: Multi-Task Learning Expansion

The dataset used contains six of the review topics used in the paper *John Blitzer, Mark Dredze, and Fernando Pereira: Biographies, Bollywood, Boom-boxes and Blenders: Domain Adaptation for Sentiment Classification. In Proceedings of the 45th Annual Meeting of the Association of Computational Linguistics (ACL 2007).*

The data has been formatted so that there is one review per line, and the texts have been tokenized and normalized. 

A line in the file is organized in columns as follows:

0: <b> topic</b> category label (books, camera, dvd, health, music, or software) <br>
1: <b>sentiment</b> category label (pos or neg) <br>
2: document identifier <br>
3 and on: the actual review  <br>



In [4]:
# Read the dataset
with open("data/all_reviews.txt") as fp:
    data = fp.readlines()
    all_classes = [review.split(" ")[0] for review in data]
    all_labels = [review.split(" ")[1] for review in data]
    all_reviews = [" ".join(review.split(" ")[3:]) for review in data]

print(f"Number of Reviews: {len(all_reviews)}")

# Printing 1
print("Label:", all_labels[0])
print("Class:", all_classes[0])
print("Review:", all_reviews[0])

Number of Reviews: 11914
Label: neg
Class: music
Review: i bought this album because i loved the title song . it 's such a great song , how bad can the rest of the album be , right ? well , the rest of the songs are just filler and are n't worth the money i paid for this . it 's either shameless bubblegum or oversentimentalized depressing tripe . kenny chesney is a popular artist and as a result he is in the cookie cutter category of the nashville music scene . he 's gotta pump out the albums so the record company can keep lining their pockets while the suckers out there keep buying this garbage to perpetuate more garbage coming out of that town . i 'll get down off my soapbox now . but country music really needs to get back to it 's roots and stop this pop nonsense . what country music really is and what it is considered to be by mainstream are two different things . 



In [5]:
# Using only 500 reviews
n_samples = 500
all_reviews = all_reviews[:n_samples]
all_labels = all_labels[:n_samples]
all_classes = all_classes[:n_samples]

In [6]:
print("Unique Labels:", set(all_labels))
print("Unique Classes:", set(all_classes))

Unique Labels: {'pos', 'neg'}
Unique Classes: {'software', 'books', 'music', 'dvd', 'camera', 'health'}


In [7]:
# Encoding the labels and classes
label_encoder = {"pos": 1, "neg": 0}
class_encoder = {"books": 0, "camera": 1, "dvd": 2, "health": 3, "music": 4, "software": 5}
label_decoder = {v: k for k, v in label_encoder.items()}
class_decoder = {v: k for k, v in class_encoder.items()}

all_classes = torch.tensor([class_encoder[cls] for cls in all_classes])
all_labels = torch.tensor([label_encoder[label] for label in all_labels])

In [8]:
# Splitting the dataset into train and test
split_point = 0.8
split_idx = int(len(all_reviews) * split_point)
    
train_reviews = all_reviews[:split_idx]
test_reviews = all_reviews[split_idx:]
train_labels = all_labels[:split_idx]
test_labels = all_labels[split_idx:]
train_classes = all_classes[:split_idx]
test_classes = all_classes[split_idx:]

print(f"Train Size: {len(train_reviews)}, Test Size: {len(test_reviews)}")

Train Size: 400, Test Size: 100


### Model Definition

In [9]:
class MultiTaskModel(SentenceTransformerModel):
    def __init__(self, transformer_name="distilbert-base-uncased"):
        super().__init__(transformer_name)
        
        # Freezing the transformer backbone weights
        for param in self.transformer.parameters():
            param.requires_grad = False
        
        self.sentiment_classifier = nn.Sequential(
            nn.Linear(self.transformer.config.hidden_size, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, 1),
            nn.Sigmoid()
        )

        self.sentence_classifier = nn.Sequential(
            nn.Linear(self.transformer.config.hidden_size, 100),
            nn.ReLU(),
            nn.Linear(100, 50),
            nn.ReLU(),
            nn.Linear(50, 6),  # Output layer for 6 classes
            nn.Softmax(dim=1)  # Softmax for multi-class classification
        )

    def forward(self, sentences):
        # Obtain the embeddings from the transformer backbone
        embeddings = self.encode(sentences)
        sentiment_pred = self.sentiment_classifier(embeddings)
        class_pred = self.sentence_classifier(embeddings)
        return sentiment_pred, class_pred
    
    def loss(self, sentences, labels, classes):
        sentiment_pred, class_pred = self.forward(sentences)

        loss = F.binary_cross_entropy(input=sentiment_pred.squeeze(), target=labels.float())
        loss += F.cross_entropy(input=class_pred, target=classes)
        return loss

model = MultiTaskModel()

### Layer-Wise Learning Rate Implementation

Layer-wise learning rates is defined to set higher rates for the top layers and lower rates for the earlier (more general) layers

In [10]:
for name, p in model.transformer.named_parameters():
    print(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.LayerNorm.weight
embeddings.LayerNorm.bias
transformer.layer.0.attention.q_lin.weight
transformer.layer.0.attention.q_lin.bias
transformer.layer.0.attention.k_lin.weight
transformer.layer.0.attention.k_lin.bias
transformer.layer.0.attention.v_lin.weight
transformer.layer.0.attention.v_lin.bias
transformer.layer.0.attention.out_lin.weight
transformer.layer.0.attention.out_lin.bias
transformer.layer.0.sa_layer_norm.weight
transformer.layer.0.sa_layer_norm.bias
transformer.layer.0.ffn.lin1.weight
transformer.layer.0.ffn.lin1.bias
transformer.layer.0.ffn.lin2.weight
transformer.layer.0.ffn.lin2.bias
transformer.layer.0.output_layer_norm.weight
transformer.layer.0.output_layer_norm.bias
transformer.layer.1.attention.q_lin.weight
transformer.layer.1.attention.q_lin.bias
transformer.layer.1.attention.k_lin.weight
transformer.layer.1.attention.k_lin.bias
transformer.layer.1.attention.v_lin.weight
transformer.lay

In [11]:
# Unfreezing some layers of the transformer and setting different learning rates within the backbone
for name, param in model.transformer.named_parameters():
    if "layer.5.output_layer_norm" in name or "layer.5.ffn" in name: 
        param.requires_grad = True

optimizer = optim.AdamW([
    {'params': model.sentiment_classifier.parameters(), 'lr': 1e-3},
    {'params': model.sentence_classifier.parameters(), 'lr': 1e-3},
    {'params': [param for _, param in model.transformer.named_parameters() if param.requires_grad], 'lr': 5e-5}
])


### Model Training

In [12]:
batch_size = 20
epochs = 20
patience = 2  # Number of epochs to wait for improvement
best_loss = float('inf')
no_improvement = 0

# Training 
model.train()
train_n = len(train_reviews)
for epoch in range(epochs):
    total_loss = 0
    curr_idx = 0
    while curr_idx < train_n:
        # Get batch
        reviews = train_reviews[curr_idx: min(curr_idx + batch_size, train_n)]
        labels = train_labels[curr_idx: min(curr_idx + batch_size, train_n)]
        classes = train_classes[curr_idx: min(curr_idx + batch_size, train_n)]

        loss = model.loss(sentences=reviews, classes=classes, labels=labels)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += loss.item()
        curr_idx += batch_size

    avg_loss = round(total_loss / train_n, 4)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {avg_loss}")

    # Early stopping
    if avg_loss < best_loss:
        best_loss = avg_loss
        no_improvement = 0
    else:
        no_improvement += 1
        if no_improvement >= patience:
            print("Early stopping: No improvement for {} epochs".format(patience))
            break


Epoch 1/20, Loss: 0.1225
Epoch 2/20, Loss: 0.1104
Epoch 3/20, Loss: 0.0902
Epoch 4/20, Loss: 0.078
Epoch 5/20, Loss: 0.0726
Epoch 6/20, Loss: 0.0686
Epoch 7/20, Loss: 0.0678
Epoch 8/20, Loss: 0.0662
Epoch 9/20, Loss: 0.0656
Epoch 10/20, Loss: 0.0631
Epoch 11/20, Loss: 0.0597
Epoch 12/20, Loss: 0.0607
Epoch 13/20, Loss: 0.0594
Epoch 14/20, Loss: 0.0621
Epoch 15/20, Loss: 0.0599
Early stopping: No improvement for 2 epochs


In [13]:
model.eval()
test_n = len(test_reviews)
correct_sentiment = correct_class = 0
batch_size = 32

with torch.no_grad():
    curr_idx = 0
    while curr_idx < test_n:
        reviews = test_reviews[curr_idx: min(curr_idx + batch_size, test_n)]
        labels = test_labels[curr_idx: min(curr_idx + batch_size, test_n)]
        classes = test_classes[curr_idx: min(curr_idx + batch_size, test_n)]

        sentiment_pred, class_pred = model(reviews)
        sentiment_pred = (sentiment_pred.squeeze() >= 0.5).float()
        class_pred = torch.argmax(class_pred, dim=1)

        correct_sentiment += (sentiment_pred == labels).sum().item()
        correct_class += (class_pred == classes).sum().item()
        curr_idx += batch_size

sentiment_accuracy = correct_sentiment / test_n
class_accuracy = correct_class / test_n
print(f"Sentiment Accuracy: {sentiment_accuracy}, Class Accuracy: {class_accuracy}")



Sentiment Accuracy: 0.74, Class Accuracy: 0.89


In [14]:
torch.save(model.state_dict(), "model/model_500_reviews.pt")

### To load saved model and run prediction

In [15]:
model = MultiTaskModel()
model.load_state_dict(torch.load("model/model_500_reviews.pt", weights_only=True))

<All keys matched successfully>

In [16]:
sentiment_pred, class_pred = model("i bought this album because i loved the title song . it 's such a great song , how bad can the rest of the album be , right ? well , the rest of the songs are just filler and are n't worth the money i paid for this . it 's either shameless bubblegum or oversentimentalized depressing tripe . kenny chesney is a popular artist and as a result he is in the cookie cutter category of the nashville music scene . he 's gotta pump out the albums so the record company can keep lining their pockets while the suckers out there keep buying this garbage to perpetuate more garbage coming out of that town . i 'll get down off my soapbox now . but country music really needs to get back to it 's roots and stop this pop nonsense . what country music really is and what it is considered to be by mainstream are two different things")
sentiment_pred = (sentiment_pred >= 0.5).int().item()
class_pred = torch.argmax(class_pred, dim=1).item()

print("Label:", label_decoder[sentiment_pred])
print("Class:", class_decoder[class_pred])

Label: neg
Class: music
