In [None]:
%%bash
pip install -q pandas scikit-learn numpy matplotlib seaborn torch torchvision torchaudio transformers datasets sentence-transformers mlflow


### Multitask Transformer (Aspect + Sentiment)
Fine-tune a shared encoder with two heads: one for aspect classification and one for sentiment classification.


In [None]:
import pandas as pd
from pathlib import Path
from datasets import Dataset

# Load the labelled comments. The relative path is resolved against the
# kernel's current working directory.
df = pd.read_csv(Path('../../data/comments.csv'))

# Rows missing any of the three fields used below would turn the prompt into
# NaN and can make sorted() fail when it compares NaN with strings, so drop
# them up front. Resetting the index keeps a plain RangeIndex, which
# Dataset.from_pandas does not materialise as an extra column.
df = df.dropna(subset=['aspect', 'comment', 'label']).reset_index(drop=True)
if df.empty:
    raise ValueError('comments.csv contains no rows with aspect, comment and label all present')

# Model input text: the aspect name followed by the raw comment.
# astype(str) keeps the concatenation valid when a column was parsed as numeric.
# NOTE(review): the prompt embeds the aspect string while `aspect_id` (derived
# from the same column) is also used as a prediction target by `collate`, so
# the aspect head can read its answer from the input — confirm this is intended.
df['prompt'] = 'Aspect: ' + df['aspect'].astype(str) + ' | ' + df['comment'].astype(str)

# Deterministic label vocabularies: sorted unique values -> contiguous ids.
aspect_labels = sorted(df['aspect'].unique())
sentiment_labels = sorted(df['label'].unique())
aspect2id = {a: i for i, a in enumerate(aspect_labels)}
sent2id = {s: i for i, s in enumerate(sentiment_labels)}
df['aspect_id'] = df['aspect'].map(aspect2id)
df['sent_id'] = df['label'].map(sent2id)

# Keep only the columns the training pipeline reads, then hold out 20% of the
# rows as the 'test' split (fixed seed so the split is repeatable).
ds = Dataset.from_pandas(df[['prompt', 'aspect_id', 'sent_id']])
ds = ds.train_test_split(test_size=0.2, seed=42)


In [None]:
from transformers import AutoTokenizer
# Tokenizer belonging to the 'distilbert-base-uncased' checkpoint.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    """Encode the 'prompt' texts of a batch with the module-level tokenizer.

    Sequences are truncated and padded to ``max_length=128`` so every encoded
    example has the same length.

    Args:
        batch: Mapping with a 'prompt' entry holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    encode_options = dict(truncation=True, padding='max_length', max_length=128)
    return tokenizer(batch['prompt'], **encode_options)


# Apply the tokenizer to every split in batched mode.
tokenized = ds.map(tokenize, batched=True)


In [None]:
import torch
from torch import nn
from transformers import AutoModel, Trainer, TrainingArguments

class DualHeadModel(nn.Module):
    """Shared pretrained encoder with two linear classification heads.

    One head scores aspects and the other scores sentiments; both read the
    encoder's hidden state at the first sequence position.
    """

    def __init__(self, base_model_name, num_aspects, num_sentiments):
        """Load the encoder and size both heads from its hidden width.

        Args:
            base_model_name: Identifier passed to ``AutoModel.from_pretrained``.
            num_aspects: Output size of the aspect head.
            num_sentiments: Output size of the sentiment head.
        """
        super().__init__()
        self.encoder = AutoModel.from_pretrained(base_model_name)
        width = self.encoder.config.hidden_size
        self.aspect_head = nn.Linear(width, num_aspects)
        self.sent_head = nn.Linear(width, num_sentiments)

    def forward(self, input_ids=None, attention_mask=None, labels=None, aspect_labels=None, sent_labels=None):
        """Run the encoder and both heads, optionally computing the joint loss.

        Args:
            input_ids: Forwarded to the encoder.
            attention_mask: Forwarded to the encoder.
            labels: Accepted but not used by this method.
            aspect_labels: Targets for the aspect head.
            sent_labels: Targets for the sentiment head.

        Returns:
            Dict with keys 'loss', 'aspect_logits' and 'sent_logits' (in that
            order). 'loss' is the sum of the two cross-entropy losses when
            both label tensors are given, otherwise ``None``.
        """
        encoded = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 serves as the pooled representation.
        first_token = encoded.last_hidden_state[:, 0]
        result = {
            'loss': None,
            'aspect_logits': self.aspect_head(first_token),
            'sent_logits': self.sent_head(first_token),
        }
        # The loss needs both target sets; with either missing it stays None.
        if aspect_labels is None or sent_labels is None:
            return result
        aspect_loss = nn.functional.cross_entropy(result['aspect_logits'], aspect_labels)
        sent_loss = nn.functional.cross_entropy(result['sent_logits'], sent_labels)
        result['loss'] = aspect_loss + sent_loss
        return result

# Seed torch before building the model: the two nn.Linear heads are randomly
# initialised at construction time, which happens before the Trainer exists,
# so without this their starting weights differ from run to run.
torch.manual_seed(42)

# One output unit per known aspect / sentiment label.
model = DualHeadModel('distilbert-base-uncased', len(aspect2id), len(sent2id))


In [None]:
def collate(batch):
    """Turn a list of example dicts into the tensor batch the model expects.

    Args:
        batch: Sequence of mappings, each providing 'input_ids',
            'attention_mask', 'aspect_id' and 'sent_id'. Other keys are ignored.

    Returns:
        Dict of tensors keyed 'input_ids', 'attention_mask', 'aspect_labels'
        and 'sent_labels' — the per-example ids are renamed to the keyword
        names used by ``DualHeadModel.forward``.
    """
    def _column(field):
        # Gather one field across all examples into a single tensor.
        return torch.tensor([example[field] for example in batch])

    return {
        'input_ids': _column('input_ids'),
        'attention_mask': _column('attention_mask'),
        'aspect_labels': _column('aspect_id'),
        'sent_labels': _column('sent_id'),
    }

import inspect

# The per-epoch evaluation option is called `eval_strategy` in current
# transformers releases and `evaluation_strategy` in older ones (the old name
# is rejected by recent versions). Pick whichever this installation accepts.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

args = TrainingArguments(
    output_dir='../../outputs/multitask-transformer',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_steps=5,
    learning_rate=5e-5,
    save_strategy='no',
    # By default the Trainer drops dataset columns whose names are not
    # forward() parameters. That would remove 'aspect_id' and 'sent_id' before
    # `collate` can read them (KeyError), so keep every column and let the
    # collator choose what to pass on.
    remove_unused_columns=False,
    # Name the label inputs explicitly. Otherwise the Trainer infers them from
    # every forward() parameter containing "label", including the unused
    # `labels`, which is never supplied — and evaluation then reports no loss.
    label_names=['aspect_labels', 'sent_labels'],
    **{eval_strategy_key: 'epoch'},
)

# Train on the 'train' split and evaluate on the held-out 'test' split,
# batching examples with the custom collator.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    data_collator=collate,
)
trainer.train()
