In [None]:
%%bash
pip install -q pandas scikit-learn numpy matplotlib seaborn torch torchvision torchaudio transformers datasets sentence-transformers mlflow


### Transformer Fine-Tuning with Aspect Prompts
Fine-tune a single-head DistilBERT classifier that receives the aspect as part of its input: each example is the prompt `Aspect: <aspect> | <comment>`.


In [None]:
import pandas as pd
from pathlib import Path
from datasets import Dataset

# Load the labelled comments. Every row must carry an aspect, a comment and a label:
# a missing aspect/comment would turn the concatenated prompt into NaN (str + NaN),
# and a missing label would make the sorted() call below fail on mixed str/float values.
df = (
    pd.read_csv(Path('../../data/comments.csv'))
    .dropna(subset=['aspect', 'comment', 'label'])
    .reset_index(drop=True)
)

# Build the model input by prepending the aspect to the comment text.
# astype(str) keeps the concatenation valid even if a column was parsed as numeric.
df['prompt'] = 'Aspect: ' + df['aspect'].astype(str) + ' | ' + df['comment'].astype(str)

# Map each distinct label to a stable integer id (sorted so the mapping is deterministic).
labels = sorted(df['label'].unique())
label2id = {l: i for i, l in enumerate(labels)}
df['label_id'] = df['label'].map(label2id)

# Only the prompt and the integer label go into the Dataset; the pandas index is not exported.
ds = Dataset.from_pandas(df[['prompt', 'label_id']], preserve_index=False)

# 80/20 train/test split with a fixed seed so the split is reproducible.
ds = ds.train_test_split(test_size=0.2, seed=42)


In [None]:
from transformers import AutoTokenizer
# Tokenizer matching the checkpoint that is fine-tuned later in the notebook.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


def tokenize(batch):
    """Tokenize the ``prompt`` field of a batch.

    Every sequence is truncated and padded to exactly 128 tokens, so all
    encoded examples have the same length.
    """
    options = {'truncation': True, 'padding': 'max_length', 'max_length': 128}
    return tokenizer(batch['prompt'], **options)


# Apply the tokenizer to both splits, processing examples in batches.
tokenized = ds.map(tokenize, batched=True)


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# DistilBERT with a sequence-classification head sized to the label set.
# Both label mappings are stored in the model config; id2label is simply
# label2id inverted.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label2id),
    label2id=label2id,
    id2label={idx: name for name, idx in label2id.items()},
)

import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`, and recent releases
# reject the old keyword. Pick whichever name the installed version accepts so the
# cell runs on both old and new releases.
_eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

# Training configuration: one epoch, batch size 8 for train and eval,
# evaluation at the end of each epoch, no checkpoints written.
args = TrainingArguments(
    output_dir='../../outputs/transformer-aspect-prompts',
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    save_strategy='no',
    # By default Trainer drops dataset columns that are not arguments of the model's
    # forward(); that would remove 'label_id', which the custom collator reads
    # (KeyError otherwise). Keep all columns and let the collator select what it needs.
    remove_unused_columns=False,
    **{_eval_strategy_key: 'epoch'},
)


In [None]:
def collate(batch):
    """Stack a list of tokenized examples into a dict of tensors.

    Each example must provide ``input_ids``, ``attention_mask`` and ``label_id``.
    The result contains ``input_ids`` and ``attention_mask`` stacked along a new
    leading dimension, plus ``labels`` built from the ``label_id`` values.
    Any other fields on the examples are ignored.
    """
    import torch

    collated = {}
    for field in ('input_ids', 'attention_mask'):
        collated[field] = torch.tensor([example[field] for example in batch])
    # The label is exposed under the key 'labels' rather than 'label_id'.
    collated['labels'] = torch.tensor([example['label_id'] for example in batch])
    return collated

# Assemble the Trainer from the pieces built above: the classifier, the training
# arguments, the tokenized train split for fitting, the tokenized test split for
# evaluation, and the custom collator that turns examples into tensors.
trainer = Trainer(
    args=args,
    model=model,
    data_collator=collate,
    eval_dataset=tokenized['test'],
    train_dataset=tokenized['train'],
)

# Run fine-tuning; the returned training summary is displayed as the cell output.
trainer.train()
