In [None]:
%%bash
# Install the notebook's dependencies.
# - `set -e` stops at the first failed install instead of letting the second
#   command's exit status hide the failure (matches the other cells).
# - `python -m pip` installs into the same `python` interpreter that the later
#   cells invoke via `python - <<'PY'`, rather than whichever `pip` is first on PATH.
# NOTE(review): transformers/datasets/accelerate are unpinned here; consider
# pinning them in requirements.txt for reproducible runs.
set -e
python -m pip install -r ../../requirements.txt
python -m pip install transformers datasets accelerate


### Objective
Fine-tune a transformer with aspect prompts for sentiment classification.


In [None]:
%%bash
set -e
# Build the aspect-prompted CSV: one "Aspect: <aspect> | Text: <comment>" string
# per row, paired with its sentiment label.
python - <<'PY'
import pandas as pd

SOURCE_CSV = '../../data/teacher_course.csv'
PROMPTED_CSV = '../../data/teacher_course_prompted.csv'
REQUIRED_COLUMNS = ['aspect', 'comments', 'sentiment']

raw = pd.read_csv(SOURCE_CSV)

# A missing aspect/comment would turn the whole concatenated prompt into NaN,
# which is written as an empty field; a missing sentiment is an unusable label.
# Drop such rows up front and say how many were removed.
complete = raw.dropna(subset=REQUIRED_COLUMNS)
dropped = len(raw) - len(complete)
if dropped:
    print(f'Dropped {dropped} row(s) with a missing aspect, comments or sentiment value')

# astype(str) keeps the concatenation valid even if pandas inferred a
# non-string dtype for either column.
prompted = (
    'Aspect: ' + complete['aspect'].astype(str).str.lower()
    + ' | Text: ' + complete['comments'].astype(str)
)

complete.assign(prompted=prompted)[['prompted', 'sentiment']].to_csv(PROMPTED_CSV, index=False)
PY


In [None]:
%%bash
set -e
# Fine-tune DistilBERT on the aspect-prompted CSV, evaluate on a held-out split
# after every epoch, and save the best checkpoint (by macro-F1) with its tokenizer.
python - <<'PY'
import inspect

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score

SEED = 42
EVAL_FRACTION = 0.2
OUTPUT_DIR = '../../outputs/transformer-aspect-prompts'
BEST_DIR = '../../outputs/transformer-aspect-prompts/best'

model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
dataset = load_dataset('csv', data_files='../../data/teacher_course_prompted.csv').class_encode_column('sentiment')

def tokenize(batch):
    """Tokenize a batch of prompted texts, truncated/padded to 256 tokens."""
    return tokenizer(batch['prompted'], truncation=True, padding='max_length', max_length=256)

tokenized = dataset.map(tokenize, batched=True)
tokenized = tokenized.rename_column('sentiment', 'labels').remove_columns(['prompted'])

# Hold out a stratified, seeded slice for evaluation. Evaluating on the
# training rows themselves would report an optimistic, leaked macro-F1.
# Stratification needs at least two examples of every class.
splits = tokenized['train'].train_test_split(
    test_size=EVAL_FRACTION,
    stratify_by_column='labels',
    seed=SEED,
)

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=splits['train'].features['labels'].num_classes,
)

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases; use whichever name the installed version accepts.
training_args_params = inspect.signature(TrainingArguments.__init__).parameters
eval_strategy_arg = 'eval_strategy' if 'eval_strategy' in training_args_params else 'evaluation_strategy'

args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    # Checkpoint on the same schedule as evaluation so the best epoch can be
    # reloaded at the end; otherwise the model saved under "best" is just the last one.
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    seed=SEED,
    **{eval_strategy_arg: 'epoch'},
)

def compute_metrics(eval_pred):
    """Return macro-averaged F1 for the (logits, labels) pair passed by Trainer."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {'macro_f1': f1_score(labels, preds, average='macro')}

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=splits['train'],
    eval_dataset=splits['test'],
    compute_metrics=compute_metrics,
)
trainer.train()
trainer.save_model(BEST_DIR)
# Save the tokenizer next to the weights so the directory can be loaded on its own.
tokenizer.save_pretrained(BEST_DIR)
PY
