In [1]:
from pathlib import Path
import pandas as pd
import transformers
from transformers import pipeline
from datasets import load_dataset

## Load the data

In [None]:
# data_path = Path("/data/wkt406/covid-patients-progress-notes.csv")
# df = pd.read_csv(data_path)
# df.head()

## Tokenize text

In [None]:
from transformers import AutoTokenizer

In [None]:
# Load the tokenizer for the "distilbert-base-uncased" checkpoint. The same
# checkpoint name is assigned to `model_ckpt` further down when the model is loaded.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# Round-trip a sample sentence: encode it, map the resulting ids back to
# token strings, then join those tokens into a single string again.
encoded_text = tokenizer("This is a test of the fantastically capable tokenizer.")
tokens = tokenizer.convert_ids_to_tokens(encoded_text['input_ids'])
text = tokenizer.convert_tokens_to_string(tokens)

In [None]:
# Show the token strings and the text rebuilt from them.
print(tokens)
print(text)

In [None]:
# Load the "emotion" dataset; later cells use its 'train' and 'validation' splits.
emotions = load_dataset("emotion")

In [None]:
# Peek at the first ten training texts.
emotions['train']['text'][:10]

In [None]:
# Switch the output format to pandas. This changes `emotions` in place and
# stays in effect until the reset_format() call a few cells below.
emotions.set_format(type="pandas")

In [None]:
# Materialise the whole training split while the pandas format is active.
df = emotions['train'][:]

In [None]:
# Display the training data loaded in the previous cell.
df

In [None]:
# Undo the pandas output format set earlier before the dataset is tokenized.
emotions.reset_format()

In [None]:
def tokenize(batch):
    """Tokenize the "text" entries of a batch with the notebook-level tokenizer.

    Args:
        batch: Mapping that holds the raw strings under the "text" key.

    Returns:
        Whatever the tokenizer returns for those strings, with padding and
        truncation both enabled.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True, padding=True)

In [None]:
# Sanity-check `tokenize` on the first two training examples.
tokenize(emotions['train'][:2])

In [None]:
# Tokenize every split. With batched=True and batch_size=None each split is
# passed to `tokenize` as a single batch, so its padding=True pads every
# example to the longest one in that split.
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=None)

In [None]:
# Inspect the tokenized dataset.
emotions_encoded

## Extracting hidden states

In [None]:
import torch
from transformers import AutoModel

In [None]:
# Checkpoint shared by the feature-extraction model here and the
# classification model loaded later in the notebook.
model_ckpt = "distilbert-base-uncased"
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the model and move it to the chosen device.
model = AutoModel.from_pretrained(model_ckpt).to(device)

In [None]:
def extract_hidden_states(batch):
    """Run the notebook-level model on a batch and return its first-position hidden state.

    Relies on the notebook globals `tokenizer`, `model` and `device`.

    Args:
        batch: Mapping of column name to tensor. Only the entries whose key
            is listed in `tokenizer.model_input_names` are fed to the model.

    Returns:
        Dict with one key, "hidden_state", holding the last hidden state at
        sequence position 0 as a NumPy array.
    """
    accepted_names = tokenizer.model_input_names
    # Move the model inputs onto the same device as the model (GPU or CPU).
    inputs = {}
    for name, tensor in batch.items():
        if name in accepted_names:
            inputs[name] = tensor.to(device)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        outputs = model(**inputs)
        last_hidden_state = outputs.last_hidden_state
    # Keep the vector at position 0 (the [CLS] token) and hand it back on the CPU.
    cls_vectors = last_hidden_state[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [None]:
# Expose the listed columns as torch tensors: extract_hidden_states calls
# .to(device) on the batch values it forwards to the model.
emotions_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [None]:
# Apply extract_hidden_states batch by batch; its returned dict adds a
# "hidden_state" column to every split.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

In [None]:
# Spot-check the first training example: its text and the shape of its
# extracted hidden-state vector.
print(emotions_hidden['train']['text'][0])
print(emotions_hidden['train']['hidden_state'][0].shape)

## Train a classifier

In [None]:
from transformers import AutoModelForSequenceClassification

# NOTE(review): 6 is presumably the number of classes in the emotion dataset —
# confirm against the dataset's label feature.
num_labels = 6
# Caution: this rebinds the global `model`, replacing the AutoModel instance
# created earlier. extract_hidden_states reads that global, so re-running it
# after this cell would call the classification model instead.
model = (AutoModelForSequenceClassification
         .from_pretrained(model_ckpt, num_labels=num_labels)
         .to(device))

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    """Score a set of predictions with accuracy and weighted F1.

    Args:
        pred: Object exposing `label_ids` (reference labels) and
            `predictions` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 64
# Log roughly once per epoch (on a single device). The max(1, ...) guard keeps
# the value positive when the training split holds fewer than `batch_size`
# examples, where the floor division alone would give 0.
logging_steps = max(1, len(emotions_encoded["train"]) // batch_size)
# Checkpoints and logs are written to a directory named after the checkpoint.
model_name = f"{model_ckpt}-finetuned-emotion"
# `optim` is given as an optimizer name string, not as a torch optimizer class.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(output_dir=model_name,
                                  optim="adamw_torch",
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  push_to_hub=False,
                                  log_level="error")

In [None]:
from transformers import Trainer

# Fine-tune the classification model on the tokenized train split and evaluate
# on the validation split with compute_metrics. `emotions_encoded` still has
# the torch format (input_ids, attention_mask, label) set in an earlier cell.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=emotions_encoded["train"],
                  eval_dataset=emotions_encoded["validation"],
                  tokenizer=tokenizer)
# Start training.
trainer.train()