<a href="https://colab.research.google.com/github/schreinersoft/big5-ki-personality/blob/main/Universal_Tunstall_Fine_Tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Fine-Tuning ausgehend von einem vortrainierten Modell

## Setup

In [None]:
# Base checkpoint to fine-tune (alternative: "distilbert-base-uncased").
modelckpt = "distilbert-base-cased"
# Name derived from the checkpoint; used for the fine-tuned model's output.
model_name = f"{modelckpt}-test-finetuned-sms-spam"
# Dataset identifier to load (alternative: "emotion").
dataset_name = "ucirvine/sms_spam"
# Number of target classes (the overridden value 6 presumably belonged to
# the "emotion" alternative -- confirm before switching datasets).
num_labels = 2

In [None]:
!pip install datasets
from transformers import AutoTokenizer
# Load the tokenizer that belongs to the chosen base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=modelckpt,
)


In [None]:
from datasets import load_dataset
# Fetch the dataset selected by dataset_name.
dataset = load_dataset(path=dataset_name)


In [None]:
def tokenize(batch):
    """Tokenize the "sms" texts of a batch with padding and truncation enabled."""
    texts = batch["sms"]
    return tokenizer(texts, padding=True, truncation=True)


# batched=True with batch_size=None: per the datasets documentation the whole
# split is handed to tokenize() as a single batch.
dataset_encoded = dataset.map(
    tokenize,
    batched=True,
    batch_size=None,
)

In [None]:
# Make sure a "validation" split exists without overwriting one that the
# dataset already provides. The check for an existing validation split must
# come first: otherwise a dataset with train/validation/test would have its
# validation split replaced by the test split.
if "validation" in dataset_encoded:
    print("Dataset already has a validation split, keeping it")
elif "train" in dataset_encoded and len(dataset_encoded) == 1:
    # Only "train" is available: hold out 20 % (fixed seed for
    # reproducibility) and store the held-out part as "validation".
    dataset_encoded = dataset_encoded["train"].train_test_split(test_size=0.2, seed=42)
    dataset_encoded["validation"] = dataset_encoded.pop("test")
    print("Dataset split into train, validation")
elif "train" in dataset_encoded and "test" in dataset_encoded:
    # "test" is kept; "validation" refers to the same data.
    dataset_encoded["validation"] = dataset_encoded["test"]
    print("Dataset has train and test, using test as validation")
else:
    # Nothing to derive a validation split from; report what is there so the
    # later lookup of "validation" does not fail without explanation.
    print(f"No validation split could be derived from splits: {list(dataset_encoded)}")
dataset_encoded

In [None]:
dataset_encoded

# Training

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The classification head is sized by num_labels, which must match the
# number of classes in the dataset.
model = AutoModelForSequenceClassification.from_pretrained(
    modelckpt,
    num_labels=num_labels,
)
model = model.to(device)

### Metriken hinzufügen

In [None]:
from sklearn.metrics import accuracy_score, f1_score
def computemetrics(pred):
    """Compute accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing `label_ids` (true labels) and `predictions`
            (per-class scores; the argmax over the last axis is taken as
            the predicted class).

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_hat = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_hat),
        "f1": f1_score(y_true, y_hat, average="weighted"),
    }


### Bei Hugging Face einloggen

In [None]:
from huggingface_hub import notebook_login
import os
# Prompt for Hugging Face credentials inside the notebook
# (presumably required because the training arguments set push_to_hub=True).
notebook_login()


In [None]:
import os

from transformers import Trainer, TrainingArguments

batchsize = 64
# Log roughly once per epoch. Clamp to 1: a train split smaller than one
# batch would otherwise yield logging_steps=0, which TrainingArguments
# rejects for step-based logging.
loggingsteps = max(1, len(dataset_encoded["train"]) // batchsize)
trainingargs = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batchsize,
    per_device_eval_batch_size=batchsize,
    weight_decay=0.01,
    eval_strategy="epoch",
    disable_tqdm=False,
    logging_steps=loggingsteps,
    push_to_hub=True,
    log_level="error",
)
# SECURITY: never store API keys in the notebook. The Weights & Biases key
# is read from the WANDB_API_KEY environment variable instead (None when
# unset). The key that used to be hard-coded here has been exposed and
# should be revoked.
wandb_token = os.environ.get("WANDB_API_KEY")

In [None]:
from transformers import Trainer
# Wire model, arguments, data splits and metrics together.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour
# of `processing_class=` -- confirm against the installed version.
trainer = Trainer(
    model=model,
    args=trainingargs,
    train_dataset=dataset_encoded["train"],
    eval_dataset=dataset_encoded["validation"],
    tokenizer=tokenizer,
    compute_metrics=computemetrics,
)
# The trailing semicolon keeps the notebook from echoing the return value.
trainer.train();


In [None]:
# Run the trained model on the validation split; the result's `metrics` and
# `predictions` attributes are read in the following cells.
predsoutput = trainer.predict(dataset_encoded["validation"])


In [None]:
predsoutput.metrics

In [None]:
import numpy as np
# Predicted class id per example: index of the largest score along axis 1.
y_preds = np.argmax(a=predsoutput.predictions, axis=1)


### Confusion Matrix zur Visualisierung

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt
def plot_confusion_matrix(ypreds, ytrue, labels):
    """Show a confusion matrix normalized over the true labels.

    Args:
        ypreds: Predicted class ids.
        ytrue: True class ids.
        labels: Display names for the classes.
    """
    matrix = confusion_matrix(ytrue, ypreds, normalize="true")
    _, axes = plt.subplots(figsize=(6, 6))
    ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels).plot(
        cmap="YlOrRd",
        values_format=".2f",
        ax=axes,
        colorbar=False,
    )
    plt.title("Normalized confusion matrix")
    plt.show()
# True class ids of the validation split
y_valid = dataset_encoded["validation"]["label"]
# Class names, read from the "label" feature of the train split
labels = dataset_encoded["train"].features["label"].names
# Compare the predictions with the true labels
plot_confusion_matrix(y_preds, y_valid, labels)


## Error Analysis: Loss je Kategorie ermitteln

In [None]:
from torch.nn.functional import cross_entropy
def forward_pass_with_label(batch):
    """Return the per-example loss and the predicted label for a batch.

    Args:
        batch: Mapping of column names to tensors. Must contain "label" and
            the model inputs listed in `tokenizer.model_input_names`.

    Returns:
        Dict with two NumPy arrays: "loss" (unreduced cross-entropy, one
        value per example) and "predictedlabel" (argmax of the logits).
    """
    # Place all input tensors on the same device as the model
    inputs = {k: v.to(device) for k, v in batch.items() if k in tokenizer.model_input_names}
    # Disable dropout so the losses are deterministic even when the model
    # was left in training mode.
    model.eval()
    with torch.no_grad():
        output = model(**inputs)
        pred_label = torch.argmax(output.logits, dim=-1)
        loss = cross_entropy(output.logits, batch["label"].to(device), reduction="none")
    # Place outputs on CPU for compatibility with other dataset columns
    return {"loss": loss.cpu().numpy(), "predictedlabel": pred_label.cpu().numpy()}



In [None]:
# Expose the model inputs and the labels as PyTorch tensors
dataset_encoded.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "label"],
)
# Add the "loss" and "predictedlabel" columns to the validation split
dataset_encoded["validation"] = dataset_encoded["validation"].map(
    forward_pass_with_label,
    batched=True,
    batch_size=16,
)


In [None]:
def label_int2str(row):
    """Translate an integer class id into its name via the train split's "label" feature."""
    label_feature = dataset_encoded["train"].features["label"]
    return label_feature.int2str(row)


In [None]:
# From here on the dataset returns pandas objects
dataset_encoded.set_format("pandas")
cols = ["sms", "label", "predictedlabel", "loss"]
# Full validation split as a DataFrame, reduced to the columns of interest
dftest = dataset_encoded["validation"][:][cols]
# Replace the integer class ids by their names in both label columns
dftest["label"] = dftest["label"].apply(label_int2str)
dftest["predictedlabel"] = dftest["predictedlabel"].apply(label_int2str)


In [None]:
# Misclassified validation examples, sorted by ascending loss
dftest[dftest["predictedlabel"]!=dftest["label"]].sort_values("loss", ascending=True)

### Schlechteste Vorhersagen suchen und auf eventuelle Fehler im Datensatz prüfen

In [None]:
dftest.sort_values("loss", ascending=False).head(10)

### Beste Vorhersagen prüfen, z. B. auf Shortcuts (zu einfache Signale im Text)

In [None]:
dftest.sort_values("loss", ascending=True).head(10)


In [None]:
# The ten misclassified validation examples with the lowest loss
dftest[dftest["predictedlabel"]!=dftest["label"]].sort_values("loss", ascending=True).head(10)

## Auf Hugging Face publizieren

In [None]:
# Upload the trainer's model to the Hugging Face Hub with the given commit message
trainer.push_to_hub(commit_message="Training completed!")


## Pipeline mit dem eigenen Modell einrichten

In [None]:
from transformers import pipeline
# Hub repository id of the fine-tuned model (hard-coded "joiner75" namespace)
modelid = "joiner75/" + model_name
classifier = pipeline(task="text-classification", model=modelid)


In [None]:
# Sample input to classify
# (alternative: "Hello Bernd, would you like to meet next sunday?").
text = "Find all that you need at https://www.goods.com. good prices, big wins"

# return_all_scores=True asks for a score per class instead of the top one.
# NOTE(review): newer transformers releases deprecate return_all_scores in
# favour of top_k -- confirm the output shape before switching.
preds = classifier(text, return_all_scores=True)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
# One row per class for the first entry of the pipeline output
predsdf = pd.DataFrame(preds[0])
# Bar heights are the class scores expressed as percentages
plt.bar(labels, predsdf["score"] * 100, color="C0")
plt.title(f'"{text}"')
plt.ylabel("Class probability (%)")
plt.show()


In [None]:
predsdf