In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [23]:
import pandas
import datasets

import transformers

In [11]:
import numpy as np
import evaluate
from datasets import load_dataset
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainerCallback,
)
from copy import deepcopy
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, log_loss
from torch.nn import CrossEntropyLoss
import torch

In [24]:
DATA_FILE: str = "../data/processed/DefaktS_Twitter.binary.csv"
TEST_FRAC: float = 0.05

MODEL_SLUG: str = "bert-base-german-cased"

OUT_DIR: str = "./fine_tuning_ouput/"

In [25]:
DATA: pandas.DataFrame = (
    pandas.read_csv(DATA_FILE, index_col=[0])
    .replace(dict(binary_label={0.0: "neutral_post", 1.0: "possible_fake_news"}))
    .rename(columns={"binary_label": "label"})

    # remove urls
    .pipe(lambda _df: _df.assign(text=(
        _df["text"].str
        # replace urls with special token
        .replace(r"https?://\S+|www\.\S+", "[URL]")
    )))

    # downsample to smallest category
    .pipe(lambda _df: (
        _df
        .groupby("label")
        .sample(n=min(_df["label"].value_counts()))
    ))
)
DATA.head()

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
408378,Wenn chinesische Elektro-Fahrzeuge auf den deu...,neutral_post
389814,Fahrplan bis 2030: #Bundeskanzler will Windkra...,neutral_post
378444,#Präsident Joe #Biden stelle die Position der ...,neutral_post
390255,#NichtGenesen: Neue Wege für die #MECFS-Forsch...,neutral_post
409315,BREAKING: Erdogan nennt dieses Erdbeben die gr...,neutral_post


In [26]:
DATA_TRAIN = DATA.sample(frac=1.0 - TEST_FRAC)
DATA_TEST = DATA.loc[DATA.index.difference(DATA_TRAIN.index)]

DATASET_TRAIN = datasets.Dataset.from_pandas(DATA_TRAIN, split="train")
DATASET_TEST = datasets.Dataset.from_pandas(DATA_TEST, split="test")

len(DATASET_TRAIN), len(DATASET_TEST)

(15628, 822)

In [27]:
TOKENIZER = transformers.AutoTokenizer.from_pretrained(MODEL_SLUG)
MODEL = transformers.AutoModelForSequenceClassification.from_pretrained(MODEL_SLUG, num_labels=DATA_TRAIN.label.nunique())

tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/255k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/485k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(sample):
    return tokenizer(sample["text"], padding="max_length", truncation=True, max_length=512)

In [None]:
train_tokenized_dataset = DATASET_TRAIN.map(tokenize_function, batched=True)
test_tokenized_dataset = DATASET_TEST.map(tokenize_function, batched=True)

In [3]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='macro')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }


# Hugging Face Trainer
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    evaluation_strategy="epoch", 
    report_to="none",
    num_train_epochs=10,
    #warmup_steps=500,
    #weight_decay=0.01,
    overwrite_output_dir=True,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    compute_metrics=compute_metrics,
)



Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /media/data/models/bert-base-german-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


Epoch,Training Loss,Validation Loss,Accuracy@de,F1@de,Precision@de,Recall@de,Loss@de
1,No log,0.544142,0.74,0.733515,0.73169,0.739548,0.544085
2,No log,0.496931,0.784,0.778753,0.776364,0.785972,0.496048
3,No log,0.558303,0.769,0.758022,0.758619,0.757467,0.556984
4,No log,0.914272,0.779,0.750305,0.795366,0.740526,0.913352
5,No log,1.009434,0.788,0.77594,0.779707,0.773196,1.00654
6,No log,1.230274,0.773,0.751039,0.773625,0.743386,1.229162
7,No log,1.198938,0.779,0.767867,0.769327,0.766615,1.195317
8,No log,1.220808,0.785,0.772433,0.776749,0.769408,1.217518
9,No log,1.232568,0.788,0.772118,0.783779,0.766239,1.230055


    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argumen

ValueError: You are trying to save a non contiguous tensor: `bert.encoder.layer.0.attention.self.query.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.

In [None]:
trainer.train()