In [1]:
import os

# Set in the very first cell, i.e. before the ML libraries below are imported.
# NOTE(review): presumably restricts this kernel to GPU index 2 — the index is
# hard-coded and machine-specific; confirm it before running elsewhere.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"

In [2]:
import pandas
import datasets

import transformers

from sklearn.metrics import precision_recall_fscore_support, accuracy_score

In [3]:
# Path of the pre-processed CSV that is read into the DATA frame.
DATA_FILE: str = "../data/processed/DefaktS_Twitter.binary.csv"
# Fraction of rows that is kept out of the training split and used for evaluation.
TEST_FRAC: float = 0.10

# Checkpoint identifier passed to both `from_pretrained` loaders (tokenizer and model).
MODEL_SLUG: str = "Twitter/twhin-bert-base"

# Directory handed to TrainingArguments as `output_dir`.
# NOTE(review): "ouput" looks like a typo for "output"; left untouched here because
# renaming it would change where the training run writes its files.
OUT_DIR: str = "./fine_tuning_ouput/"

In [4]:
# Fixed seed so that the class-balancing subsample is identical on every run
# (Restart Kernel -> Run All then reproduces the same dataset).
DOWNSAMPLE_SEED: int = 42

DATA: pandas.DataFrame = (
    # the first CSV column becomes the row index
    pandas.read_csv(DATA_FILE, index_col=[0])
    .rename(columns={"binary_label": "label"})

    # normalise the text and label columns
    .pipe(lambda _df: _df.assign(
        text=(
            _df["text"]
            # replace t.co short links with a placeholder token; the dot is
            # escaped so that it only matches a literal "." (not any character)
            # NOTE(review): "[URL]" is not registered as a special token where
            # the tokenizer is loaded, so it is tokenized like ordinary text.
            .str.replace(r"https://t\.co/\S+", "[URL]", regex=True)
        ),
        label=(
            _df["label"].astype(int)
        )
    ))

    # downsample every class to the size of the smallest class
    .pipe(lambda _df: (
        _df
        .groupby("label")
        .sample(
            n=_df["label"].value_counts().min(),
            random_state=DOWNSAMPLE_SEED,
        )
    ))
)
DATA.head()

Unnamed: 0_level_0,text,label
id,Unnamed: 1_level_1,Unnamed: 2_level_1
428142,"Die Menschen in Belutschistan hören nicht auf,...",0
387854,Im #Iran geht das Regime nicht nur in #Kurdist...,0
407119,US-Jury spricht Elon #Musk im Betrugsprozess u...,0
392035,Führende Fachpolitiker von Grünen und SPD im B...,0
407800,Hyundai Ioniq 6 Electrified Streamliner\nab 29...,0


In [5]:
# Fixed seed so that the train/test partition is identical on every run.
SPLIT_SEED: int = 42

# Draw the training rows at random; everything that was not drawn is held out.
DATA_TRAIN = DATA.sample(frac=1.0 - TEST_FRAC, random_state=SPLIT_SEED)
DATA_TEST = DATA.loc[DATA.index.difference(DATA_TRAIN.index)]

# The index-difference hold-out only forms a clean partition when the row ids
# are unique; fail loudly instead of silently losing rows.
assert len(DATA_TRAIN) + len(DATA_TEST) == len(DATA), (
    "train/test split does not partition DATA - are the row ids unique?"
)

DATASET_TRAIN = datasets.Dataset.from_pandas(DATA_TRAIN, split="train")
DATASET_TEST = datasets.Dataset.from_pandas(DATA_TEST, split="test")

# split sizes and number of distinct labels seen in training
len(DATASET_TRAIN), len(DATASET_TEST), DATA_TRAIN.label.nunique()

(14805, 1645, 2)

In [7]:
# Tokenizer and classification model are both loaded from the same checkpoint.
TOKENIZER = transformers.AutoTokenizer.from_pretrained(MODEL_SLUG)

# The size of the classification head follows the number of distinct training labels.
MODEL = transformers.AutoModelForSequenceClassification.from_pretrained(
    MODEL_SLUG,
    num_labels=DATA_TRAIN["label"].nunique(),
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at Twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def tokenize_function(sample):
    """Run the module-level ``TOKENIZER`` over the ``"text"`` entry of ``sample``.

    The tokenizer is asked for ``"max_length"`` padding and truncation with
    ``max_length=512``; its return value is passed through unchanged.
    """
    tokenizer_options = dict(padding="max_length", truncation=True, max_length=512)
    return TOKENIZER(sample["text"], **tokenizer_options)

In [9]:
# Tokenize both splits in batched mode, training split first.
train_tokenized_dataset, test_tokenized_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (DATASET_TRAIN, DATASET_TEST)
)

Map:   0%|          | 0/14805 [00:00<?, ? examples/s]

Map:   0%|          | 0/1645 [00:00<?, ? examples/s]

In [10]:
def compute_metrics(pred):
    """Score one evaluation pass; handed to the Trainer as ``compute_metrics``.

    ``pred`` must expose ``label_ids`` (reference labels) and ``predictions``
    (per-class scores). The predicted class is the argmax over the last axis.

    Returns a dict with ``accuracy`` plus macro-averaged ``f1``, ``precision``
    and ``recall``; undefined precision/recall values are reported as 0.0.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # the fourth element (support) is not needed
    macro_precision, macro_recall, macro_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='macro', zero_division=0.0
    )

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=macro_f1,
        precision=macro_precision,
        recall=macro_recall,
    )

# Settings of the fine-tuning run, kept apart from the Trainer wiring below.
training_args = transformers.TrainingArguments(
    # where the run writes its files; an existing directory is overwritten and
    # at most one checkpoint is kept
    output_dir=OUT_DIR,
    overwrite_output_dir=True,
    save_total_limit=1,
    # schedule and batch sizes
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # log the first step, then every 50 steps; evaluation is step-based as well
    logging_first_step=True,
    logging_steps=50,
    eval_strategy="steps",
)

# The held-out split doubles as the evaluation set during training.
trainer = transformers.Trainer(
    model=MODEL,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=test_tokenized_dataset,
    compute_metrics=compute_metrics,
)

In [11]:
# Run the fine-tuning with the `trainer` built in the previous cell; the returned
# TrainOutput is the last expression and is therefore displayed as the cell result.
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
50,0.5941,0.484997,0.758055,0.754955,0.775115,0.759614
100,0.486,0.445908,0.794529,0.792783,0.807643,0.79583
150,0.4373,0.42267,0.795137,0.79351,0.802289,0.794133
200,0.4168,0.404027,0.812158,0.811516,0.814897,0.811545
250,0.4177,0.381721,0.824316,0.824127,0.824887,0.824031
300,0.3913,0.396565,0.829179,0.829057,0.831056,0.829656
350,0.4025,0.358078,0.829179,0.82913,0.829193,0.829097
400,0.3869,0.368451,0.832219,0.832115,0.832439,0.832036
450,0.3864,0.346382,0.844377,0.844148,0.847724,0.845004
500,0.332,0.378373,0.819453,0.817772,0.834812,0.820813


TrainOutput(global_step=1389, training_loss=0.31007371783857296, metrics={'train_runtime': 1110.9229, 'train_samples_per_second': 39.98, 'train_steps_per_second': 1.25, 'total_flos': 1.16860775238144e+16, 'train_loss': 0.31007371783857296, 'epoch': 3.0})