In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import f1_score

Load the dataset

In [2]:
# Read the processed training split. The path is relative, so it is resolved
# against the kernel's current working directory.
train_csv_path = "../data/processed/classification_train.csv"
df = pd.read_csv(train_csv_path)

# Preview the first rows (rendered because it is the cell's last expression).
df.head()

Unnamed: 0,text,label
0,A great article on what's taking place in Boli...,Yes
1,Chris Lehto interviews Ashton Forbes about his...,Can't tell
2,Germany has upset other EU member states by se...,No
3,"Redditors are, just like most social media use...",Can't tell
4,u/DLWzll shared a couple days ago how the Virg...,Can't tell


Encode labels

In [3]:
# Fixed label vocabulary: string class names <-> integer ids.
label2id = {"No": 0, "Yes": 1, "Can't tell": 2}
id2label = {v: k for k, v in label2id.items()}

# Encode only while the column still holds the raw string labels. Without this
# guard, re-running the cell would push the already-encoded ints through
# `label2id` and silently turn every label into NaN.
already_encoded = df["label"].isin(list(id2label)).all()
if not already_encoded:
    # Fail loudly on anything outside the vocabulary (including missing
    # values) instead of letting `.map` convert it to NaN.
    unexpected = set(df["label"].unique()) - set(label2id)
    if unexpected:
        raise ValueError(
            f"Unexpected label values: {sorted(map(str, unexpected))}"
        )
    df["label"] = df["label"].map(label2id)


In [4]:
# Wrap the encoded DataFrame in a `datasets.Dataset` for the tokenization step.
dataset = Dataset.from_pandas(df=df)


Tokenizer Setup

In [5]:
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

MAX_LENGTH = 256  # tokens per example after truncation/padding
SPLIT_SEED = 42   # fixes the train/test partition across kernel restarts


def preprocess(batch):
    """Tokenize one batch of examples.

    Args:
        batch: Mapping whose "text" entry holds the raw strings of the batch
            (this function is applied with `Dataset.map(..., batched=True)`).

    Returns:
        The tokenizer output for the batch, with every example truncated and
        padded to `MAX_LENGTH` tokens.
    """
    return tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=MAX_LENGTH,
    )


# NOTE: `dataset` is rebound below from a single Dataset to its train/test
# split, so rebuild it from the DataFrame before re-running this cell.
dataset = dataset.map(preprocess, batched=True)

# A fixed seed keeps the held-out 10% identical between runs, so evaluation
# scores from different runs are comparable.
dataset = dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)




Map:   0%|          | 0/4316 [00:00<?, ? examples/s]

Define Model + Training Arguments

In [6]:
# Classification settings: three classes plus the id <-> name mappings.
# The classifier weights are newly initialised (see the warning printed below
# this cell), so the model has to be fine-tuned before it is used for inference.
classifier_config = {
    "num_labels": 3,
    "id2label": id2label,
    "label2id": label2id,
}
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, **classifier_config
)

# Training configuration: evaluate and checkpoint once per epoch, then reload
# the best checkpoint when training finishes.
args = TrainingArguments(
    output_dir="../models/classification",
    # `evaluation_strategy` is deprecated from transformers 4.41 onwards (this
    # notebook runs 4.44.2); `eval_strategy` is its replacement.
    eval_strategy="epoch",
    save_strategy="epoch",  # must match the eval strategy for load_best_model_at_end
    logging_steps=10,
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    load_best_model_at_end=True,
    # Pick the "best" checkpoint by the metric this notebook reports
    # (the "macro_f1" key returned by `compute_metrics`) rather than by the
    # default evaluation loss.
    metric_for_best_model="macro_f1",
    greater_is_better=True,
)



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Environment check: report the transformers release installed in this kernel.
import transformers

transformers_version = transformers.__version__
print(transformers_version)

4.44.2


In [8]:
# Environment check: report which Python interpreter this kernel is running.
import sys

kernel_python = sys.executable
print(kernel_python)


c:\Users\Admin\anaconda3\envs\psycomark\python.exe


In [10]:
from sklearn.metrics import f1_score
import numpy as np

def compute_metrics(eval_pred):
    """Score an evaluation pass with macro-averaged F1.

    Args:
        eval_pred: A `(logits, labels)` pair. The predicted class of each
            example is the argmax of `logits` over the last axis.

    Returns:
        dict: `{"macro_f1": <score>}`.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return {"macro_f1": f1_score(gold, predicted, average="macro")}


In [11]:
# Fine-tune on the "train" split and evaluate on the held-out "test" split.
train_split = dataset["train"]
eval_split = dataset["test"]

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Last expression of the cell, so the value returned by `train()` is displayed.
trainer.train()


  0%|          | 0/972 [00:00<?, ?it/s]



: 