In [4]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Load the raw train/test splits from CSV.
train_df = pd.read_csv("train_data.csv")
test_df = pd.read_csv("test_data.csv")

# Work on a reproducible 10% subsample of each split (fixed random_state).
train_small = train_df.sample(frac=0.1, random_state=42)
test_small = test_df.sample(frac=0.1, random_state=42)


# Convert to HuggingFace Dataset objects.
# `.sample()` leaves the frames with a shuffled, non-default index, which
# `from_pandas` would otherwise store as an extra `__index_level_0__` column.
# `preserve_index=False` keeps that bookkeeping column out of the datasets.
train_dataset = Dataset.from_pandas(train_small, preserve_index=False)
test_dataset = Dataset.from_pandas(test_small, preserve_index=False)

# Tokenizer for the DistilBERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


In [5]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# Load the DistilBERT tokenizer and a two-label sequence-classification model
# from the same pretrained checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Tokenize the "text" column of a batch.
def tokenize(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook's tokenizer.

    Every entry is converted with ``str()`` before tokenization. Sequences are
    truncated and padded to a fixed length of 128 tokens.

    Args:
        batch: Mapping with a ``"text"`` key holding an iterable of entries.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    texts = list(map(str, batch["text"]))
    encoded = tokenizer(
        texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoded

# For each split: tokenize in batches, drop the raw "text" column, and expose
# the "sentiment" column under the name "label".
train_dataset = (
    train_dataset
    .map(tokenize, batched=True)
    .remove_columns(["text"])
    .rename_column("sentiment", "label")
)
test_dataset = (
    test_dataset
    .map(tokenize, batched=True)
    .remove_columns(["text"])
    .rename_column("sentiment", "label")
)

# Switch both datasets to the torch (tensor) output format.
train_dataset.set_format("torch")
test_dataset.set_format("torch")


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/128000 [00:00<?, ? examples/s]

Map:   0%|          | 0/32000 [00:00<?, ? examples/s]

In [6]:
# Training hyperparameters.
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    # Schedule and regularisation.
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Logging / checkpoint cadence (in steps) and number of checkpoints kept.
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
)

In [7]:
import torch
from transformers import get_scheduler

import math

# Build the optimizer.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

# A learning-rate scheduler is stepped once per optimizer update, not once per
# training example, so the schedule length must be counted in updates:
# batches per epoch (dataset size / batch size, rounded up), divided by the
# gradient-accumulation factor, times the number of epochs. Using
# len(train_dataset) directly would make the schedule longer by a factor of
# the batch size, so the learning rate would barely decay during training.
batches_per_epoch = math.ceil(len(train_dataset) / training_args.train_batch_size)
updates_per_epoch = max(
    batches_per_epoch // training_args.gradient_accumulation_steps, 1
)
num_training_steps = math.ceil(updates_per_epoch * training_args.num_train_epochs)

# Linear learning-rate schedule without warmup ('cosine' is another option).
# NOTE(review): the Trainer construction visible in this notebook does not
# pass `optimizers=(optimizer, scheduler)`; unless that happens elsewhere, the
# Trainer builds its own optimizer/scheduler from `training_args` instead.
scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [8]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

# Metric computation for evaluation.
def compute_metrics(pred):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the arg-max over axis 1 is
            taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1"``, each holding the corresponding sklearn score.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {name: score(y_true, y_pred) for name, score in scorers.items()}

In [9]:
# Assemble the Trainer from the model, hyperparameters, data and metrics.
# NOTE(review): the `optimizer` and `scheduler` objects created in an earlier
# cell are not passed here, so they are not used by this Trainer as written.
trainer = Trainer(
    args=training_args,                  # hyperparameters
    model=model,                         # model to fine-tune
    compute_metrics=compute_metrics,     # evaluation metrics
    train_dataset=train_dataset,         # training split
    eval_dataset=test_dataset,           # evaluation split
)

In [10]:
# Train the model. The output recorded below this cell ends in a
# KeyboardInterrupt after about 100 logged steps, i.e. that run did not finish.
trainer.train()



Step,Training Loss
10,0.7017
20,0.6858
30,0.6898
40,0.6849
50,0.697
60,0.6942
70,0.6899
80,0.6918
90,0.6827
100,0.6904


KeyboardInterrupt: 