In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

# Wczytanie danych
train_df = pd.read_csv("data/test_data.csv")
test_df = pd.read_csv("data/test_data.csv")

# Próbkowanie 5% danych
train_small = train_df.sample(frac=0.4, random_state=42)
test_small = test_df.sample(frac=0.4, random_state=42)

# Reset indeksów
train_small.reset_index(drop=True, inplace=True)
test_small.reset_index(drop=True, inplace=True)

# Przekształcenie do HuggingFace Dataset
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Tokenizacja
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def tokenize(batch):
    return tokenizer(
        [str(t) for t in batch["text"]],
        padding="max_length",
        truncation=True,
        max_length=128
    )

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

train_dataset = train_dataset.remove_columns(["text"])
test_dataset = test_dataset.remove_columns(["text"])


train_dataset = train_dataset.rename_column("sentiment", "label")
test_dataset = test_dataset.rename_column("sentiment", "label")

train_dataset.set_format("torch")
test_dataset.set_format("torch")


Map: 100%|██████████| 320000/320000 [00:15<00:00, 20847.83 examples/s]
Map: 100%|██████████| 320000/320000 [00:15<00:00, 21005.30 examples/s]


In [3]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np


2025-05-20 13:34:36.694464: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-20 13:34:36.704819: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1747740876.717067   31682 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1747740876.721048   31682 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1747740876.731722   31682 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [4]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = np.argmax(pred.predictions, axis=1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "precision": precision_score(labels, preds),
        "recall": recall_score(labels, preds),
        "f1": f1_score(labels, preds),
    }


In [6]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    do_train=True,
    do_eval=True,
    eval_strategy="epoch",           # zamiast evaluation_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)


In [7]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [8]:
# train_small = train_df.sample(frac=0.4, random_state=42)  # 5% danych
# test_small = test_df.sample(frac=0.4, random_state=42)

# train_dataset = Dataset.from_pandas(train_small.reset_index(drop=True))
# test_dataset = Dataset.from_pandas(test_small.reset_index(drop=True))


In [9]:
results = trainer.evaluate()
print(results)


{'eval_loss': 0.7065777778625488, 'eval_model_preparation_time': 0.0023, 'eval_accuracy': 0.499878125, 'eval_precision': 0.4999389818416219, 'eval_recall': 0.99855625, 'eval_f1': 0.6662913359064342, 'eval_runtime': 1067.292, 'eval_samples_per_second': 299.824, 'eval_steps_per_second': 18.739}


In [11]:
print(train_small['sentiment'].value_counts(normalize=True))


sentiment
1    0.500578
0    0.499422
Name: proportion, dtype: float64


In [12]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [13]:
model.save_pretrained("ogorzaly_saved_model")
tokenizer.save_pretrained("ogorzaly_saved_model")

('ogorzaly_saved_model/tokenizer_config.json',
 'ogorzaly_saved_model/special_tokens_map.json',
 'ogorzaly_saved_model/vocab.txt',
 'ogorzaly_saved_model/added_tokens.json',
 'ogorzaly_saved_model/tokenizer.json')