In [1]:
!pip install evaluate



In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
import datasets
from datasets import load_dataset
import numpy as np
import evaluate

2025-09-29 13:18:03.576881: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759151883.599187     376 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759151883.605905     376 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
model_name = 'dbmdz/bert-base-turkish-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
ds = load_dataset("winvoker/turkish-sentiment-analysis-dataset")

In [5]:
train = ds['train']
test = ds['test']

train = train.remove_columns('dataset')
test = test.remove_columns('dataset')

In [6]:
def tokenize(batch):
    label_map = {'Notr': 0, 'Negative': 1, 'Positive': 2}
    batch['label'] = [label_map[label] for label in batch['label']]
    return tokenizer(batch['text'], truncation=True, padding="max_length",
        max_length=128)

tokenized_train = train.map(tokenize, batched=True)
tokenized_test = test.map(tokenize, batched=True)

Map:   0%|          | 0/48965 [00:00<?, ? examples/s]

In [7]:
tokenized_train.set_format("torch", columns=["input_ids","attention_mask","label"])
tokenized_test.set_format("torch", columns=["input_ids","attention_mask","label"])

In [8]:
def compute_metrics(eval_pred):
    load_acc = evaluate.load('accuracy')
    load_f1 = evaluate.load('f1')
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load_acc.compute(predictions=predictions, references=labels)["accuracy"]
    f1 = load_f1.compute(predictions=predictions, references=labels)["f1"]
    return {"accuracy": accuracy, "f1": f1}

In [11]:
training_args = TrainingArguments(
    output_dir="tr_sent_analysis",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    report_to="none", 
    disable_tqdm=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [12]:
trainer.train()



Step,Training Loss
100,0.0947
200,0.0792
300,0.0613
400,0.0564
500,0.044
600,0.0589
700,0.0437
800,0.0506
900,0.0405
1000,0.0454




KeyboardInterrupt: 

In [13]:
!zip /kaggle/working/tr_sent_analysis


zip error: Nothing to do! (/kaggle/working/tr_sent_analysis.zip)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [16]:
from transformers import BertForSequenceClassification, BertTokenizer

checkpoint_path = "/kaggle/working/tr_sent_analysis/checkpoint-13772"  # son kaydettiğin checkpoint

# Tokenizer yükle
tokenizer = BertTokenizer.from_pretrained(checkpoint_path)

# Modeli yükle
model = BertForSequenceClassification.from_pretrained(checkpoint_path)


In [46]:
text = "Bu ürün çok hoştu ama ufak tefek sıkıntıları var"
tokenized_text = tokenizer(text, return_tensors='pt', truncation=True, max_length=128, padding="max_length")
tokenized_text

{'input_ids': tensor([[    2,  2123,  2782,  2140,  4008,  2598,  2262,  6770, 27681, 22646,
          2166,     3,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [47]:
type(tokenized_text)

transformers.tokenization_utils_base.BatchEncoding

In [48]:
model.eval()
with torch.no_grad():
    res = model(**tokenized_text)  # res bir SequenceClassifierOutput

logits = res.logits  # logits tensoru al
predicted_class_id = torch.argmax(logits, dim=-1).item()  # tensor üzerinde argmax

id2label = {0: "Notr", 1: "Negative", 2: "Positive"}
print("Tahmin:", id2label[predicted_class_id])


Tahmin: Positive
