In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm import tqdm

# Load the review dataset (semicolon-separated CSV) and pick out the
# text column as features and the status column as the target
df = pd.read_csv("e-ticaret_urun_yorumlari.csv", sep=';')
X, y = df['Metin'], df['Durum']

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained tokenizer that ships with the
# "dbmdz/bert-base-turkish-cased" checkpoint. The same checkpoint name is used
# when the classification model is loaded further down, so the vocabulary and
# the model weights stay in sync. `tokenizer` is read as a notebook-level
# global by tokenize_data().
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")

tokenizer_config.json: 100%|██████████| 60.0/60.0 [00:00<?, ?B/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
vocab.txt: 100%|██████████| 251k/251k [00:00<00:00, 644kB/s]
config.json: 100%|██████████| 385/385 [00:00<00:00, 383kB/s]


In [3]:
# Convert raw texts into the fixed-length tensors BERT expects
def tokenize_data(data, max_length=128):
    """Encode an iterable of texts into fixed-length BERT input tensors.

    Every text is encoded with the notebook-level ``tokenizer``. Sequences
    shorter than ``max_length`` are padded and longer ones are truncated, so
    each row of the result has exactly ``max_length`` positions (special
    tokens included).

    Args:
        data: Iterable of texts to encode (for example a pandas Series of
            review strings). It is iterated once, in order.
        max_length: Target sequence length after padding/truncation.

    Returns:
        A tuple ``(input_ids, attention_masks)``. Both are tensors with one
        row per input text and ``max_length`` columns; for empty input both
        have shape ``(0, max_length)``.
    """
    input_ids = []
    attention_masks = []

    for text in tqdm(data, desc="Tokenization"):
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`,
        # and `truncation=True` makes the truncation explicit instead of relying
        # on the implicit fallback the library warns about.
        encoded_data = tokenizer(
            text,
            add_special_tokens=True,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        # With return_tensors="pt" each entry carries a leading batch axis of
        # size 1, which is what lets torch.cat stack them below.
        input_ids.append(encoded_data["input_ids"])
        attention_masks.append(encoded_data["attention_mask"])

    if not input_ids:
        # torch.cat raises on an empty list; return correctly shaped empty
        # tensors so downstream code can still build a dataset from them.
        return (
            torch.empty((0, max_length), dtype=torch.long),
            torch.empty((0, max_length), dtype=torch.long),
        )

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    return input_ids, attention_masks

In [4]:
# Tokenize the training and test splits.
# Each call returns an (input_ids, attention_masks) pair; max_length is left at
# tokenize_data's default of 128.
X_train_ids, X_train_masks = tokenize_data(X_train)
X_test_ids, X_test_masks = tokenize_data(X_test)

Tokenization:   0%|          | 0/12136 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Tokenization: 100%|██████████| 12136/12136 [00:08<00:00, 1424.00it/s]
Tokenization: 100%|██████████| 3034/3034 [00:01<00:00, 1547.44it/s]


In [5]:
# Convert the label columns into torch tensors.
# np.asarray accepts both the pandas Series produced by the split and an
# already-converted CPU tensor, so this cell can be re-run safely (a `.values`
# access would only work on the first run, before the names are rebound to
# tensors). The dtype is pinned to long so the labels are always integer class
# indices, which is what the sequence-classification loss is expected to need.
y_train = torch.tensor(np.asarray(y_train), dtype=torch.long)
y_test = torch.tensor(np.asarray(y_test), dtype=torch.long)

In [6]:
# Number of examples per mini-batch, shared by both loaders
batch_size = 16

# Bundle token ids, attention masks and labels so they are indexed together
train_data = TensorDataset(X_train_ids, X_train_masks, y_train)
test_data = TensorDataset(X_test_ids, X_test_masks, y_test)

# Training batches are reshuffled every epoch; test batches keep dataset order
train_dataloader = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
)
test_dataloader = DataLoader(
    dataset=test_data,
    batch_size=batch_size,
    shuffle=False,
)


In [10]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load pretrained Turkish BERT with a 3-class classification head and move it
# onto the selected device
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-cased",
    num_labels=3,
).to(device)

# Sanity check: distinct label values present in each split
print("Eğitim Etiketleri:", np.unique(y_train))
print("Test Etiketleri:", np.unique(y_test))

# Sanity check: number of output classes the model is configured with
num_labels = model.config.num_labels
print("Modelin Çıkış Sınıf Sayısı:", num_labels)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Eğitim Etiketleri: [0 1 2]
Test Etiketleri: [0 1 2]
Modelin Çıkış Sınıf Sayısı: 3


In [11]:
# Fine-tuning: AdamW over all model parameters for a fixed number of epochs
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0

    progress = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{epochs}")
    for batch in progress:
        # Each batch is (input_ids, attention_mask, labels); move all three
        # onto the same device as the model
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        optimizer.zero_grad()
        # Passing labels makes the model return the loss alongside the logits
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

    # Mean of the per-batch losses for this epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Average Loss: {avg_loss}")


Epoch 1/3:   4%|▍         | 33/759 [10:08<3:37:00, 17.93s/it]

In [None]:
# Evaluation: collect predicted and true class ids over the whole test set
model.eval()
predictions, true_labels = [], []

# Gradients are not needed for inference
with torch.no_grad():
    for batch in tqdm(test_dataloader, desc="Testing"):
        input_ids, attention_mask, labels = (t.to(device) for t in batch)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )

        # The predicted class is the index of the largest logit in each row
        predicted = outputs.logits.argmax(dim=1)

        predictions.extend(predicted.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())


In [None]:
# Performance evaluation: per-class report first, then overall accuracy
print("Sınıflandırma Raporu:")
print(classification_report(y_true=true_labels, y_pred=predictions))

accuracy = accuracy_score(y_true=true_labels, y_pred=predictions)
print(f"Doğruluk (Accuracy): {accuracy}")