In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import logging

In [14]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [28]:
import os
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0"
device = torch.device("cpu")
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [16]:
# Uyarıları kapatma
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

# Model 
MODEL_NAME = "distilbert-base-uncased"

# Tokenizer yükleyeme
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Amazon veri setini yükleyeme
dataset = load_dataset("amazon_polarity")

In [18]:
# Veri çerçevesi oluşturma ve ön işleme
df = pd.DataFrame(dataset["train"])
df = df.sample(20000, random_state=42)  # Küçük bir alt küme alıyoruz
df.columns = ["label", "title", "review"]
df["text"] = df["title"] + " " + df["review"]
df = df[["text", "label"]]

# Eğitim ve test setine ayırma
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

In [20]:
# Tokenizasyon
train_encodings = tokenizer(list(train_texts), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(val_texts), truncation=True, padding=True, max_length=128)

# Dataset oluşturma
train_dataset = Dataset.from_dict({
    "input_ids": train_encodings["input_ids"],
    "attention_mask": train_encodings["attention_mask"],
    "labels": list(train_labels)
})
val_dataset = Dataset.from_dict({
    "input_ids": val_encodings["input_ids"],
    "attention_mask": val_encodings["attention_mask"],
    "labels": list(val_labels)
})


In [22]:
# Modeli yükleyelim
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)


In [24]:
# Yeni sınıflandırma katmanının olduğunu belirtiyoruz
model.config.problem_type = "single_label_classification"

# Eğitim parametreleri
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
)

# Trainer nesnesi
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


In [26]:
# Modeli eğitme
trainer.train()

# Modeli değerlendirme
predictions = trainer.predict(val_dataset)
preds = np.argmax(predictions.predictions, axis=-1)
print("Doğruluk Oranı:", accuracy_score(val_labels, preds))
print(classification_report(val_labels, preds))

# Kullanıcı yorumu tahmini
def predict_review(review_text):
    inputs = tokenizer(review_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()
    return "Olumlu" if prediction == 1 else "Olumsuz"

# Örnek kullanım
yorum = "Bu ürün harika! Çok memnun kaldım."
print("Tahmin:", predict_review(yorum))


Epoch,Training Loss,Validation Loss
1,0.21,0.218149
2,0.1261,0.266668
3,0.0676,0.301247


Doğruluk Oranı: 0.9315
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1974
           1       0.93      0.93      0.93      2026

    accuracy                           0.93      4000
   macro avg       0.93      0.93      0.93      4000
weighted avg       0.93      0.93      0.93      4000



RuntimeError: Placeholder storage has not been allocated on MPS device!