In [1]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


In [2]:
import json
import torch
import numpy as np

from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Kullanılan cihaz:", device)


Kullanılan cihaz: cuda


In [4]:
with open("data.json", "r", encoding="utf-8") as f:
    data = json.load(f)

print("Toplam örnek sayısı:", len(data))


Toplam örnek sayısı: 100


In [5]:
labels = sorted(list(set(item["davaTuru"].split(",")[0].strip() for item in data)))

label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}

print("Label mapping:")
print(label2id)


Label mapping:
{'Borç': 0, 'Ceza': 1, 'Ceza (Güvenilirlik: Ceza: %72)': 2, 'Icra iflas': 3, 'Ihale (Güvenilirlik: Ihale: %22)': 4, 'Iş': 5, 'Mahkeme uyuşmazlık': 6, 'Miras': 7, 'Mülkiyet': 8, 'Trafik': 9, 'Uyuşturucu': 10}


In [6]:
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42
)

print("Train:", len(train_data))
print("Test :", len(test_data))


Train: 80
Test : 20


In [7]:
tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-turkish-cased")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
class CaseDataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        text = item["content"]
        label_text = item["davaTuru"].split(",")[0].strip()
        label = label2id[label_text]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=512,
            return_tensors="pt"
        )

        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }


In [9]:
train_dataset = CaseDataset(train_data, tokenizer)
test_dataset  = CaseDataset(test_data, tokenizer)


In [10]:
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-cased",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }


In [12]:
from transformers import TrainingArguments


In [14]:
training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    report_to="none"
)


In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)


  trainer = Trainer(


In [16]:
trainer.train()


Step,Training Loss
10,2.4451
20,2.429
30,2.0732
40,1.7866
50,1.4389
60,1.8815
70,1.6771
80,1.4618
90,1.4152
100,1.5857


TrainOutput(global_step=120, training_loss=1.7344920794169107, metrics={'train_runtime': 18.5108, 'train_samples_per_second': 12.965, 'train_steps_per_second': 6.483, 'total_flos': 63151756001280.0, 'train_loss': 1.7344920794169107, 'epoch': 3.0})

In [17]:
metrics = trainer.evaluate()
print("Test Metrikleri:")
for k, v in metrics.items():
    print(f"{k}: {v:.4f}")


Test Metrikleri:
eval_loss: 1.5059
eval_accuracy: 0.6000
eval_precision: 0.6167
eval_recall: 0.6000
eval_f1: 0.6022
eval_runtime: 0.5385
eval_samples_per_second: 37.1410
eval_steps_per_second: 18.5700
epoch: 3.0000


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [18]:
model.eval()

for i in range(min(3, len(test_data))):
    item = test_data[i]
    text = item["content"]

    inputs = tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        pred_id = torch.argmax(outputs.logits, dim=1).item()

    print("="*60)
    print("METİN:", text[:300], "...")
    print("GERÇEK:", item["davaTuru"])
    print("TAHMİN:", id2label[pred_id])


METİN: 4. Ceza Dairesi 2020/27481 E. , 2023/49 K. Asliye Ceza Mahkemesi SUÇLAR : Hakaret, tehdit Sanık hakkında kurulan hükümlerin; karar tarihi itibarıyla 6723 sayılı Kanun'un 33 üncü maddesiyle değişik 5320 sayılı Kanun'un 8 inci maddesi gereği yürürlükte bulunan 1412 sayılı Ceza Muhakemeleri Usulü Kanun ...
GERÇEK: Ceza, Ceza (Güvenilirlik: Ceza: %100 | Ceza: %100)
TAHMİN: Ceza
METİN: 5. Hukuk Dairesi 2022/14205 E. , 2023/1 K. Aile Mahkemesi I. YARGI YERİ BELİRLENMESİNE KONU KARARLAR A. Bursa 1. İcra Ceza Mahkemesinin 14. 06. 2022 Tarihli ve 2022/76 Esas, 2022/226 Karar Sayılı Kararı Çocuk teslimi emrine mualefet etme suçundan cezalandırılması istemli şikâyeti inceleme görev ve y ...
GERÇEK: Ceza, Borç (Güvenilirlik: Ceza: %95 | Borç: %68)
TAHMİN: Mahkeme uyuşmazlık
METİN: 5. Hukuk Dairesi 2022/14161 E. , 2023/3 K. Asliye Hukuk Mahkemesi (Tüketici Mahkemesi Sıfatıyla) I. YARGI YERİ BELİRLENMESİNE KONU KARARLAR A. İstanbul Anadolu 3. Tüketici Mahkemesinin 13. 11. 2018 Tarihli ve 2017

In [19]:
def count_tokens(dataset):
    total = 0
    for item in dataset:
        tokens = tokenizer.encode(
            item["content"],
            truncation=True,
            max_length=512
        )
        total += len(tokens)
    return total

train_tokens = count_tokens(train_data)
test_tokens  = count_tokens(test_data)

print("Toplam Train Token:", train_tokens)
print("Toplam Test  Token:", test_tokens)
print("GENEL TOPLAM:", train_tokens + test_tokens)


Toplam Train Token: 30595
Toplam Test  Token: 7680
GENEL TOPLAM: 38275
