## 0- Fine-tuning BERT for multi-class classification with Turkish language datasets

In [42]:
import pandas as pd 
import kagglehub
import os
import torch
import numpy as np
from transformers import BertTokenizerFast 
from transformers import BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from torch.utils.data import Dataset
from datetime import datetime
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Early stopping callback
from transformers.integrations import TensorBoardCallback
from transformers import EarlyStoppingCallback

# Seed the PyTorch and NumPy global RNGs with one shared value for reproducibility
seed = 42
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(seed)


## 1- Load the dataset

In [16]:
# Fetch the latest version of the "savasy/ttc4900" dataset via kagglehub
dataset_path = kagglehub.dataset_download("savasy/ttc4900")
print("Path to dataset files:", dataset_path)

# Read the corpus CSV into a DataFrame, then preview its first five rows
csv_file = os.path.join(dataset_path, "7allV03.csv")
data = pd.read_csv(csv_file)
print(data.head())

Path to dataset files: /home/saeid/.cache/kagglehub/datasets/savasy/ttc4900/versions/3
   category                                               text
0  siyaset    3 milyon ile ön seçim vaadi mhp nin 10 olağan...
1  siyaset    mesut_yılmaz yüce_divan da ceza alabilirdi pr...
2  siyaset    disko lar kaldırılıyor başbakan_yardımcısı ar...
3  siyaset    sarıgül anayasa_mahkemesi ne gidiyor mustafa_...
4  siyaset    erdoğan idamın bir haklılık sebebi var demek ...


In [20]:
# Category names; their position in this list defines the integer class id.
labels = ["teknoloji", "ekonomi", "saglik", "siyaset", "kultur", "spor", "dunya"]
Num_labels = len(labels)
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}

# The CSV preview shows consecutive rows sharing one category, so the file
# appears to be grouped by class. Slicing it in file order would hand the
# train/validation/test splits different class mixes, so shuffle first.
# `data` itself is left untouched so this cell stays idempotent on re-run.
shuffled = data.sample(frac=1.0, random_state=seed).reset_index(drop=True)

# The model needs integer class ids, not category strings (torch cannot build
# a tensor from str). strip() guards against stray whitespace around names.
label_ids = shuffled["category"].str.strip().map(label2id)
if label_ids.isna().any():
    unknown = sorted(shuffled.loc[label_ids.isna(), "category"].astype(str).unique())
    raise ValueError(f"Categories missing from `labels`: {unknown}")
label_ids = label_ids.astype(int)

# Divide the dataset into training (50%), validation (25%), and test (25%) sets
size_data = shuffled.shape[0]
train_end = int(size_data * 0.5)
val_end = int(size_data * 0.75)

# Text data
train_texts = shuffled["text"].iloc[:train_end].tolist()
val_texts = shuffled["text"].iloc[train_end:val_end].tolist()
test_texts = shuffled["text"].iloc[val_end:].tolist()

# Label data (integer class ids aligned with the texts above)
train_labels = label_ids.iloc[:train_end].tolist()
val_labels = label_ids.iloc[train_end:val_end].tolist()
test_labels = label_ids.iloc[val_end:].tolist()

## 2- Load the model and tokenizer

In [21]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and classifier are both loaded from the same pretrained checkpoint
tokenizer = BertTokenizerFast.from_pretrained("dbmdz/bert-base-turkish-uncased", max_length=512)
model = BertForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
    num_labels=Num_labels,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

# Report the size of every split; texts and labels should match pairwise
print(f"len(train_texts): {len(train_texts)}")
print(f"len(train_labels): {len(train_labels)}")

print(f"len(val_texts): {len(val_texts)}")
print(f"len(val_labels): {len(val_labels)}")

print(f"len(test_texts): {len(test_texts)}")
print(f"len(test_labels): {len(test_labels)}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3- Tokenize datasets

In [35]:
# One shared set of tokenizer options for all three splits:
# truncate to at most 512 tokens, pad, and return PyTorch tensors
tokenize_options = dict(truncation=True, padding=True, max_length=512, return_tensors="pt")

enc_train = tokenizer(train_texts, **tokenize_options)
enc_val = tokenizer(val_texts, **tokenize_options)
enc_test = tokenizer(test_texts, **tokenize_options)

## 4- Create a custom dataset

In [40]:
class MyDataset(Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        encodings: Mapping of feature name -> per-example sequence (tensor or
            list), e.g. the object returned by calling the tokenizer.
        labels: Sequence of numeric class ids, one per example.

    Raises:
        ValueError: If any feature has a different number of examples than
            ``labels``.
        TypeError: If the labels are strings rather than numeric ids.
    """

    def __init__(self, encodings, labels):
        # Fail early and clearly instead of with an IndexError (or silently
        # ignored rows) in the middle of training.
        for key, values in encodings.items():
            if len(values) != len(labels):
                raise ValueError(
                    f"Feature '{key}' has {len(values)} examples but {len(labels)} labels were given"
                )
        # torch cannot build a tensor from str, so reject string labels here
        # rather than on the first item fetch.
        if len(labels) > 0 and isinstance(labels[0], str):
            raise TypeError("labels must be numeric class ids, not strings (map them with label2id first)")
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``."""
        # torch.tensor() on an existing tensor emits a UserWarning for every
        # item; clone().detach() is the recommended way to copy a tensor.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict with every encoding feature plus 'labels' for example ``idx``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)

# Wrap each tokenized split together with its labels
train_dataset, val_dataset, test_dataset = (
    MyDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (enc_train, train_labels),
        (enc_val, val_labels),
        (enc_test, test_labels),
    )
)

## 5- Fine-tuning the model

In [None]:
# Timestamped result and log folders, so separate runs do not overwrite each other
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f'./multi_class_results_{timestamp}'
log_dir = f'./multi_class_logs_{timestamp}'
for run_dir in (output_dir, log_dir):
    os.makedirs(run_dir, exist_ok=True)

# Function to calculate the metrics
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # average='binary' is only valid for two-class targets and raises on this
    # multi-class task. Macro averaging weights every category equally.
    # zero_division=0 scores a class that was never predicted as 0 instead of
    # emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Hyper-parameters and bookkeeping options for the Trainer
training_args = TrainingArguments(
    output_dir=output_dir,  # output directory for model predictions and checkpoints
    num_train_epochs=10,  # upper bound on the number of training epochs
    per_device_train_batch_size=16,  # reduced batch size to prevent CUDA OOM errors
    per_device_eval_batch_size=16,  # batch size for evaluation
    warmup_ratio=0.1,  # ratio of warmup steps - more flexible than fixed steps
    weight_decay=0.01,  # strength of weight decay
    logging_dir=log_dir,  # directory to save logs
    do_eval=True,  # whether to evaluate during training
    do_train=True,  # whether to train the model
    save_strategy='epoch',  # save the model after each epoch
    # NOTE(review): newer transformers releases rename this argument to
    # `eval_strategy` - confirm against the installed version.
    evaluation_strategy='epoch',  # evaluate the model after each epoch
    logging_strategy='steps',  # log steps instead of epochs for more frequent updates
    report_to='tensorboard',  # report logs to TensorBoard
    logging_steps=100,  # how often to log the training loss
    fp16=torch.cuda.is_available(),  # mixed precision only when a GPU is available
    load_best_model_at_end=True,  # load the best model when finished training
    metric_for_best_model='f1',  # use F1 score to determine best model
    greater_is_better=True,  # higher F1 is better
    seed=seed,
    # Keep the last incomplete batch: this flag also applies to the evaluation
    # dataloaders, so dropping it would silently exclude samples from the
    # validation and test metrics.
    dataloader_drop_last=False,
    gradient_accumulation_steps=2,  # accumulate gradients for effective larger batch size
    save_total_limit=1,  # limit the total amount of checkpoints saved
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    # Pass the Dataset wrappers, not the raw tokenizer outputs: only the
    # wrappers attach the 'labels' needed to compute the loss and metrics.
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[
        # Stop when the monitored metric has not improved for 3 evaluations.
        EarlyStoppingCallback(early_stopping_patience=3),
        # No explicit TensorBoardCallback here: report_to='tensorboard' in the
        # training arguments already registers one, and adding a second would
        # duplicate the logging.
    ]
)
# Train the model
print("Starting training...")
results = trainer.train()
print("Training completed!")
print(results)

## 6- Evaluate on test set

In [None]:
# Evaluate on the held-out test set. The Dataset wrapper is passed rather than
# the raw tokenizer output because only the wrapper carries the labels.
print("Evaluating on test set...")
test_results = trainer.evaluate(test_dataset)
print("Test results:", test_results)

## 7- Save the final model


In [None]:
# Save the final model
final_model_dir = f"{output_dir}/final_multi_class_model"
trainer.save_model(final_model_dir)
# Store the tokenizer next to the weights so the directory can be reloaded
# on its own with from_pretrained() for inference.
tokenizer.save_pretrained(final_model_dir)
print(f"Final model saved to {final_model_dir}")

# Example of using the model for inference on a new sentence
def predict_sentiment(text):
    """Predict the category of a single text with the fine-tuned model.

    The name is kept for backward compatibility; the model is a multi-class
    topic classifier, so the returned label is a category name rather than a
    positive/negative sentiment.

    Args:
        text: A single input string.

    Returns:
        Tuple ``(category, confidence)``: the predicted category name looked
        up in the model's ``id2label`` mapping, and its softmax probability.
    """
    # Make sure dropout is disabled regardless of what ran before this call.
    model.eval()
    inputs = tokenizer(
        text, return_tensors="pt", truncation=True, padding=True, max_length=512
    ).to(device)
    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    prediction = torch.argmax(probabilities, dim=-1).item()
    confidence = probabilities[0][prediction].item()

    # Map the class id back to its name instead of forcing a binary
    # Positive/Negative answer onto a multi-class model.
    category = model.config.id2label[prediction]
    return category, confidence

## 8- Run the model for inference

In [None]:
# Hand-written example sentences. They get their own name so the held-out
# `test_texts` split is not overwritten when this cell runs.
sample_texts = [
    "Fenerbahçeli futbolcular kısa paslarla hazırlık çalışması yaptılar",
    "Türkiye’de mali istikrarı sağlamak ve yatırımları artırmak için yeni politikalar geliştirilmelidir.",
    "Yapay zeka ve otomasyon, üretim sektöründe verimliliği artırarak maliyetleri düşürüyor.",
    "Küresel ısınma, dünyanın ekosistemlerini ve iklim dengesini tehdit eden en büyük sorunlardan biridir.",
]
for text in sample_texts:
    predicted_label, confidence = predict_sentiment(text)
    print(f"Text: {text}")
    # This is a topic classifier, so the output is labelled as a prediction
    # rather than a sentiment.
    print(f"Prediction: {predicted_label} (confidence: {confidence:.4f})")
    print("-" * 50)