<a href="https://colab.research.google.com/github/saribasmetehan/Transformers/blob/main/Turkish_Text_Classifiaction_Fine_Tuning_PyTorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q datasets

In [2]:
from datasets import load_dataset

In [3]:
dataset = load_dataset("winvoker/turkish-sentiment-analysis-dataset")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'dataset'],
        num_rows: 440679
    })
    test: Dataset({
        features: ['text', 'label', 'dataset'],
        num_rows: 48965
    })
})

In [5]:
dataset["train"][0]

{'text': 'ürünü hepsiburadadan alalı 3 hafta oldu. orjinal ve eksiksiz şekilde geldi. şarj konusunda 1 günü rahat çıkarıyor oyun oynamama rağmen. teslimat sürecide hızlı gerçekleşti. en uygun fiyata iphone kalitesi kaçırmayın..',
 'label': 'Positive',
 'dataset': 'urun_yorumlari'}

In [6]:
label_mapping = {"Positive": 1, "Notr": 0, "Negative": 2}

In [7]:
def encode_labels(example):
    if example['label'] == "Positive":
        example['label'] = 1
    elif example['label'] == "Negative":
        example['label'] = 2
    else:
        example['label'] = 0
    return example

In [8]:
dataset = dataset.map(encode_labels)

In [9]:
print(dataset['train'].features)
print(dataset['train'][0])

{'text': Value(dtype='string', id=None), 'label': Value(dtype='int64', id=None), 'dataset': Value(dtype='string', id=None)}
{'text': 'ürünü hepsiburadadan alalı 3 hafta oldu. orjinal ve eksiksiz şekilde geldi. şarj konusunda 1 günü rahat çıkarıyor oyun oynamama rağmen. teslimat sürecide hızlı gerçekleşti. en uygun fiyata iphone kalitesi kaçırmayın..', 'label': 1, 'dataset': 'urun_yorumlari'}


In [10]:
from transformers import AutoTokenizer

In [11]:
model_name = "dbmdz/bert-base-turkish-cased"

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_name)



In [13]:
tokenizer.vocab_size

32000

In [14]:
tokenizer.model_max_length

512

In [15]:
def tokenize(batch):
  return tokenizer(batch["text"], truncation = True)

In [16]:
tokenize(dataset["train"][:3])

{'input_ids': [[2, 6916, 6077, 5050, 2356, 2030, 2911, 1991, 23, 3087, 2111, 18, 15824, 1992, 13283, 2542, 3381, 18, 10696, 3080, 21, 3195, 3128, 17305, 2672, 10398, 1981, 3617, 18, 15928, 6607, 1988, 3807, 9480, 18, 2127, 2897, 14661, 25285, 9707, 22524, 12937, 18, 18, 3], [2, 23960, 2140, 25673, 16, 5428, 7474, 5668, 5002, 18, 4146, 6077, 5050, 2356, 2010, 7864, 3807, 3065, 1995, 2058, 3898, 5002, 18, 3], [2, 3807, 9250, 16, 3477, 5601, 18, 3898, 5002, 18, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [17]:
dataset_encoded = dataset.map(tokenize, batched = True, batch_size = None)

In [18]:
from transformers import DataCollatorWithPadding

In [19]:
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [20]:
dataset_encoded["train"].column_names

['text', 'label', 'dataset', 'input_ids', 'token_type_ids', 'attention_mask']

In [21]:
from transformers import AutoModelForSequenceClassification

In [22]:
num_labels = 3

In [23]:
!pip install transformers[torch]



In [24]:
!pip install accelerate -U



In [25]:
import torch

In [26]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [27]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = num_labels).to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
!pip install -q evaluate

In [29]:
import evaluate

In [30]:
accuracy = evaluate.load("accuracy")

In [31]:
import numpy as np

In [32]:
def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis = 1)
  return accuracy.compute(predictions = predictions, references = labels)

In [33]:
!pip install huggingface_hub



In [34]:
from huggingface_hub import notebook_login

In [35]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [36]:
from transformers import TrainingArguments

In [37]:
training_args = TrainingArguments(output_dir = "bert-base-turkish-sentiment-analysis",
                                   num_train_epochs = 4,
                                   per_device_train_batch_size = 16,
                                   per_device_eval_batch_size = 16,
                                   weight_decay = 0.01,
                                   evaluation_strategy = "epoch",
                                   save_strategy="epoch",
                                   load_best_model_at_end = True,
                                   report_to = "none",
                                   learning_rate=5e-5,
                                   push_to_hub = True
                                   )

In [38]:
from transformers import Trainer

In [39]:
train_dataset = dataset_encoded["train"].shuffle(seed=42).select(range(10000))
eval_dataset = dataset_encoded["test"].shuffle(seed=42).select(range(2000))

In [40]:
trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset,
    tokenizer = tokenizer,
)

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.1902,0.162863,0.9575
2,0.1064,0.179041,0.96
3,0.0631,0.235768,0.96
4,0.0146,0.245756,0.962


TrainOutput(global_step=2500, training_loss=0.07996141262054443, metrics={'train_runtime': 1008.773, 'train_samples_per_second': 39.652, 'train_steps_per_second': 2.478, 'total_flos': 2505291962957280.0, 'train_loss': 0.07996141262054443, 'epoch': 4.0})

In [51]:
trainer.push_to_hub(commit_message=" Ok! ")

CommitInfo(commit_url='https://huggingface.co/saribasmetehan/bert-base-turkish-sentiment-analysis/commit/00406ea15461adbd1840fe4cfcbb1912581d4acf', commit_message=' Ok! ', commit_description='', oid='00406ea15461adbd1840fe4cfcbb1912581d4acf', pr_url=None, pr_revision=None, pr_num=None)

In [52]:
from transformers import pipeline

In [53]:
model_id = "saribasmetehan/bert-base-turkish-sentiment-analysis"

In [55]:
classifer = pipeline("text-classification",model = model_id)

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/251k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/755k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [56]:
text = "Senden nefret ediyorum."

In [63]:
preds= classifer(text)

In [61]:
import pandas as pd

In [64]:
pd.DataFrame(preds)

Unnamed: 0,label,score
0,LABEL_2,0.751006
