In [27]:

import torch, os
import pandas as pd


from torch import cuda
from transformers import BertTokenizerFast
from transformers import BertForSequenceClassification
from transformers import pipeline, BertForSequenceClassification, BertTokenizerFast
from torch.utils.data import Dataset
from transformers import DistilBertForSequenceClassification, DistilBertTokenizerFast
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score 




In [11]:

device = 'cuda' if cuda.is_available() else 'cpu'


In [12]:

data = pd.read_csv("TTC4900.csv")
data = data.sample(frac=1.0, random_state=42)

print(data.head())


        category                                               text
4657  teknoloji    acıların kedisi sam çatık kaşlı kedi sam in i...
3539       spor    g saray a git santos van_persie den forma ala...
907       dunya    endonezya da çatışmalar 14 ölü endonezya da i...
4353  teknoloji    emniyetten polis logolu virüs uyarısı telefon...
3745       spor    beni türk yapın cristian_baroni yıldırım dan ...


In [13]:

labels=["teknoloji","ekonomi","saglik","siyaset","kultur","spor","dunya"]

NUM_LABELS = len(labels)

NUM_LABELS


7

In [14]:
id2label={i:l for i,l in enumerate(labels)}
label2id={l:i for i,l in enumerate(labels)}

print( label2id )

{'teknoloji': 0, 'ekonomi': 1, 'saglik': 2, 'siyaset': 3, 'kultur': 4, 'spor': 5, 'dunya': 6}


In [15]:

data["labels"] = data.category.map(lambda x: label2id[x.strip()])

print( data.head() )

        category                                               text  labels
4657  teknoloji    acıların kedisi sam çatık kaşlı kedi sam in i...       0
3539       spor    g saray a git santos van_persie den forma ala...       5
907       dunya    endonezya da çatışmalar 14 ölü endonezya da i...       6
4353  teknoloji    emniyetten polis logolu virüs uyarısı telefon...       0
3745       spor    beni türk yapın cristian_baroni yıldırım dan ...       5


In [19]:


print(   data.category.value_counts()  )   ##.plot(kind='pie')   ##, figsize=(8,8))



teknoloji     700
spor          700
dunya         700
kultur        700
ekonomi       700
saglik        700
siyaset       700
Name: category, dtype: int64


In [21]:

tokenizer = BertTokenizerFast.from_pretrained("dbmdz/bert-base-turkish-uncased", max_length=512)
model = BertForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-uncased", 
                       num_labels=NUM_LABELS, id2label=id2label, label2id=label2id)

model.to(device)



Some weights of the model checkpoint at dbmdz/bert-base-turkish-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [22]:

SIZE = data.shape[0]

train_texts= list(data.text[:SIZE//2])
val_texts=   list(data.text[SIZE//2:(3*SIZE)//4 ])
test_texts=  list(data.text[(3*SIZE)//4:])

train_labels= list(data.labels[:SIZE//2])
val_labels=   list(data.labels[SIZE//2:(3*SIZE)//4])
test_labels=  list(data.labels[(3*SIZE)//4:])


In [23]:

print(   len(train_texts), len(val_texts), len(test_texts)   )


2450 1225 1225


In [24]:

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings   = tokenizer(val_texts,   truncation=True, padding=True)
test_encodings  = tokenizer(test_texts,  truncation=True, padding=True)


In [25]:

class MyDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [26]:

train_dataset = MyDataset(train_encodings, train_labels)
val_dataset   = MyDataset(val_encodings,   val_labels)
test_dataset  = MyDataset(test_encodings,  test_labels)


In [28]:

def compute_metrics(pred): 
    labels = pred.label_ids 
    preds  = pred.predictions.argmax(-1) 
    f1  = f1_score(labels, preds, average='macro') 
    acc = accuracy_score(labels, preds) 
    return {
        'Accuracy': acc,
        'F1': f1
    }


In [30]:


training_args = TrainingArguments(
    # The output directory where the model predictions and checkpoints will be written
    output_dir='./TTC4900Model', 
    do_train=True,
    do_eval=True,
    #  The number of epochs, defaults to 3.0 
    num_train_epochs=3,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size  = 8,
    # Number of steps used for a linear warmup
    warmup_steps=100,
    weight_decay=0.01,
    logging_strategy='steps',
   # TensorBoard log directory
    logging_dir='./multi-class-logs',
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="epoch",
    ## fp16=True
    #load_best_model_at_end=True
)



In [32]:

trainer = Trainer(
    # the pre-trained model that will be fine-tuned
    model=model,
     # training arguments that we defined above
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics= compute_metrics
)


## Requires GPU

In [34]:
'''
res_train = trainer.train()
'''



'\nres_train = trainer.train()\n'

In [None]:

print(   res_train   )


In [None]:

q = [trainer.evaluate(eval_dataset=data) for data in [train_dataset, val_dataset, test_dataset]]

pd.DataFrame(q, index=["train","val","test"]).iloc[:,:5]


In [None]:


def predict(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to("cuda")
    outputs = model(**inputs)
    probs = outputs[0].softmax(1)
    return probs, probs.argmax(),model.config.id2label[probs.argmax().item()]



In [None]:

# Example #1

text = "Fenerbahçeli futbolcular kısa paslarla hazırlık çalışması yaptılar"
print(predict(text))



In [None]:

# saving the fine tuned model & tokenizer

model_path = "turkish-text-classification-model"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)


In [None]:

model_path = "turkish-text-classification-model"

model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer= BertTokenizerFast.from_pretrained(model_path)

nlp = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)


In [None]:

r1 = nlp("Sinemada hangi filmler oynuyor bugün")
print(r1)
#[{‘label': 'kultur', 'score': 0.897723913192749}]


In [None]:

r2 = nlp("Dolar ve Euro bugün yurtiçi piyasalarda yükseldi")
print(r2)
#[{‘label': 'ekonomi', 'score': 0.9639127254486084}]


In [None]:

r3 = nlp("Bayern Münih ile Barcelona bugün karşı karşıya geliyor. Maçı İngiliz hakem James Watts yönetecek!")
print(r3)
#[{‘label': 'spor', 'score': 0.9791778922080994}]
