# Klasifikacija rečenica

## Uvoz potrebnih biblioteka

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, Features, ClassLabel, Sequence, Value, Dataset, DatasetDict

import pandas as pd
import numpy as np
import random

import torch
from torch.nn.functional import cross_entropy, softmax 

from sklearn.metrics import accuracy_score, f1_score, classification_report, roc_curve, auc 
import matplotlib.pyplot as plt 

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" 
os.environ["WANDB_DISABLED"] = "true"

## Definiranje argumenata, odabir jezičnog modela

In [None]:
# Random seed: seed Python's `random`, NumPy and PyTorch (plus every CUDA
# device when one is available) with the same value to make runs more
# repeatable. The same seed is also handed to TrainingArguments further down.
seed = 16
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# Dataset features: an integer id, the raw sentence text, and a class label
# drawn from `label_names`.
label_names = ["0", "1"]
features = Features({
    "id": Value("int64"),
    "sentence": Value("string"),
    "label": ClassLabel(num_classes=len(label_names), names=label_names)
})

# Language model selection and training hyperparameters.
# Checkpoints listed as options: EMBEDDIA/crosloengual-bert, bert-base-multilingual-uncased, classla/xlm-r-bertic, xlm-roberta-base, xlm-roberta-large
model_ckpt = "classla/xlm-r-bertic"
batch_size_train = 32  # used as per_device_train_batch_size
batch_size_test = 16  # used as per_device_eval_batch_size
learning_rate = 3e-5
epoch = 3  # used as num_train_epochs

## Učitavanje skupa podataka

In [None]:
# Read the train and test CSV files of fold 1 into pandas frames.
df_train = pd.read_csv('data/fold_1/train_1.csv', encoding="UTF-8")
df_test = pd.read_csv('data/fold_1/test_1.csv', encoding="UTF-8")

# Convert each frame to a Dataset using the declared feature schema and
# collect both splits under their split names.
split_frames = {"train": df_train, "test": df_test}
dataset = DatasetDict({
    split_name: Dataset.from_pandas(frame, features=features)
    for split_name, frame in split_frames.items()
})

print("Dataset loaded:")
print(dataset)

## Tokenizacija skupa podataka

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

# Tokenizer and base model for the selected checkpoint. This `model` is the
# one extract_hidden_states() reads `last_hidden_state` from; the name is
# rebound to the classification model in the training cell.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
model = model.to(device)

def tokenize(batch):
    """Tokenize the "sentence" column of a batch.

    Sequences are padded to the longest one in the batch and truncated to the
    tokenizer's maximum length, so an over-long sentence cannot produce more
    tokens than the model accepts.

    Args:
        batch: Mapping with a "sentence" entry holding the texts to encode.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch["sentence"], padding=True, truncation=True)


# batch_size=None passes each split to tokenize() as a single batch, so all
# examples of a split end up padded to one common length.
dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)

def extract_hidden_states(batch):
    """Run the base model on a batch and return one vector per example.

    Only the batch entries named in `tokenizer.model_input_names` are moved to
    `device` and fed to the model. The returned vector is index 0 along the
    second axis of `last_hidden_state` (assumes a (batch, sequence, hidden)
    layout, i.e. the first token's state — TODO confirm).

    Returns:
        Dict with key "hidden_state" holding a NumPy array on the CPU.
    """
    model_inputs = {}
    for name, tensor in batch.items():
        if name in tokenizer.model_input_names:
            model_inputs[name] = tensor.to(device)

    # Inference only: no gradients are needed for feature extraction.
    with torch.no_grad():
        encoder_output = model(**model_inputs)

    first_position = encoder_output.last_hidden_state[:, 0]
    return {"hidden_state": first_position.cpu().numpy()}


# Expose the model inputs and labels as torch tensors so the function above
# can call `.to(device)` on them.
dataset_encoded.set_format("torch", columns = ["input_ids", "attention_mask", "label"])

# NOTE(review): `dataset_hidden` is not referenced again in the visible part
# of this file — confirm whether this extraction pass is still needed.
dataset_hidden = dataset_encoded.map(extract_hidden_states, batched = True)

## Izračun metrika klasifikatora

In [None]:
def compute_metrics(pred):
    """Compute accuracy and weighted F1 for a prediction output.

    Predicted classes are the argmax over the last axis of
    `pred.predictions`; they are compared against `pred.label_ids`. A
    six-digit classification report is printed as a side effect.

    Returns:
        Dict with keys "accuracy" and "f1".
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    scores = {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }
    print(classification_report(y_true, y_pred, digits=6))
    return scores

## Treniranje i evaluacija modela

In [None]:
# Size the classification head from the label set declared for the dataset
# so the two cannot drift apart.
num_labels = len(label_names)
model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels = num_labels).to(device)

# Log about once per epoch. The floor division yields 0 when the training
# split holds fewer examples than one batch, and TrainingArguments rejects a
# zero logging interval, so clamp to at least 1.
logging_steps = max(1, len(dataset_encoded["train"]) // batch_size_train)
model_name = f"{model_ckpt}-finetuned-metaphor"

training_args = TrainingArguments(
    output_dir = model_name,
    num_train_epochs = epoch,
    learning_rate = learning_rate,
    per_device_train_batch_size = batch_size_train,
    per_device_eval_batch_size = batch_size_test,
    warmup_steps = 0,
    weight_decay = 0.01,
    # No evaluation during training; the test split is evaluated once below.
    evaluation_strategy = "no",
    #save_strategy = "epoch",
    #load_best_model_at_end=True,
    # NOTE(review): presumably these two only take effect if the best-model
    # options commented out above are re-enabled — confirm.
    metric_for_best_model="f1",
    greater_is_better=True,
    disable_tqdm = False,
    logging_steps = logging_steps,
    push_to_hub = False,
    log_level = "error",
    seed=seed
)

trainer = Trainer(
    model = model,
    args = training_args,
    compute_metrics = compute_metrics,
    train_dataset = dataset_encoded["train"],
    #eval_dataset = dataset_encoded["validation"],
    tokenizer = tokenizer
)

# Trailing semicolon keeps the notebook from echoing the returned value.
trainer.train();

# Evaluate on the held-out test split; compute_metrics prints the
# classification report and supplies accuracy and F1.
trainer.evaluate(dataset_encoded['test'])

## Krivulja ROC

In [None]:
# Get raw logits and true labels for the test split.
predictions_output = trainer.predict(dataset_encoded['test'])
test_predictions_logits = predictions_output.predictions
test_true_labels = predictions_output.label_ids

# Turn logits into class probabilities along the last axis.
test_probabilities = softmax(torch.tensor(test_predictions_logits), dim=-1).numpy()

# Use the probability of label "1" as the ROC score; fall back to column 1
# if "1" is not among label_names.
positive_class_index = label_names.index("1") if "1" in label_names else 1
y_scores = test_probabilities[:, positive_class_index]

fpr, tpr, thresholds = roc_curve(test_true_labels, y_scores)
roc_auc_value = auc(fpr, tpr)

print(f"AUC: {roc_auc_value:.4f}")

plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC krivulja (AUC = {roc_auc_value:.4f})')
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Stopa lažno pozitivnih rezultata')
plt.ylabel('Stopa točno pozitivnih rezultata')
# Build the title from the configured learning rate; the previous hard-coded
# "2e-5" did not match the learning_rate actually used for training.
plt.title(f'Krivulja ROC - stopa učenja {learning_rate:g}')
plt.legend(loc="lower right", fontsize=12)
plt.grid(True)
#plt.savefig("results/roc_auc_curve-3e5-1.png")
plt.show()

## Analiza grešaka

In [None]:
def forward_pass_with_label(batch):
    """Run the classifier on a batch and report per-example loss and prediction.

    Only the batch entries named in `tokenizer.model_input_names` are sent to
    the model; `batch["label"]` supplies the targets for the unreduced
    cross-entropy loss.

    Returns:
        Dict with "loss" (one value per example) and "predicted_label"
        (argmax over the last logits axis), both as CPU NumPy arrays.
    """
    model_inputs = {
        name: tensor.to(device)
        for name, tensor in batch.items()
        if name in tokenizer.model_input_names
    }
    targets = batch["label"].to(device)

    # Inference only: gradients are not needed for error analysis.
    with torch.no_grad():
        logits = model(**model_inputs).logits
        predicted = torch.argmax(logits, axis=-1)
        per_example_loss = cross_entropy(logits, targets, reduction="none")

    return {"loss": per_example_loss.cpu().numpy(),
            "predicted_label": predicted.cpu().numpy()}

# Expose model inputs and labels as torch tensors for the forward pass.
dataset_encoded.set_format("torch", columns = ["input_ids", "attention_mask", "label"])

# Attach per-example loss and predicted label to the test split.
dataset_encoded["test"] = dataset_encoded["test"].map(forward_pass_with_label, batched = True, batch_size = 16)

# Switch to pandas output and keep only the columns needed for inspection.
# The former self-assignments (df_test[col] = df_test[col]) were no-ops and
# have been dropped.
dataset_encoded.set_format("pandas")
cols = ["id","sentence", "label", "predicted_label", "loss"]
df_test = dataset_encoded["test"][:][cols]

# Make sure the output directory exists so to_csv does not fail on a fresh
# checkout.
os.makedirs("results", exist_ok=True)
df_test.to_csv("results/xlm-r-bertic-1-3e5.csv")

## Ispis broja parametara modela

In [None]:
# Add up the element counts of every parameter tensor of the current model.
total_params = sum(map(torch.numel, model.parameters()))

print(f"Broj parametara modela: {total_params}")