In [None]:
import torch
from torch.utils.data import DataLoader
from scipy.special import softmax

In [None]:
# Prefer the first CUDA GPU when one is visible to PyTorch; otherwise use the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

### Load pre-trained model

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier used for both the tokenizer and the classifier.
model_name = "bert-base-cased"

# Tokenizer that belongs to the checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Sequence-classification model built from the same checkpoint and
# configured with five output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)

### Load dataset

In [None]:
from datasets import load_dataset

# Read the labelled reviews from CSV; the result is indexed by split name and
# the data lands in the "train" split (see the ["train"] lookup below).
dataset = load_dataset("csv", data_files="p6/train_data.csv")

# Rename the columns to the names used by the rest of the notebook
# ("text" is what tokenize_function reads).
# NOTE(review): the model is created with num_labels=5, which assumes `label`
# holds class ids 0-4 -- confirm that `rating` is not stored as 1-5.
dataset = dataset.rename_column("rating", "label")
dataset = dataset.rename_column("review", "text")

# Hold out 20% of the rows for evaluation. The seed is fixed so that the
# train/test partition is the same on every run; without it the evaluation
# numbers of two runs are measured on different rows.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# Plain PyTorch loaders over the raw splits. They are created before
# tokenization, so they yield the untokenized columns.
train_dataloader = DataLoader(dataset["train"], shuffle=True, batch_size=16)
eval_dataloader = DataLoader(dataset["test"], batch_size=16)

### Tokenize dataset

In [None]:
from transformers import AutoTokenizer

# Build the tokenizer from the same checkpoint name as the model instead of a
# second hard-coded literal, so the two cannot drift apart when `model_name`
# is changed.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: Mapping with a "text" entry. It is passed straight to the
            tokenizer, so with ``Dataset.map(..., batched=True)`` it is the
            batch of review strings.

    Returns:
        The tokenizer output for the batch, padded with the "max_length"
        strategy and with truncation enabled.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# Apply the tokenizer to every split, processing the rows in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

In [None]:
# Shuffle both tokenized splits with the same fixed seed.
train_dataset, eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42)
    for split_name in ("train", "test")
)

### Training

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: outputs go to "test_trainer", 16 examples per device
# per training step, three passes over the training set. Everything else keeps
# the library defaults.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=16,
)

In [None]:
import numpy as np
import evaluate

# Accuracy metric loaded through the `evaluate` library; used by
# compute_metrics below.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Compute the accuracy metric for one set of evaluation predictions.

    Args:
        eval_pred: Pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the predicted class of each row
        (argmax of the logits over the last axis) against ``labels``.
    """
    class_scores, gold_labels = eval_pred
    return metric.compute(
        predictions=np.argmax(class_scores, axis=-1),
        references=gold_labels,
    )

In [None]:
# Wire the model, the training configuration, the two tokenized splits and
# the metric callback into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    # Data: shuffled, tokenized train / held-out splits.
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune the classifier and keep the training summary for later inspection.
train_output = trainer.train()

# Persist the fine-tuned weights and config. The notebook later loads the
# "saved_model" checkpoint, and nothing else writes it, so without this call
# a fresh Restart & Run All fails at that load.
trainer.save_model("saved_model")

# The training arguments do not enable periodic evaluation, so score the
# held-out split explicitly; the returned metrics become the cell output.
trainer.evaluate()

### Loading the trained model

In [None]:
# Replace the in-memory model with the one stored under "saved_model",
# again configured with five output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "saved_model",
    num_labels=5,
)

In [None]:
def get_result(model, tokenizer, text):
    """Predict the class index of a single text with a sequence classifier.

    Args:
        model: Callable classifier; called with the tokenizer output as
            keyword arguments and expected to return the logits as its first
            element, with a leading batch dimension. If it exposes a
            ``device`` attribute, the inputs are moved there first.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch
            tensors.
        text: The text to classify.

    Returns:
        Index of the highest-scoring class, as returned by ``np.argmax``.
    """
    # Truncate so that texts longer than the model's maximum sequence length
    # do not crash the forward pass.
    encoded_input = tokenizer(text, return_tensors="pt", truncation=True)

    # Keep inputs and weights on the same device (matters once the model
    # lives on a GPU, e.g. right after training).
    target_device = getattr(model, "device", None)
    if target_device is not None:
        encoded_input = {
            key: tensor.to(target_device) for key, tensor in encoded_input.items()
        }

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**encoded_input)[0][0]

    # .cpu() is required before .numpy() when the logits sit on a GPU.
    probabilities = softmax(logits.detach().cpu().numpy())
    return np.argmax(probabilities)

In [None]:
# Sample review used as a smoke test: the cell output is the class index
# that get_result returns for this text.
text = "nice issues ok begin, firstly stayed wyland waikiki feb. 23-march 3 2008. checked standard room check asked like upgrade suite 25 night check said no got room nice small room omg shower big 1 person, morning upgraded 1 bedroom suite definetly recommend going stay, suite great size seperate living room bedroom small fridge microwave coffee pot, ok starting day suite living room plasma tv not work called everyday come fix finally day 6 offered compensation, hote lobby nice hotel far beach, parking 25 plus tip valet valet staff nice late night people hanging valet desk drinking alcohol not professional did not really think appropriate, hotel far beach probably not stay,  "
get_result(model, tokenizer, text)