In [17]:
import torch
from torch.utils.data import DataLoader
from scipy.special import softmax

In [18]:
# Prefer the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cpu


### Load pre-trained model

In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Checkpoint identifier shared by the tokenizer and the classifier below.
model_name = "bert-base-cased"

# Tokenizer first, then a sequence-classification model configured for 5 labels.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=5,
)

loading configuration file config.json from cache at /Users/mac/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /Users/mac/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading fil

### Load dataset

In [20]:
from datasets import load_dataset

# Read the labelled reviews from the local CSV file.
dataset = load_dataset("csv", data_files="p6/train_data.csv")

# Rename the columns: `text` is the column read by `tokenize_function`, and
# `label` follows the Hugging Face naming convention for the target column.
dataset = dataset.rename_column("rating", "label")
dataset = dataset.rename_column("review", "text")

# Hold out 20% of the rows for evaluation. The seed is fixed so that the split
# (and therefore the reported metrics) is reproducible across kernel restarts.
dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)

# NOTE(review): these loaders wrap the raw, untokenized splits and are not used
# by any later cell visible in this notebook — confirm before relying on them.
train_dataloader = DataLoader(dataset["train"], shuffle=True, batch_size=16)
eval_dataloader = DataLoader(dataset["test"], batch_size=16)

Using custom data configuration default-57b8bb3caed54ced
Found cached dataset csv (/Users/mac/.cache/huggingface/datasets/csv/default-57b8bb3caed54ced/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

### Tokenize dataset

In [21]:
from transformers import AutoTokenizer

# Reuse the checkpoint name defined when the model was loaded, so the tokenizer
# and the model cannot drift apart if the checkpoint is ever changed.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize the ``"text"`` entry of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw review text(s).

    Returns:
        The output of the module-level ``tokenizer``, called with truncation
        enabled and ``padding="max_length"``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded


# Apply `tokenize_function` to `dataset` in batched mode.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

loading configuration file config.json from cache at /Users/mac/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /Users/mac/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading fil

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [None]:
# Shuffle both tokenized splits with the same fixed seed.
train_dataset, eval_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42)
    for split_name in ("train", "test")
)

### Training

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: output directory, number of epochs and per-device batch size.
training_args = TrainingArguments(
    output_dir="test_trainer",
    num_train_epochs=3,
    per_device_train_batch_size=16,
)

In [None]:
import numpy as np
import evaluate

# Accuracy metric used to score the evaluation predictions.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score a set of evaluation predictions with the ``accuracy`` metric.

    Args:
        eval_pred: Pair ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits compared against the reference labels.
    """
    scores, gold_labels = eval_pred
    # The predicted class is the position of the largest logit along the last axis.
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_classes)

In [None]:
# Wire the model, training configuration, shuffled splits and metric callback together.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

In [None]:
# Fine-tune the model, then persist it to `saved_model`, the directory that the
# model-loading cell further down reads from. Without this save step the
# notebook cannot be reproduced with Restart Kernel -> Run All.
train_output = trainer.train()
trainer.save_model("saved_model")

# Keep the training summary as the cell's displayed value.
train_output

Zrzut ekranu straty z treningu:


![](loss.png "Loss")

### Ładowanie wytrenowanego modelu

In [22]:
# Reload the classifier from the local `saved_model` directory (which must already
# exist on disk). This rebinds `model`, replacing the instance created from the
# base checkpoint earlier in the notebook.
model = AutoModelForSequenceClassification.from_pretrained("saved_model", num_labels=5)

loading configuration file saved_model/config.json
Model config BertConfig {
  "_name_or_path": "saved_model",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.26.0",
  "type_vocab_size": 2,
  "use_cache": true

### Generowanie wyników

In [67]:
def get_result(model, tokenizer, text, max_length=512):
    """Predict the class index for a single piece of text.

    Truncation is delegated to the tokenizer instead of slicing the encoded
    tensors afterwards. Slicing removed the trailing special token that BERT
    tokenizers append and failed for tokenizers that do not return
    ``token_type_ids``. Tokenizer-side truncation also matches how the
    training data was tokenized (``truncation=True``).

    Args:
        model: Callable invoked with the encoded tensors as keyword arguments.
            The first element of its output is expected to hold the logits,
            with the batch dimension first.
        tokenizer: Callable that turns ``text`` into a mapping of PyTorch
            tensors and accepts ``return_tensors``, ``truncation`` and
            ``max_length``.
        text: The text to classify.
        max_length: Maximum number of tokens passed to the model. Defaults to
            512, the limit that was previously hard-coded.

    Returns:
        int: Index of the class with the highest score.
    """
    encoded_input = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(**encoded_input)

    # First output element holds the logits; take the row of the single example.
    logits = output[0][0]

    # Softmax is monotonic, so the arg-max of the raw logits is the same as the
    # arg-max of the probabilities; no normalisation is needed.
    return int(logits.argmax().item())

In [68]:
import pandas as pd

# Read the test reviews. `header=None` makes pandas treat the first row as data
# and label the columns with integers; column 0 is the one read by the
# prediction loop.
test_data = pd.read_csv("p6/test_data.csv", header=None)

In [69]:
# Predict a class index for every entry in column 0 of the test data.
results = [get_result(model, tokenizer, text) for text in test_data[0]]

In [72]:
# Write one predicted class index per line, without index or header row.
# NOTE(review): the values are raw class indices returned by `get_result` — confirm
# they use the same encoding as the `rating` column expected by the consumer.
pd.DataFrame(results).to_csv("results.csv", index=False, header=False)