# imdb 데이터셋을 활용하여 finetuning 진행

In [29]:
!pip install transformers



## SentencePiece라이브러리와 transforemr간 충돌할수 있기 때문에 설치후 런타임 재실행 해야한다.

In [30]:
!pip install SentencePiece



In [31]:
!pip install datasets



In [32]:
import torch
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup ## 훈련에 사용하는 method
from sklearn.model_selection import train_test_split  ## train_split
from datasets import load_dataset # load ddataset

In [33]:
# Load the tokenizer and pre-trained model
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaForSequenceClassification.from_pretrained("xlm-roberta-base", num_labels=2)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_proj.bias', 'classifier.den

In [6]:
# Load the IMDb dataset
dataset = load_dataset("imdb")

Downloading builder script:   0%|          | 0.00/4.31k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.59k [00:00<?, ?B/s]

Downloading and preparing dataset imdb/plain_text to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0...


Downloading data:   0%|          | 0.00/84.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

Dataset imdb downloaded and prepared to /root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/d613c88cf8fa3bab83b4ded3713f1f74830d1100e171db75bbddb80b3345c9c0. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [8]:
subset = dataset["train"].select(range(2000))

### Google Colab에서는 무료 GPU로 25000개에 달하는 embedding을 사용하여 훈련및 테스팅하는 거대모델 훈련 불가능

In [9]:
subset

Dataset({
    features: ['text', 'label'],
    num_rows: 2000
})

In [10]:
# Split the dataset into training and validation sets
train_data, val_data = train_test_split(subset, test_size=0.2, random_state=42)

In [11]:
type(train_data)

dict

In [12]:
train_data["text"][0]

'*** May contain spoilers. *** <br /><br />If LIVING ON TOKYO TIME were some bold experiment where real-life wanna-be actors were given film parts on the condition that they would be required to take a combination of powerful prescription anti-anxiety, anti-depression, and anti-psychotic medications (this is the classic psych ward combo that renders patients into drooling zombies) all during filming, then this movie would hold far more interest. Or, if the film production was another type of experiment where all of the actors were sleep deprived before and during filming, then TOKYO TIME could be more easily explained.<br /><br />As it is, this film is filled with lifeless, low-energy actors. In the scene where the new husband was sitting on the stairs talking with his sister, it appeared that he was having trouble keeping his eyes open. In almost every scene he speaks his lines sitting down with every part of his body motionless. From beginning to end, his facial expression is best de

In [13]:
train_data["label"][0]

0

In [14]:
# Tokenize and encode the input sequences
train_encodings = tokenizer(
    train_data["text"],
    truncation=True, # truncation -=> 데이터를 자를건지?
    padding=True, # 작다면 padding 추가
    max_length=128 # 데이터 자름
)
val_encodings = tokenizer(
    val_data["text"],
    truncation=True,
    padding=True,
    max_length=128
)

In [15]:
# Convert the labels to tensors
train_labels = torch.tensor(train_data["label"])
val_labels = torch.tensor(val_data["label"])

## Custom Dataset

In [16]:
# Create a PyTorch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
      ## 변수 초기화
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx): # idx가 전달됌에 따라서 dataset의 idx에 encoding 진행
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [17]:
train_dataset = Dataset(train_encodings, train_labels)
val_dataset = Dataset(val_encodings, val_labels)

## Datalodaer에 전달하여 반복할수 있도록 함

In [18]:
# Create data loaders
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=16, shuffle=False)

In [42]:
model.parameters

<bound method Module.parameters of XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear

In [19]:
num_epochs = 5

# Prepare optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=1e-5)
## scheduler 정의
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0, ## warmup 즉 말그대로 웜업 스텝임
    num_training_steps=len(train_loader) * num_epochs
)



In [20]:
# Training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [21]:
for epoch in range(num_epochs):
  ## epoch만큼 반복
    model.train()
    ## hugging face transfomre에 모델 훈련중이라고 알림
    total_loss = 0

    for batch in train_loader:
      ## train_loader에 대한 각 batch
      ## 각 batch에 대해 입력 id, attention mask , label을 가져옴
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        ## 이들을 모델에 입력해줌
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        ## 손실을 계산하는데 사용
        loss = outputs.loss
        total_loss += loss.item()
        ## batch의 총 손실 계산하기 위해
        loss.backward()
        ## 역전파 수행
        optimizer.step()
        scheduler.step()
        ## optimizer와 scheduler update 사용하여 가중치 update

    avg_train_loss = total_loss / len(train_loader)
    # 평균 loss 구함



    # Validation
    ## epoch 훈련 끝난후 모델 평가
    model.eval()
    total_val_loss = 0
    correct = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            logits = outputs.logits

            total_val_loss += loss.item()
            ## 다른점은 logits(최종 층, 혹은 최종 출력)
            ### 에 대하여 argmax() 가장큰 확률값을 찾는다.
            preds = torch.argmax(logits, dim=1)
            correct += (preds == labels).sum().item()

    avg_val_loss = total_val_loss / len(val_loader)
    val_accuracy = correct / len(val_dataset)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f} | Val Accuracy: {val_accuracy:.4f}")

  item["labels"] = torch.tensor(self.labels[idx])


Epoch 1/5
Train Loss: 0.0924 | Val Loss: 0.0019 | Val Accuracy: 1.0000
Epoch 2/5
Train Loss: 0.0028 | Val Loss: 0.0007 | Val Accuracy: 1.0000
Epoch 3/5
Train Loss: 0.0015 | Val Loss: 0.0005 | Val Accuracy: 1.0000
Epoch 4/5
Train Loss: 0.0011 | Val Loss: 0.0004 | Val Accuracy: 1.0000
Epoch 5/5
Train Loss: 0.0010 | Val Loss: 0.0004 | Val Accuracy: 1.0000


In [22]:
# Save the fine-tuned model
model.save_pretrained("path/to/save/model")
tokenizer.save_pretrained("path/to/save/tokenizer")

('path/to/save/tokenizer/tokenizer_config.json',
 'path/to/save/tokenizer/special_tokens_map.json',
 'path/to/save/tokenizer/sentencepiece.bpe.model',
 'path/to/save/tokenizer/added_tokens.json')

In [23]:
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer

model = XLMRobertaForSequenceClassification.from_pretrained("/content/path/to/save/model")
tokenizer = XLMRobertaTokenizer.from_pretrained("path/to/save/tokenizer")

In [76]:
sentence_example = val_data["text"][0]
sentence_example

'A "friend", clearly with no taste or class, suggested I take a look at the work of Ron Atkins. If this is representative of his oeuvre, I never want to see anything else by him. It is amateurish, self-indulgent, criminally shoddy and self-indulgent rubbish. The "whore mangler" of the title is an angry low budget filmmaker who murders a bunch of hookers. There is a little nudity and some erections, but no single element could possibly save this from the hangman\'s noose. The lighting is appalling, the dialog is puerile and mostly shouted, and the direction is clueless. I saw a doco on American exploitation filmmakers during the recent Fangoria convention. Atkins was one of those featured. He spoke like there was something important about his work, but after a viewing of this, I see nothing of any import whatsoever. There is no style, either, and the horrible video effects (like solarization) only enhance the amateurishness. Not even so bad it\'s fun. Avoid.'

In [72]:
text = "maybe is not not good."
inputs = tokenizer(text, truncation=True, padding=True, return_tensors="pt")

In [73]:
model(**inputs)

SequenceClassifierOutput(loss=None, logits=tensor([[-0.3730, -0.0212]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [74]:
outputs = model(**inputs)
logits = outputs.logits
predicted_label = torch.argmax(logits, dim=1).item()

In [75]:
predicted_label

1

In [62]:
val_data["label"][0]

0