## Self-supervised learning을 위한 "경량화" Bert

## bert는 매우 큰 모델이라 해결책이나 문제 또는 프로젝트를 위해서 Bert를 파인튜닝하는 것은 어려움

### Albert : A Lite Bert
#### 모바일이나 엣지장치에서 배포 가능

In [12]:
!pip install transformers



In [13]:
!pip install SentencePiece



In [14]:
import torch
from transformers import AlbertTokenizer, AlbertForMaskedLM

In [15]:
# Load the pretrained ALBERT model and tokenizer
model_name = 'albert-base-v2'
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForMaskedLM.from_pretrained(model_name)

# 나머지는 이전 Bert와 똑같음

In [16]:
# Set device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
# Next word prediction function
def predict_next_word(text):
    # Tokenize input text
    tokenized_text = tokenizer.tokenize(text)
    masked_index = tokenized_text.index('[MASK]')
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert tokens to tensor
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)

    # Forward pass through the model
    with torch.no_grad():
        outputs = model(tokens_tensor)

    predictions = outputs[0][0, masked_index].topk(k=5).indices.tolist()

    predicted_tokens = []
    for token_index in predictions:
        predicted_token = tokenizer.convert_ids_to_tokens([token_index])[0]
        predicted_tokens.append(predicted_token)

    return predicted_tokens

In [18]:
# Example usage
text_with_mask = "I want to [MASK] a pizza for dinner."
predicted_tokens = predict_next_word(text_with_mask)

# Print the predicted tokens
print(predicted_tokens)

['▁my', '▁the', '▁our', '▁your', '▁fro']


## 예측이 구리다
## 따라서 Albert를 조정하는 것 필요
### 파인튜닝
### 다만 여기선 label로써 분류하는 작업을 했음

In [19]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AdamW

# Load the pretrained ALBERT model and tokenizer
model_name = 'albert-base-v2'
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# 같은 모델을 가져온다.

# Set device (CPU or GPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define your own dataset and dataloader
## 파이토치를 사용해서 Dataset 설정하려면 torch.utils.Dataset필요

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]
        return text, label

    def __len__(self):
        return len(self.texts)

# Example training data
## 단순히 예제용 , 자신만의 데이터셋 필요
train_texts = ['This is the first sentence.', 'This is the second sentence.']
train_labels = [0, 1]

# Create the dataset and dataloader
train_dataset = MyDataset(train_texts, train_labels)
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# Move the model to the device
model.to(device)

# Training settings
epochs = 10
lr = 2e-5
optimizer = AdamW(model.parameters(), lr=lr)
## 분류문제이기 때뭉네
loss_fn = nn.CrossEntropyLoss()

# Training loop
for epoch in range(epochs):
    total_loss = 0
    model.train()

    for texts, labels in train_dataloader:
        # Tokenize input texts
        input_ids = []
        attention_masks = []
        for text in texts:
            encoded = tokenizer.encode_plus(
                text,
                add_special_tokens=True,
                padding='max_length',
                truncation=True,
                max_length=128,
                return_tensors='pt'
            )
            input_ids.append(encoded['input_ids'].squeeze())
            attention_masks.append(encoded['attention_mask'].squeeze())
        # 모두 넣은후 stack으로 행렬화 해서 한번에 전달
        input_ids = torch.stack(input_ids).to(device)
        attention_masks = torch.stack(attention_masks).to(device)
        labels = torch.tensor(labels).to(device)

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_masks)
        logits = outputs.logits

        # Compute loss
        loss = loss_fn(logits, labels)
        total_loss += loss.item()

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Loss: {average_loss:.4f}")

# Save the fine-tuned model
model.save_pretrained('path/to/save/model')
tokenizer.save_pretrained('path/to/save/tokenizer')


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForSequenceClassification: ['predictions.dense.bias', 'predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.weight', 'predictions.bias', 'predictions.dense.weight']
- This IS expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model 

Epoch 1/10 - Loss: 0.7318
Epoch 2/10 - Loss: 0.7358
Epoch 3/10 - Loss: 0.6278
Epoch 4/10 - Loss: 0.5126
Epoch 5/10 - Loss: 0.5879
Epoch 6/10 - Loss: 0.3802
Epoch 7/10 - Loss: 0.4190
Epoch 8/10 - Loss: 0.3830
Epoch 9/10 - Loss: 0.2662
Epoch 10/10 - Loss: 0.2022


('path/to/save/tokenizer/tokenizer_config.json',
 'path/to/save/tokenizer/special_tokens_map.json',
 'path/to/save/tokenizer/spiece.model',
 'path/to/save/tokenizer/added_tokens.json')

In [20]:
2e-5

2e-05

## finetuning 하고 사용해보는 부분
## 오류가 난이유
### label을 2개로 설정하고 분류하는 작업을한다.
## 이후 model의 prediction을 통하면 2개의 label 나오는데 기존의 masked된 부분을 넘어선 index를 통해 예측을 하려고 하여 오류가 남

In [32]:
model_name = 'path/to/save/'
tokenizer = AlbertTokenizer.from_pretrained(model_name+'tokenizer')
model = AlbertForSequenceClassification.from_pretrained(model_name+'model')

In [33]:
# Next word prediction function
def predict_next_word(text):
    # Tokenize input text
    tokenized_text = tokenizer.tokenize(text)
    masked_index = tokenized_text.index('[MASK]')
    indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

    # Convert tokens to tensor
    tokens_tensor = torch.tensor([indexed_tokens]).to(device)
    print(tokens_tensor)
    # Forward pass through the model
    with torch.no_grad():
        outputs = model(tokens_tensor)
    print(outputs)

    predictions = outputs[0][0, masked_index].topk(k=5).indices.tolist()

    predicted_tokens = []
    for token_index in predictions:
        predicted_token = tokenizer.convert_ids_to_tokens([token_index])[0]
        predicted_tokens.append(predicted_token)

    return predicted_tokens

In [34]:
# Example usage
text_with_mask = "I want to [MASK] a pizza for dinner."
predicted_tokens = predict_next_word(text_with_mask)

# Print the predicted tokens
print(predicted_tokens)

tensor([[   31,   259,    20,     4,    21, 10684,    26,  2860,     9]])
SequenceClassifierOutput(loss=None, logits=tensor([[0.1506, 0.9279]]), hidden_states=None, attentions=None)


IndexError: ignored