In [1]:
# Runtime dependencies for this notebook, installed through shell escapes.
# NOTE(review): none of these installs is version-pinned — consider pinning for reproducibility.
!pip install transformers
!pip install sentencepiece
# kobert_tokenizer is built from the kobert_hf subdirectory of the SKTBrain/KoBERT GitHub repository.
!pip install 'git+https://github.com/SKTBrain/KoBERT.git#egg=kobert_tokenizer&subdirectory=kobert_hf'

Collecting kobert_tokenizer
  Cloning https://github.com/SKTBrain/KoBERT.git to /tmp/pip-install-enalwj4h/kobert-tokenizer_449c36b1376d467e9be87354091f2487
  Running command git clone --filter=blob:none --quiet https://github.com/SKTBrain/KoBERT.git /tmp/pip-install-enalwj4h/kobert-tokenizer_449c36b1376d467e9be87354091f2487
  Resolved https://github.com/SKTBrain/KoBERT.git to commit 47a69af87928fc24e20f571fe10c3cc9dd9af9a3
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kobert_tokenizer
  Building wheel for kobert_tokenizer (setup.py) ... [?25l[?25hdone
  Created wheel for kobert_tokenizer: filename=kobert_tokenizer-0.1-py3-none-any.whl size=4633 sha256=216946ef4939f0987d33f2d73c5cc4f6c9eb318810208089b51365d919b50ed8
  Stored in directory: /tmp/pip-ephem-wheel-cache-cce8blwt/wheels/e9/1a/3f/a864970e8a169c176befa3c4a1e07aa612f69195907a4045fe
Successfully built kobert_tokenizer
Installing collected packages: kobert_tokenizer
Successfully ins

In [2]:
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
class NewsDataset(Dataset):
    """Torch dataset that tokenizes news titles for sequence classification.

    Args:
        dataframe: pandas DataFrame exposing ``title`` and ``label`` columns.
        tokenizer: tokenizer object providing ``encode_plus``.
        max_len: fixed length every encoded title is padded or truncated to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.title = dataframe.title
        self.targets = dataframe.label
        self.max_len = max_len

    def __len__(self):
        """Return the number of titles in the dataset."""
        return len(self.title)

    def __getitem__(self, index):
        """Encode the ``index``-th row (by position) into model-ready tensors.

        Returns:
            dict with ``ids``, ``mask``, ``token_type_ids`` and ``targets``,
            each a ``torch.long`` tensor.
        """
        # Positional lookup: works even when the frame's index was not reset.
        title = str(self.title.iloc[index])
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            # Explicit truncation keeps every sample at exactly `max_len` tokens
            # and avoids the tokenizer's "truncation was not explicitly activated" warning.
            truncation=True,
            return_token_type_ids=True,
        )

        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.long)
        }

# Load the labelled news data
df = pd.read_csv('/content/drive/MyDrive/프로젝트/증권 뉴스 분류 및 개체명 인식/news.csv')  # path to the data file

# Shuffled split with 20% held out for validation; each part gets a fresh 0..n-1 index
train_dataset, val_dataset = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=200, shuffle=True)
)

# Tokenizer and sequence length shared by both datasets
max_len = 128
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')

training_set, validation_set = (
    NewsDataset(frame, tokenizer, max_len)
    for frame in (train_dataset, val_dataset)
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/371k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/244 [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [4]:
import torch.nn as nn
from transformers import BertModel

class KoBERTClass(nn.Module):
    """BERT encoder followed by a two-layer classification head.

    Args:
        num_labels: number of output classes; adjust to the label set (default 6).
        dropout: dropout probability applied inside the head (default 0.3).
        pretrained_name: checkpoint identifier handed to
            ``BertModel.from_pretrained`` (default ``'skt/kobert-base-v1'``).
    """

    def __init__(self, num_labels=6, dropout=0.3, pretrained_name='skt/kobert-base-v1'):
        super().__init__()
        self.l1 = BertModel.from_pretrained(pretrained_name)
        # Size the head from the encoder's config instead of hard-coding 768.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the classifier output, one row of ``num_labels`` scores per input sequence."""
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        hidden_state = output_1[0]
        # Use the hidden state at sequence position 0 as the pooled representation.
        pooler = hidden_state[:, 0]
        # Functional ReLU avoids building a new nn.ReLU module on every call.
        pooler = nn.functional.relu(self.pre_classifier(pooler))
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

# Build the classifier; KoBERTClass.__init__ loads the 'skt/kobert-base-v1' encoder via from_pretrained.
model = KoBERTClass()

config.json:   0%|          | 0.00/535 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/369M [00:00<?, ?B/s]

In [5]:
from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup

# Use the GPU when one is available, otherwise fall back to the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Loader settings: only the training loader shuffles
train_params = dict(batch_size=16, shuffle=True, num_workers=0)
test_params = dict(batch_size=16, shuffle=False, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
validation_loader = DataLoader(validation_set, **test_params)

loss_function = nn.CrossEntropyLoss()

# Cosine learning-rate decay without warmup, spanning (batches per epoch) * 10 optimizer steps
optimizer = AdamW(model.parameters(), lr=1e-5)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(training_loader) * 10,
)

def train(epoch):
    """Run one optimisation pass over ``training_loader``.

    Relies on the notebook-level ``model``, ``training_loader``, ``device``,
    ``loss_function``, ``optimizer`` and ``scheduler``.

    Args:
        epoch: epoch number, used only in the progress message.

    Returns:
        float: mean training loss over the batches seen in this pass
        (``nan`` when the loader yields no batches).
    """
    model.train()
    running_loss = 0.0
    num_batches = 0
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.long)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_function(outputs, targets)

        # Progress message every 500 batches (including the first one).
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # The scheduler is stepped once per batch, not once per epoch.
        scheduler.step()

        # Accumulate a detached tensor so no extra .item() call is needed per batch.
        running_loss = running_loss + loss.detach()
        num_batches += 1

    if num_batches == 0:
        return float('nan')
    return float(running_loss) / num_batches

# Train for 10 epochs; this count matches the `* 10` factor used for the scheduler's num_training_steps.
for epoch in range(10):
    train(epoch)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  1.7864547967910767
Epoch: 0, Loss:  0.807042121887207
Epoch: 1, Loss:  0.8711310625076294
Epoch: 1, Loss:  0.5774560570716858
Epoch: 2, Loss:  0.4943704903125763
Epoch: 2, Loss:  0.41491827368736267
Epoch: 3, Loss:  0.22469495236873627
Epoch: 3, Loss:  0.5379264950752258
Epoch: 4, Loss:  0.2305298000574112
Epoch: 4, Loss:  0.3804149627685547
Epoch: 5, Loss:  0.22741912305355072
Epoch: 5, Loss:  0.3411542475223541
Epoch: 6, Loss:  0.1733327955007553
Epoch: 6, Loss:  0.1962546706199646
Epoch: 7, Loss:  0.29623129963874817
Epoch: 7, Loss:  0.5834771394729614
Epoch: 8, Loss:  0.056592781096696854
Epoch: 8, Loss:  0.12755858898162842
Epoch: 9, Loss:  0.3653799295425415
Epoch: 9, Loss:  0.08633960038423538


In [6]:
def validation():
    """Run the model over ``validation_loader`` without gradient tracking.

    Uses the notebook-level ``model``, ``validation_loader`` and ``device``.

    Returns:
        tuple: ``(predictions, labels)`` — two lists holding, per example,
        the argmax class index and the target label.
    """
    model.eval()
    predictions, labels = [], []
    with torch.no_grad():
        for batch in validation_loader:
            input_ids, attention_mask, segment_ids, batch_labels = (
                batch[key].to(device, dtype=torch.long)
                for key in ('ids', 'mask', 'token_type_ids', 'targets')
            )
            logits = model(input_ids, attention_mask, segment_ids)
            # Highest-scoring class per row is the prediction.
            predictions.extend(logits.argmax(dim=1).cpu().numpy())
            labels.extend(batch_labels.cpu().numpy())
    return predictions, labels

from sklearn.metrics import accuracy_score

# Collect predictions and labels for the validation loader, then report accuracy.
outputs, targets = validation()
accuracy = accuracy_score(targets, outputs)  # called as (labels, predictions)
print(f'Validation Accuracy: {accuracy}')

Validation Accuracy: 0.7650402652771199


In [7]:
import os
import torch

# Output locations for the model weights and the tokenizer files
MODEL_SAVE_PATH = "kobert_model.pth"
TOKENIZER_SAVE_PATH = "kobert_tokenizer"

# Create the tokenizer directory; exist_ok=True makes re-running this cell safe
os.makedirs(TOKENIZER_SAVE_PATH, exist_ok=True)

# Save only the state_dict (parameters), not the KoBERTClass object itself
torch.save(model.state_dict(), MODEL_SAVE_PATH)

# Save the tokenizer; kept as the cell's last expression so the written file paths are displayed
tokenizer.save_pretrained(TOKENIZER_SAVE_PATH)

('kobert_tokenizer/tokenizer_config.json',
 'kobert_tokenizer/special_tokens_map.json',
 'kobert_tokenizer/spiece.model',
 'kobert_tokenizer/added_tokens.json')