In [1]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

import os
import numpy as np
import pandas as pd
import random
from torch.utils.tensorboard import SummaryWriter

import tqdm

In [2]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Bare expression so the notebook displays the selected device.
device

device(type='cpu')

In [21]:
from transformers import AutoModel, AutoTokenizer

# The four MBTI axes, in the order used for labels and model outputs.
# len(MBTI) sizes the classifier head, and convert_mbti_to_label encodes the
# 'ISTJ' letter of each position as 0 and any other letter as 1.
MBTI = [
    'IE',
    'SN',
    'TF',
    'JP',
]

class Model(nn.Module):
    """Pretrained text encoder followed by a small MLP head with one sigmoid output per MBTI axis.

    The head consumes the encoder's first-token embedding (768 values)
    concatenated with 2 + 3 extra features supplied by the caller as `attach`.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.pre_model = AutoModel.from_pretrained("klue/roberta-base")
        self.post_model = nn.Sequential(
            # 768 encoder features + 2 + 3 attached features.
            nn.Linear(768+2+3, 1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, len(MBTI)),
            nn.Sigmoid(),
        )

    def forward(self, x, attach):
        """Encode the tokens and classify them together with the attached features.

        Args:
            x: mapping of keyword arguments forwarded to the encoder
                (e.g. input_ids, attention_mask).
            attach: tensor of extra per-sample features, concatenated to the
                first-token embedding along dim 1.

        Returns:
            (features, output): the concatenated feature tensor fed to the head
            and the head's sigmoid output with len(MBTI) values per sample.
        """
        # Only last_hidden_state is used, so the per-layer hidden states are
        # not requested from the encoder.
        encoded = self.pre_model(**x)
        # Embedding of the first token of every sequence.
        cls_embed = encoded.last_hidden_state[:, 0, :]
        # Cast explicitly so integer one-hot features do not rely on implicit
        # dtype promotion inside torch.cat.
        features = torch.cat((cls_embed, attach.to(cls_embed.dtype)), dim=1)
        output = self.post_model(features)
        return features, output

# Build the classifier on the selected device and load the tokenizer that
# belongs to the same "klue/roberta-base" checkpoint used inside Model.
model = Model().to(device)
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
# Bare expression: the notebook displays the module structure as cell output.
model

Some weights of the model checkpoint at klue/roberta-base were not used when initializing RobertaModel: ['lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for

Model(
  (pre_model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(32000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((

In [5]:
df = pd.read_csv('../data/hackathon_train.csv', encoding='cp949', index_col=0)

# Number of leading rows per user that go to the training split; the
# remaining rows of that user form the test split.
TRAIN_ROWS_PER_USER = 40

# split train and test dataframe
train_df_list = []
test_df_list = []
# One groupby pass replaces two full boolean-mask scans of df per user.
# sort=False keeps users in order of first appearance, matching the order
# df['User_ID'].unique() would yield.
for _, user_df in df.groupby('User_ID', sort=False):
    train_df_list.append(user_df.iloc[:TRAIN_ROWS_PER_USER])
    test_df_list.append(user_df.iloc[TRAIN_ROWS_PER_USER:])

# ignore_index=True gives both frames a fresh 0..n-1 index.
train_df = pd.concat(train_df_list, ignore_index=True)
test_df = pd.concat(test_df_list, ignore_index=True)

In [6]:
# One-hot encodings for the categorical fields.

# Gender code -> 2-way one-hot.
gender_map = {
    0: [1, 0],
    1: [0, 1],
}

# Age value (20 / 30 / 40) -> 3-way one-hot.
age_map = {
    20: [1, 0, 0],
    30: [0, 1, 0],
    40: [0, 0, 1],
}

# Answer tag -> 3-way one-hot. Several spellings map to the same class;
# presumably they mirror variants present in the raw data (TODO confirm).
answer_map = {
    # class [1, 0, 0]
    '<그렇다>': [1, 0, 0],
    '<그렇자>': [1, 0, 0],
    # class [0, 1, 0]
    '<중립>': [0, 1, 0],
    '<중랍>': [0, 1, 0],
    '<중간>': [0, 1, 0],
    '<보통>': [0, 1, 0],
    # class [0, 0, 1]
    '<어렵다>': [0, 0, 1],
    '<아니다>': [0, 0, 1],
    ',<아니다>': [0, 0, 1],
    '<아니요>': [0, 0, 1],
    '<아니오>': [0, 0, 1],
}

In [7]:
def set_random(SEED=0):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with the same value.

    Args:
        SEED: integer seed passed unchanged to every generator.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for apply_seed in seeders:
        apply_seed(SEED)

class MyDataset(Dataset):
    """Dataset pairing tokenized text with MBTI labels and one-hot demographic features.

    Args:
        data: mapping of field name -> indexable batch (must contain
            'input_ids'); item `idx` of every field forms one sample.
        label: per-sample label sequences (a pandas Series or any sequence).
        df: frame with `Gender` and `Age` columns, row-aligned with `data`.
    """

    def __init__(self, data, label, df):
        self.data = data
        self.label = label
        self.df = df
        # Read the columns positionally: label-based `df.Gender[i]` only works
        # when the frame happens to carry a 0..n-1 index.
        genders = self.df['Gender'].tolist()
        ages = self.df['Age'].tolist()
        # Per sample: gender one-hot followed by age one-hot.
        self.attach = torch.tensor(
            [gender_map[genders[i]] + age_map[ages[i]] for i in range(len(self))]
        )

    def __len__(self):
        """Number of samples, taken from the length of data['input_ids']."""
        return len(self.data['input_ids'])

    def __getitem__(self, idx):
        """Return (fields dict, label tensor truncated to len(MBTI), attached features) for sample idx."""
        data = {k: v[idx] for k, v in self.data.items()}
        # Positional lookup for pandas objects; plain indexing for lists/arrays.
        if hasattr(self.label, 'iloc'):
            label = self.label.iloc[idx]
        else:
            label = self.label[idx]
        return data, torch.tensor(label)[:len(MBTI)], self.attach[idx]

def convert_mbti_to_label(mbti):
    """Turn a 4-letter MBTI string into a list of four 0/1 flags.

    Each position is compared with the reference type 'ISTJ': a matching
    letter yields 0, any other character yields 1. Only the first four
    characters of `mbti` are inspected.
    """
    reference = 'ISTJ'  # encodes as [0, 0, 0, 0]
    return [0 if reference[pos] == mbti[pos] else 1
            for pos in range(len(reference))]

In [19]:
def fine_train(model, dl, optimizer, criterion):
    """Train `model` for one pass over `dl`.

    Args:
        model: module called as model(data, attach) and returning (_, output).
        dl: iterable of (data dict, label tensor, attach tensor) batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as criterion(output, target).

    Returns:
        (loss, acc): mean per-batch loss as a float, and a CPU tensor with the
        per-axis accuracy averaged over batches (every batch weighs the same,
        regardless of its size).
    """
    model.train()
    loss_all, acc_all = 0, torch.zeros(len(MBTI))

    for data, y, attach in tqdm.tqdm(dl):
        data = {k: v.to(device) for k, v in data.items()}
        y, attach = y.to(device), attach.to(device)
        _, output = model(data, attach)
        # Labels arrive as integers; cast to the output dtype so the loss and
        # its backward pass see matching floating-point tensors.
        loss = criterion(output, y.to(output.dtype))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Threshold the sigmoid outputs at 0.5 and score each axis separately.
        predict = (output > 0.5)
        acc = (predict == y).sum(0) / len(y)

        loss_all += loss.item()
        # acc_all lives on the CPU; move the batch result there before adding
        # so the accumulation also works when `device` is a GPU.
        acc_all += acc.detach().cpu()

    loss = loss_all / len(dl)
    acc = acc_all / len(dl)

    return loss, acc

def fine_valid(model, dl, optimizer=None, criterion=None):
    """Evaluate `model` on `dl` without updating any weights.

    Args:
        model: module called as model(data, attach) and returning (_, output).
        dl: iterable of (data dict, label tensor, attach tensor) batches.
        optimizer: unused; kept so the signature mirrors fine_train.
        criterion: optional loss called as criterion(output, target). When it
            is None no loss is computed and NaN is returned for the loss.

    Returns:
        (loss, acc): mean per-batch loss as a float (NaN without a criterion),
        and a CPU tensor with the per-axis accuracy averaged over batches.
    """
    model.eval()
    loss_all, acc_all = 0, torch.zeros(len(MBTI))

    # No gradients are needed for evaluation; skipping graph construction
    # saves memory and time.
    with torch.no_grad():
        for data, y, attach in tqdm.tqdm(dl):
            data = {k: v.to(device) for k, v in data.items()}
            y, attach = y.to(device), attach.to(device)
            _, output = model(data, attach)

            if criterion is not None:
                # Integer labels are cast to the output dtype for the loss.
                loss_all += criterion(output, y.to(output.dtype)).item()

            # Threshold the sigmoid outputs at 0.5 and score each axis separately.
            predict = (output > 0.5)
            acc = (predict == y).sum(0) / len(y)

            # acc_all lives on the CPU; move the batch result there first.
            acc_all += acc.cpu()

    loss = loss_all / len(dl) if criterion is not None else float('nan')
    acc = acc_all / len(dl)
    return loss, acc


In [9]:
# Load the pre-tokenized train/test inputs from disk. Later cells read
# `.input_ids` and `.items()` from these objects.
# NOTE(review): torch.load relies on pickle by default — only load files that
# come from a trusted source.
train_tensor = torch.load('roberta/train_tokens.pt')
test_tensor = torch.load('roberta/test_tokens.pt')

In [13]:
# Sanity check: decode the first training example back to text. Special
# tokens ([CLS], [SEP], [PAD]) are included in the decoded string.
tokenizer.decode(train_tensor.input_ids[0])

'[CLS] 주기적으로 새로운 친구를 만들지 않아요. 어릴 때 왕따 당한 경험이 있고 외부 활동을 좋아하지 않기 때문에 소수의 친구와만 지냅니다. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] 

In [14]:
# Encode each MBTI string as a list of four 0/1 flags.
train_label = train_df['MBTI'].map(convert_mbti_to_label)
test_label = test_df['MBTI'].map(convert_mbti_to_label)
# Pair tokens, labels and the Gender/Age columns of the matching frame.
train_ds = MyDataset(train_tensor, train_label, train_df)
test_ds = MyDataset(test_tensor, test_label, test_df)
# Shuffle only the training batches; keep the test order fixed.
train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
test_dl = DataLoader(test_ds, batch_size=32, shuffle=False)

In [15]:
# Peek at a single batch: print the first (data, label, attach) item yielded
# by the training loader, then stop iterating.
for x in train_dl:
    print(x)
    break

[{'input_ids': tensor([[    0,  9492,  2116,  ...,     1,     1,     1],
        [    0,  7267, 11187,  ...,     1,     1,     1],
        [    0,  1535,  2522,  ...,     1,     1,     1],
        ...,
        [    0,  1535, 22883,  ...,     1,     1,     1],
        [    0,  3656,  3611,  ...,     1,     1,     1],
        [    0,   723,  2069,  ...,     1,     1,     1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}, tensor([[1, 1, 0, 1],
        [1, 1, 0, 1],
        [0, 0, 1, 1],
        [1, 0, 1, 1],
        [0, 0, 1, 0],
        [0, 0, 0, 0],
        [0, 0, 

In [17]:
def fine_tune(folder='ckpt'):
    """Fine-tune the global `model` for 3 epochs and log metrics to the global `writer`.

    Uses the module-level `model`, `train_dl`, `test_dl` and `writer`.

    Args:
        folder: accepted for compatibility with existing callers; no
            checkpoint is written by this function.

    Returns:
        (train_final, val_final): one [loss, acc] entry per epoch for the
        training and the test loader respectively.
    """
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

    train_final = []
    val_final = []
    for epoch in range(3):
        print(f'Epoch {epoch}')
        train_loss, train_acc = fine_train(model, train_dl, optimizer, criterion)
        val_loss, val_acc = fine_valid(model, test_dl, criterion=criterion)

        # Log every axis once per epoch. The previous outer loop over the axes
        # re-emitted the same scalars len(MBTI) times.
        writer.add_scalars('Acc/Train', {axis: float(train_acc[i]) for i, axis in enumerate(MBTI)}, epoch)
        writer.add_scalars('Acc/Test', {axis: float(val_acc[i]) for i, axis in enumerate(MBTI)}, epoch)
        writer.add_scalar('Loss/Train', train_loss, epoch)
        writer.add_scalar('Loss/Test', val_loss, epoch)

        train_final.append([train_loss, train_acc])
        val_final.append([val_loss, val_acc])

    return train_final, val_final

In [22]:
test_number = 37
# Skip the run when the log directory for this test number already exists.
if os.path.isdir(f'./tensorboard/test{test_number}'):
    print('Tensorboard folder already occupied.')
else:
    writer = SummaryWriter(f'./tensorboard/test{test_number}/')
    try:
        set_random(422)
        result = fine_tune(f'ckpt/ckpt{test_number}')
        print(result)
    finally:
        # Flush and close the event file even when training fails or is
        # interrupted, so the scalars logged so far are not lost.
        writer.close()

Epoch 0


  0%|          | 0/300 [00:12<?, ?it/s]


KeyboardInterrupt: 

In [None]:
# Create the output directory first so the save cannot fail on a missing
# folder after a long training run.
os.makedirs('tuned', exist_ok=True)
# NOTE(review): this pickles the whole module object (not just a state_dict),
# so loading it later requires the Model class to be importable.
torch.save(model, 'tuned/kobert_full3.pt')