In [39]:
import torch
from sklearn.metrics import roc_auc_score
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

import os
import numpy as np
import pandas as pd
import random
from torch.utils.tensorboard import SummaryWriter

import tqdm

In [104]:
final = True

In [105]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cpu')

In [106]:
MBTI = ['IE', 'SN', 'TF', 'JP']

In [107]:
df = pd.read_csv('../data/hackathon_train.csv', encoding='cp949', index_col=0)
df.index = range(len(df))
df.tail()

Unnamed: 0,User_ID,Gender,Age,MBTI,Q_number,Answer
11515,240,0,40,ISTJ,44,<그렇다> 저는 계획에 차질이 생기면 돌아가기 위해 노력을 합니다. 이유는 그 계획...
11516,240,0,40,ISTJ,45,<그렇다> 저는 예전의 실수를 후회할 때가 많습니다. 이유는 그만큼 나태하게 산 적...
11517,240,0,40,ISTJ,46,<아니다> 저는 인간의 존재와 삶의 이유에 대해 깊이 생각하지 않습니다. 이유는 이...
11518,240,0,40,ISTJ,47,<아니다> 저는 감정에 휘둘리는 편이 아닙니다. 이유는 감정을 감추고 밖으로 표현하...
11519,240,0,40,ISTJ,48,<아니다> 저는 상대방 잘못일 때 상대방의 체면을 살려주기 위해 노력하지 않습니다....


In [108]:
df2 = pd.read_excel('../data2/train_data.xlsx', index_col=0)
df2.index = range(len(df2))
df2.tail()

Unnamed: 0,User_ID,Gender,Age,MBTI,Q_number,Short_Answer,Long_Answer
7195,240,female,30,ISTP,56,그렇다,거래처에 가격 조정 때문에 3군데를 가야 하는 상황이었는데 이야기 잘 통하는 곳 2...
7196,240,female,30,ISTP,57,아니다,상대방과 논쟁을 불러드릴 주제에는 관심이 없습니다 괜히 싸움을 일으키기 싫습니다
7197,240,female,30,ISTP,58,아니다,나에게 온 기회를 포기할 수 없다 양보를 하게 되면 나에게 기회는 없어지니깐
7198,240,female,30,ISTP,59,아니다,마감 기한이 정해지면 그 일을 끝날 때까지 늦게까지 일을 하고 퇴근하곤 합니다
7199,240,female,30,ISTP,60,그렇다,일을 할 때는 항상 진행될 때 이외의 상황을 생각하여 대처할 준비를 한다


In [109]:
answer_map = {'그렇다':0, '그렇자':0,
              '중립':1, '중랍':1, '중간':1, '보통':1, '중립/모르겠다': 1,
              '어렵다':2, '아니다':2, '<아니다':2, '아니요':2, '아니오':2}

In [110]:
#question_result = torch.load('question_embed.pt')
#question_list = question_result[1].tolist()
gender_map = {0:0, 1:3}
age_map = {20:0, 30:1, 40:2}
def attach(result, df, filenum):
    result = result.tolist()
    for i in range(len(result)):
        extend_l = [0 for i in range(6)]
        if filenum == 1:
            extend_l[gender_map[df.Gender[i]]+age_map[df.Age[i]]] = 1
        result[i].extend(extend_l)
    return torch.tensor(result)

In [111]:
def split(tensor):
    vector_len = tensor.shape[1]
    tensor = torch.reshape(tensor, (240, 48, vector_len))
    train = torch.reshape(tensor[:,:40,:], (240*40, vector_len))
    test = torch.reshape(tensor[:,40:,:], (240*8, vector_len))
    return train, test

In [112]:
result = torch.load('roberta/tuned_embed.pt')[1][:11520,:]
result_real = attach(result, df, 1).float()
if not final:
    train_result, test_result = split(result_real)
#result1, result2 = result[:11520], result[11520:]
#result_real = torch.cat((attach(result1, df, 1), attach(result2, df2, 2))).float()

In [113]:
def set_random(SEED=0):
    torch.manual_seed(SEED)
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)
    random.seed(SEED)

class MyDataset(Dataset):
    def __init__(self, data, label):
        self.data = data
        self.label = label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]
    
def convert_mbti_to_label(mbti):
    stand = 'ISTJ'  # [0, 0, 0, 0]
    label = []
    for i in range(len(stand)):
        if stand[i] == mbti[i]:
            label.append(0.0)
        else:
            label.append(1.0)
    return label

In [114]:
def run(model, dl, optimizer, criterion, train=True):
    model = model.to(device)
    if train:
        model.train()
    else:
        model.eval()
    predict_list = []
    answer_list = []
    loss_all = 0

    for x, y in dl:
        x, y = x.to(device), y.to(device)
        output = model(x)
        loss = criterion(output, y)
        if train:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        predict_list.append(output)
        answer_list.append(y)
        loss_all += loss.item()

    predict = torch.cat(predict_list)
    answer = torch.cat(answer_list)
    auc = [roc_auc_score(answer[:,i].tolist(), predict[:,i].tolist()) for i in range(len(answer[0]))]
    loss = loss_all / len(dl)

    return loss, torch.tensor(auc)

In [115]:
#label = list(map(convert_mbti_to_label, df['MBTI'].tolist()+df2['MBTI'].tolist()))
label = torch.tensor(list(map(convert_mbti_to_label, df['MBTI'])))
ds = MyDataset(result_real, label)
dl = DataLoader(ds, batch_size=32, shuffle=True)
if not final:
    train_label, test_label = split(label)
    train_ds = MyDataset(train_result, train_label)
    test_ds = MyDataset(test_result, test_label)
    train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
    test_dl = DataLoader(test_ds, batch_size=32, shuffle=False)

In [116]:
result_real[0], result_real[41]

(tensor([-0.3165, -0.0164, -0.4889,  ...,  0.0000,  1.0000,  0.0000]),
 tensor([-0.3467, -0.0257, -0.4330,  ...,  0.0000,  1.0000,  0.0000]))

In [117]:
train_result[0], test_result[1]

(tensor([-0.3165, -0.0164, -0.4889,  ...,  0.0000,  1.0000,  0.0000]),
 tensor([-0.3467, -0.0257, -0.4330,  ...,  0.0000,  1.0000,  0.0000]))

In [118]:
for x, y in train_dl:
    print(x, y)
    break

tensor([[-0.3224, -0.0817, -0.4712,  ...,  1.0000,  0.0000,  0.0000],
        [-0.2800, -0.0304, -0.4509,  ...,  0.0000,  0.0000,  0.0000],
        [-0.4031, -0.1483, -0.5109,  ...,  0.0000,  1.0000,  0.0000],
        ...,
        [-0.3459, -0.0739, -0.3754,  ...,  0.0000,  0.0000,  1.0000],
        [-0.3661, -0.2026, -0.5535,  ...,  0.0000,  1.0000,  0.0000],
        [-0.3497, -0.2075, -0.5054,  ...,  0.0000,  0.0000,  1.0000]]) tensor([[1., 1., 1., 1.],
        [0., 1., 0., 1.],
        [1., 1., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 1., 1.],
        [0., 0., 1., 1.],
        [0., 1., 0., 1.],
        [1., 1., 0., 0.],
        [1., 1., 1., 0.],
        [1., 1., 1., 0.],
        [0., 0., 1., 0.],
        [1., 0., 0., 0.],
        [0., 1., 0., 1.],
        [0., 0., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [0., 0., 0., 0.],
        [1., 1., 0., 0.],
        [0., 0., 1., 1.],
        [0., 1., 0., 0.],
        [0., 0., 0., 1.],
        [0., 0., 0.,

In [119]:
def main(folder='ckpt'):
    model = nn.Sequential(
        nn.Linear(1024+6, 512),
        nn.ReLU(),
        nn.Dropout(0.1),
        nn.Linear(512, 4),
        nn.Sigmoid(),
    )
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

    train_final = []
    val_final = []
    save_dir = f'./{folder}'
    for epoch in tqdm.tqdm(range(500)):
        if final:
            run(model, dl, optimizer, criterion)
            if epoch%50 == 50-1:
                os.makedirs(save_dir, exist_ok=True)
                torch.save(model, f"{save_dir}/epoch_{epoch}.pt")
        else:
            train_loss, train_auc = run(model, train_dl, optimizer, criterion)
            val_loss, val_auc = run(model, test_dl, optimizer, criterion, train=False)
            #writer.add_scalars('AUC/Train', {MBTI[i]:train_auc[i] for i in range(len(MBTI))}, epoch)
            #writer.add_scalars('AUC/Test', {MBTI[i]:val_auc[i] for i in range(len(MBTI))}, epoch)
            writer.add_scalar('AUC/Train', torch.mean(train_auc), epoch)
            writer.add_scalar('AUC/Test', torch.mean(val_auc), epoch)
            writer.add_scalar('Loss/Train', train_loss, epoch)
            writer.add_scalar('Loss/Test', val_loss, epoch)
            train_final.append([train_loss, train_auc])
            val_final.append([val_loss, val_auc])
            if epoch%50 == 50-1:
                os.makedirs(save_dir, exist_ok=True)
                torch.save(model, f"{save_dir}/epoch_{epoch}.pt")

    return train_final, val_final

In [120]:
# Train all
test_number = 85
if os.path.isdir(f'./tensorboard/test{test_number}'):
    print('Tensorboard folder already occupied.')
else:
    writer = SummaryWriter(f'./tensorboard/test{test_number}/')
    set_random(422)
    result = main(f'ckpt/ckpt{test_number}')

 65%|██████▍   | 323/500 [18:08<09:56,  3.37s/it]


KeyboardInterrupt: 