In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m73.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
# Mount Google Drive inside the Colab VM at /content/drive; the notebook's
# working directory (set in the next cell) lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd "/content/drive/MyDrive/kaggle/QIA2023"

/content/drive/MyDrive/kaggle/QIA2023


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset


from transformers import *
import os
import sys
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
import numpy as np
import re
import pickle
import time
import pandas as pd
from pathlib import Path
import random
from torch.utils.tensorboard import SummaryWriter

Xformers is not installed correctly. If you want to use memorry_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [None]:
# Load the raw training table; index_col=0 uses the CSV's first column as the
# row index instead of creating a fresh 0..n-1 index.
df = pd.read_csv('./data/train_data.csv', index_col=0)

In [None]:
# Encode Gender as an integer flag: female -> 1, male -> 0.
# The guard makes this cell safe to re-run: mapping the already-encoded 0/1
# values through the string-keyed dict again would turn the column into NaN.
GENDER_CODES = {'female': 1, 'male': 0}
if df['Gender'].isin(list(GENDER_CODES)).any():
    df['Gender'] = df['Gender'].map(GENDER_CODES)

df

Unnamed: 0_level_0,User_ID,Gender,Age,MBTI,Q_number,Short_Answer,Long_Answer
Data_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
481,9,1,30,INTJ,1,아니다,새로운 사람을 만나서 이야기를 나누고 마음이 통하는 친구가 되기까지의 과정은 많은 ...
482,9,1,30,INTJ,2,그렇다,자유시간이 생기면 평소에 관심을 두고 있던 분야에 대해 공부하고 탐구하는 시간을 주...
483,9,1,30,INTJ,3,그렇다,다른 사람이 울고 있는 모습만 볼 때는 왜 울고 있는지 이유를 몰라 공감 되지 않는...
484,9,1,30,INTJ,4,그렇다,모든 일이 마음먹은 대로 진행되는 것은 아니기에 미리 여러 가지 대책을 세워둬야 계...
485,9,1,30,INTJ,5,그렇다,퇴사 면담을 하면서 대외적인 이유가 아닌 진짜 퇴사의 이유를 솔직하게 말해 달라는 ...
...,...,...,...,...,...,...,...
14396,240,1,30,ISTP,56,그렇다,거래처에 가격 조정 때문에 3군데를 가야 하는 상황이었는데 이야기 잘 통하는 곳 2...
14397,240,1,30,ISTP,57,아니다,상대방과 논쟁을 불러드릴 주제에는 관심이 없습니다 괜히 싸움을 일으키기 싫습니다
14398,240,1,30,ISTP,58,아니다,나에게 온 기회를 포기할 수 없다 양보를 하게 되면 나에게 기회는 없어지니깐
14399,240,1,30,ISTP,59,아니다,마감 기한이 정해지면 그 일을 끝날 때까지 늦게까지 일을 하고 퇴근하곤 합니다


In [None]:
# Split train and test dataframes per user: each user's first
# TRAIN_ROWS_PER_USER rows (in file order) go to train, the remainder to test.
TRAIN_ROWS_PER_USER = 50

train_df_list = []
test_df_list = []
# One grouped pass instead of filtering the whole frame twice per user.
# sort=False yields users in order of first appearance (the same order as
# df['User_ID'].unique()), so the resulting row order is unchanged.
# NOTE(review): row order presumably has to match the precomputed embeddings
# loaded below by position - confirm before changing the split.
for _, user_rows in df.groupby('User_ID', sort=False):
    train_df_list.append(user_rows.iloc[:TRAIN_ROWS_PER_USER])
    test_df_list.append(user_rows.iloc[TRAIN_ROWS_PER_USER:])

train_df = pd.concat(train_df_list, ignore_index=True)
test_df = pd.concat(test_df_list, ignore_index=True)

In [None]:
# Load the precomputed embeddings from the working directory.
# main() uses element [0] of each object as the classifier input; presumably
# these are the (pooled, hidden) pairs produced by forward() - TODO confirm.
# NOTE(review): torch.load unpickles the file contents, so only load .pt files
# from a trusted source.
train_result = torch.load('train_embed_only_p2.pt')
test_result = torch.load('test_embed_only_p2.pt')

In [None]:
def set_random(SEED=0):
    """
    Apply one seed to every random number generator used in this notebook.

    Calls, in order: torch.manual_seed, torch.cuda.manual_seed,
    torch.cuda.manual_seed_all, np.random.seed and random.seed.

    :param SEED: seed value handed to each of the seeding functions.
    :return: None
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(SEED)

class MyDataset(Dataset):
    """
    Dataset pairing each input item with one component of its label sequence.

    :param data: indexable collection of inputs; its length is the dataset size.
    :param label: indexable collection holding one label sequence per input.
    :param label_idx: position inside each label sequence used as the target.
    """

    def __init__(self, data, label, label_idx=0):
        self.data, self.label, self.label_idx = data, label, label_idx

    def __len__(self):
        # Dataset size follows the inputs, not the labels.
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        target = self.label[idx][self.label_idx]
        # The selected label component is wrapped in a tensor; the input is
        # returned exactly as stored.
        return sample, torch.tensor(target)
    
def convert_mbti_to_label(mbti: str):
    """
    Encode an MBTI string as four 0/1 flags relative to the reference 'ISTJ'.

    :param mbti: string. length=4 (only the first four characters are read)
    :return: list of four ints; entry i is 0 when mbti[i] equals the i-th
        letter of 'ISTJ' and 1 otherwise, e.g. 'ISTJ' -> [0, 0, 0, 0]
    """
    reference = 'ISTJ'  # encodes to [0, 0, 0, 0]
    return [0 if reference[pos] == mbti[pos] else 1 for pos in range(4)]

# def convert_label_to_mbti(num, label_idx):
#     stand = 'ISTJ'
#     mbti = stand[label_idx]
    

In [None]:
def train(model, dl, optimizer, criterion, device=0):
    """
    Run one training epoch over ``dl`` and report its loss and accuracy.

    :param model: module mapping a batch ``x`` to per-class scores; the argmax
        over dim 1 is taken as the prediction.
    :param dl: iterable of ``(x, y)`` batches that also supports ``len()``.
    :param optimizer: optimizer that updates ``model``'s parameters.
    :param criterion: callable ``criterion(output, y)`` returning a scalar loss.
    :param device: CUDA device handed to ``.cuda()``. When CUDA is not
        available the model and batches are left where they are (CPU).
    :return: ``(loss, acc)`` - the mean of the per-batch losses, and the
        fraction of all samples seen this epoch that were classified correctly.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda(device)
    model.train()

    loss_all = 0
    n_correct, n_seen = 0, 0
    for x, y in dl:
        if use_cuda:
            x, y = x.cuda(device), y.cuda(device)
        output = model(x)
        loss = criterion(output, y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_all += loss.item()
        # Count hits and samples instead of averaging per-batch accuracies, so
        # a smaller final batch does not get the same weight as a full one.
        n_correct += (output.argmax(dim=1) == y).sum().item()
        n_seen += len(y)

    loss = loss_all / len(dl)
    acc = n_correct / n_seen

    return loss, acc

def valid(model, dl, optimizer=None, criterion=None, device=0):
    """
    Evaluate ``model`` on ``dl`` without updating it.

    :param model: module mapping a batch ``x`` to per-class scores; the argmax
        over dim 1 is taken as the prediction.
    :param dl: iterable of ``(x, y)`` batches that also supports ``len()``.
    :param optimizer: unused; kept so the signature mirrors ``train``.
    :param criterion: callable ``criterion(output, y)`` returning a scalar
        loss. Must be supplied despite the ``None`` default.
    :param device: CUDA device handed to ``.cuda()``. When CUDA is not
        available the model and batches are left where they are (CPU).
    :return: ``(loss, acc)`` - the mean of the per-batch losses, and the
        fraction of all evaluated samples that were classified correctly.
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model = model.cuda(device)
    model.eval()

    loss_all = 0
    n_correct, n_seen = 0, 0
    # Evaluation needs no gradients; skipping autograd bookkeeping saves
    # memory and time.
    with torch.no_grad():
        for x, y in dl:
            if use_cuda:
                x, y = x.cuda(device), y.cuda(device)
            output = model(x)
            loss = criterion(output, y)

            loss_all += loss.item()
            # Count hits and samples instead of averaging per-batch
            # accuracies, so a smaller final batch is not over-weighted.
            n_correct += (output.argmax(dim=1) == y).sum().item()
            n_seen += len(y)

    loss = loss_all / len(dl)
    acc = n_correct / n_seen

    # Disabled draft kept from the original: per-user accuracy by majority
    # vote over each user's predictions. To revive it, collect
    # `output.argmax(dim=1).cpu()` per batch into `output_list` in the loop.
#     # userid accuracy
#     result = 0
#     a = torch.cat(output_list)
#     for uid in test_df['User_ID'].unique():
#         idx = test_df[test_df['User_ID']==uid].index
#         if a[idx].count_nonzero().item() > len(a[idx])//2:
#             label = 1
#         else:
#             label = 0
#
#         result += convert_mbti_to_label(test_df[test_df['User_ID']==uid]['MBTI'].unique()[0])[label_idx] == label

    return loss, acc


In [None]:
def forward(model, dl, device=0):
    """
    Run ``model`` over every batch of ``dl`` in eval mode, without gradients,
    and collect two embeddings per input row.

    :param model: model called as ``model(**batch, output_hidden_states=True)``
        whose output exposes ``pooler_output`` and ``hidden_states``.
    :param dl: iterable of dict batches mapping argument names to tensors.
    :param device: CUDA device handed to ``.cuda()``. When CUDA is not
        available the model and batches are left where they are (CPU).
    :return: ``(pooled, hidden)``, each concatenated over batches along dim 0.
        ``pooled`` holds the ``pooler_output`` values; ``hidden`` holds
        position 0 of the last ``hidden_states`` entry (the [CLS] token,
        per the original author's note).
    """
    use_cuda = torch.cuda.is_available()
    if use_cuda:
        model.cuda(device)
    model.eval()

    pooled = []
    hidden = []
    with torch.no_grad():
        for data in dl:
            if use_cuda:
                data = {k: v.cuda(device) for k, v in data.items()}
            output = model(**data, output_hidden_states=True)
            pooled.append(output.pooler_output)  # pooler output
            hidden.append(output.hidden_states[-1][:, 0, :])  # only position 0 of the last layer
    return torch.cat(pooled), torch.cat(hidden)

#merging age&gender

In [None]:
def main(label_idx=0, device=0, name='test', summary_writer=None):
    """
    Train and evaluate a small two-layer classifier for one MBTI axis on the
    precomputed embeddings, logging to TensorBoard and checkpointing each epoch.

    Reads the notebook-level ``train_result``, ``test_result``, ``train_df``
    and ``test_df``.

    :param label_idx: index into the 4-element label produced by
        ``convert_mbti_to_label``; selects which axis is the target.
    :param device: CUDA device forwarded to ``train`` / ``valid``.
    :param name: sub-folder of ``./ckpt_only_p2`` that receives the checkpoints.
    :param summary_writer: TensorBoard writer to log to. When omitted, the
        notebook-level ``writer`` is used, as existing callers expect.
    :return: ``(train_final, val_final)`` - one ``[loss, acc]`` pair per epoch
        for the training set and the held-out set respectively.
    """
    # Resolve the writer up front so a missing global fails before any
    # training work is done.
    tb_writer = writer if summary_writer is None else summary_writer

    model = nn.Sequential(nn.Linear(768, 50),   # 768 input features per embedding
                          nn.ReLU(),
                          nn.Linear(50, 2))

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

    # dataset / dataloader
    train_data = train_result[0]  # pooler output (per the original note)
    train_label = train_df['MBTI'].map(convert_mbti_to_label)

    test_data = test_result[0]
    test_label = test_df['MBTI'].map(convert_mbti_to_label)

    train_ds = MyDataset(train_data, train_label, label_idx)
    test_ds = MyDataset(test_data, test_label, label_idx)

    train_dl = DataLoader(train_ds, batch_size=1024, shuffle=True)
    test_dl = DataLoader(test_ds, batch_size=1024, shuffle=False)

    # The checkpoint folder does not change between epochs: create it once.
    save_dir = f'./ckpt_only_p2/{name}'
    os.makedirs(save_dir, exist_ok=True)

    train_final = []
    val_final = []
    try:
        # 300 epochs; the epoch counter advances in steps of 2, so TensorBoard
        # steps and checkpoint names run 0, 2, ..., 598.
        for epoch in range(0, 600, 2):
            train_loss, train_acc = train(model, train_dl, optimizer, criterion, device=device)
            # validation
            val_loss, val_acc = valid(model, test_dl, criterion=criterion, device=device)

            tb_writer.add_scalar('Loss/Train', train_loss, epoch)
            tb_writer.add_scalar('Acc/Train', train_acc, epoch)
            tb_writer.add_scalar('Loss/Test', val_loss, epoch)
            tb_writer.add_scalar('Acc/Test', val_acc, epoch)
            train_final.append([train_loss, train_acc])
            val_final.append([val_loss, val_acc])

            # The whole module object is saved (not just a state_dict).
            torch.save(model, f"{save_dir}/epoch_{epoch}.pt")
    finally:
        # Close the writer even if an epoch raises, so buffered events are
        # flushed and the file handle is released.
        tb_writer.close()
    return train_final, val_final

In [None]:
# Train all: one classifier per MBTI axis, in the label order produced by
# convert_mbti_to_label.
MBTI = ['IE', 'SN', 'TF', 'JP']
set_random(422)
# axis name -> (train_final, val_final) returned by main(). Collected so the
# metrics of every axis survive the loop, not only those of the last one.
results = {}
for i in range(4):
    # main() logs to (and closes) the notebook-level `writer`.
    writer = SummaryWriter(f'./tensorboard/test13_only_p2/{MBTI[i]}/')
    result = main(i, 0, MBTI[i])
    results[MBTI[i]] = result