## 준비

In [None]:
# Mount Google Drive so files under /content/drive are available to this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install mxnet
!pip install gluonnlp
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting mxnet
  Downloading mxnet-1.9.1-py3-none-manylinux2014_x86_64.whl (49.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.1/49.1 MB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)
  Downloading graphviz-0.8.4-py2.py3-none-any.whl (16 kB)
Installing collected packages: graphviz, mxnet
  Attempting uninstall: graphviz
    Found existing installation: graphviz 0.20.1
    Uninstalling graphviz-0.20.1:
      Successfully uninstalled graphviz-0.20.1
Successfully installed graphviz-0.8.4 mxnet-1.9.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gluonnlp
  Downloading gluonnlp-0.10.0.tar.gz (344 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m344.5/344.5 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... 

In [None]:
import pandas as pd
import numpy as np
import os
import gc

import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# Set the seed for every random number generator the notebook relies on:
# Python's `random`, NumPy (used by scikit-learn when no random_state is given)
# and PyTorch (torch.manual_seed covers all devices).
SEED = 555
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score

import transformers
from transformers import AdamW

from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")

In [None]:
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification

# If memory problems occur, switch to xlm-roberta-base
MODEL_TYPE = 'xlm-roberta-large'
# Load the pretrained tokenizer that belongs to MODEL_TYPE
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

In [None]:
# Check the XLM-RoBERTa vocabulary size
tokenizer.vocab_size

250002

In [None]:
# Inspect the first ten entries of the XLM-RoBERTa vocabulary
list(tokenizer.get_vocab())[:10]

['<s>', '<pad>', '</s>', '<unk>', ',', '.', '▁', 's', '▁de', '-']

In [None]:
# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

print(device)

cuda:0


## 데이터 불러오기-원본과 prompt 데이터 결합

In [None]:
import pandas as pd

# Folder on the mounted Drive that holds the competition files
path = '/content/drive/MyDrive/jbnu-swuniv-ai'

# Original train / test tables
train_o = pd.read_csv(f"{path}/train_data.csv")
test_o = pd.read_csv(f"{path}/test_data.csv")

# Prompt tables that are merged with the originals below
train_p = pd.read_csv(f"{path}/merged_desc.csv")
test_p = pd.read_csv(f"{path}/merged_test.csv")

In [None]:
# Row counts of the four loaded tables, one per line:
# original train, original test, prompt train, prompt test.
for loaded_frame in (train_o, test_o, train_p, test_p):
    print(len(loaded_frame))

68682
29436
68682
29436


In [None]:
# Preview the first rows of the test prompt table
test_p.head()

Unnamed: 0,File,Prompt
0,0.jpg,a close up of a book cover with a bunch of dif...
1,1.jpg,making color sing practical lessons in color a...
2,10.jpg,a book cover of the book tibet through the red...
3,100.jpg,"a book cover of computer hardware, software, a..."
4,1000.jpg,arafed image of a city with a river and a rive...


In [None]:
# Attach each prompt to its original row by matching the image file name
# ('Filename' in the original tables, 'image' / 'File' in the prompt tables).
train = train_o.merge(train_p, how='inner', left_on='Filename', right_on='image')
test = test_o.merge(test_p, how='inner', left_on='Filename', right_on='File')

In [None]:
# Row counts after the merge (train, then test)
for merged_frame in (train, test):
    print(len(merged_frame))

68682
29436


In [None]:
# Number the labels: every distinct label gets an integer id, assigned in the
# order in which train["label"].unique() returns the labels.
distinct_labels = train["label"].unique()
label_mapping = dict(zip(distinct_labels, range(len(distinct_labels))))
num_labels = len(label_mapping)
train["label_num"] = train["label"].map(label_mapping)

## 부호, 불용어 제거

In [None]:
import re

# Matches every character that is not an ASCII letter, a digit or a space.
_NON_ALNUM_SPACE = re.compile(r'[^A-Za-z0-9 ]')


# Remove punctuation
def alpha_num(Title):
    """Delete every character except ASCII letters, digits and spaces.

    Args:
        Title: Text to clean. ``None`` and NaN (a float that is not equal to
            itself) are treated as empty text instead of raising ``TypeError``;
            any other non-string value is converted with ``str`` first.

    Returns:
        The cleaned string. Removed characters are deleted, not replaced by a
        space, so "Self-Teaching" becomes "SelfTeaching".
    """
    if Title is None or (isinstance(Title, float) and Title != Title):
        return ''
    return _NON_ALNUM_SPACE.sub('', str(Title))

In [None]:
# Strip punctuation from the title and prompt columns of both frames
# (the prompt column is 'prompt' in train and 'Prompt' in test).
for frame, text_columns in ((train, ('Title', 'prompt')), (test, ('Title', 'Prompt'))):
    for column in text_columns:
        frame[column] = frame[column].apply(alpha_num)

In [None]:
# Remove stopwords
def remove_stopwords(Title):
    """Return Title without the words that appear in the module-level `stopwords`.

    The text is split on whitespace, each word is compared in lower case
    against `stopwords`, and the surviving words (original casing kept) are
    joined back together with single spaces.
    """
    kept_words = [
        token.strip()
        for token in Title.split()
        if token.strip().lower() not in stopwords
    ]
    return " ".join(kept_words)

In [None]:
# English stopwords consulted by remove_stopwords, which compares lower-cased words.
# NOTE(review): the prompt pipeline runs alpha_num (which deletes apostrophes) before
# remove_stopwords, so entries such as "he'd" cannot match there - confirm whether
# apostrophe-free forms should be listed as well.
stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", 
             "at", "be", "because", "been", "before", "being", "below", "between", "both", "but", "by", "could", 
             "did", "do", "does", "doing", "down", "during", "each", "few", "for", "from", "further", "had", "has", 
             "have", "having", "he", "he'd", "he'll", "he's", "her", "here", "here's", "hers", "herself", "him", "himself", 
             "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into", "is", "it", "it's", "its", "itself", 
             "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or", "other", "ought", "our", "ours", 
             "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", "so", "some", "such", "than", "that", 
             "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's", "these", "they", "they'd", "they'll", 
             "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up", "very", "was", "we", "we'd", "we'll", 
             "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's", "which", "while", "who", "who's", "whom", 
             "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've", "your", "yours", "yourself", "yourselves" ]

In [None]:
# Remove duplicates
def delete_duplication(text):
  """Drop repeated words from text, keeping the first occurrence of each.

  The text is split on whitespace; the comparison is case-sensitive, so
  "Book" and "book" are both kept.

  Args:
    text: Whitespace-separated words.

  Returns:
    The distinct words in their original order, joined by single spaces.
  """
  # dict keys keep insertion order, so fromkeys de-duplicates in a single pass
  # instead of scanning a growing list for every word.
  return ' '.join(dict.fromkeys(text.split()))

In [None]:
# Only the prompt columns are normalised in this cell; the equivalent steps for
# 'Title' (lower-casing, stopword removal) are disabled.
# Per prompt: lower-case -> strip punctuation -> drop stopwords -> drop repeated words.
for frame, prompt_column in ((train, 'prompt'), (test, 'Prompt')):
    frame[prompt_column] = (
        frame[prompt_column]
        .str.lower()
        .apply(alpha_num)
        .apply(remove_stopwords)
        .apply(delete_duplication)
    )

In [None]:
# Preview the training frame after preprocessing
train.head()

Unnamed: 0,id,Filename,Title,label,image,prompt,label_num
0,0,1101903236.jpg,The Oz Family Kitchen More Than 100 Simple and...,"Cookbooks, Food, Wine",1101903236.jpg,cookbook cover oz family kitchen photo paperba...,0
1,1,0804139857.jpg,Living with Intent My Somewhat Messy Journey t...,Self Help,0804139857.jpg,living intent somewhat new journey purpose pea...,1
2,2,0765334798.jpg,Redshirts A Novel with Three Codas,"Science Fiction, Fantasy",0765334798.jpg,redshirts john scalzi hard science fiction red...,2
3,3,0446310786.jpg,To Kill a Mockingbird,"Mystery, Thriller, Suspense",0446310786.jpg,book cover night mocking bird best design pape...,3
4,4,1143002598.jpg,Canning and Preserving of Food Products with B...,"Cookbooks, Food, Wine",1143002598.jpg,book two keys top map science journal cover ge...,0


In [None]:
# Keep only the columns used from here on, in this order (the merge key 'image'
# is dropped), then show the first rows.
kept_columns = ['id', 'Filename', 'Title', 'prompt', 'label', 'label_num']
train = train[kept_columns]
train.columns = kept_columns
train.head()

Unnamed: 0,id,Filename,Title,prompt,label,label_num
0,0,1101903236.jpg,The Oz Family Kitchen More Than 100 Simple and...,cookbook cover oz family kitchen photo paperba...,"Cookbooks, Food, Wine",0
1,1,0804139857.jpg,Living with Intent My Somewhat Messy Journey t...,living intent somewhat new journey purpose pea...,Self Help,1
2,2,0765334798.jpg,Redshirts A Novel with Three Codas,redshirts john scalzi hard science fiction red...,"Science Fiction, Fantasy",2
3,3,0446310786.jpg,To Kill a Mockingbird,book cover night mocking bird best design pape...,"Mystery, Thriller, Suspense",3
4,4,1143002598.jpg,Canning and Preserving of Food Products with B...,book two keys top map science journal cover ge...,"Cookbooks, Food, Wine",0


In [None]:
# Keep only the columns used from here on and give the prompt column the same
# name as in the training frame ('Prompt' -> 'prompt').
# .copy() makes `test` an independent frame, so the 'label' column that is
# assigned to it after inference is not written to a slice of the merged frame.
test = test[['id', 'Filename', 'Title', 'Prompt']].copy()
test.columns = ['id', 'Filename', 'Title', 'prompt']
test.head()

Unnamed: 0,id,Filename,Title,prompt
0,0,0.jpg,Elementary and Middle School Mathematics Teach...,close book cover bunch different colored objec...
1,1,1.jpg,Making Color Sing 25th Anniversary Edition Pra...,making color sing practical lessons design pap...
2,2,2.jpg,Nursing Fundamentals DeMYSTiFieD A SelfTeachin...,book cover nursing fundamentals demystified me...
3,3,3.jpg,Allen and Greenoughs New Latin Grammar Dover L...,book cover gold background red text latin writ...
4,4,4.jpg,The Encyclopedia of Fantasy,encyclopedia fantasy john cutt grant genre boo...


## 토큰화

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the labelled rows for validation. The fixed random_state makes
# the split identical on every run, so validation scores stay comparable
# between runs and kernel restarts.
train_dataset, val_dataset = train_test_split(train, test_size = 0.1, random_state = 555)
print(len(train_dataset))
print(len(val_dataset))

61813
6869


In [None]:
# Renumber both splits 0..len-1: the split keeps the original row labels, which
# caused errors in the DataLoader.
train_dataset = train_dataset.reset_index(drop=True)
val_dataset = val_dataset.reset_index(drop=True)
val_dataset

Unnamed: 0,id,Filename,Title,prompt,label,label_num
0,54753,1285753801.jpg,ASE Technician Test Preparation Automotive Mai...,delmar automotive maintenance light repair tes...,"Engineering, Transportation",11
1,57208,1330250435.jpg,The Sikh Religion Its Gurus Sacred Writings an...,six religion gurus sacred writings authors vol...,"Religion, Spirituality",6
2,29327,0061144894.jpg,When the Heart Waits Spiritual Direction for L...,book cover heart waits howard butterworth eliz...,"Biographies, Memoirs",22
3,39890,0470574410.jpg,Overcoming Anxiety For Dummies,overcoming anxiety dummies third edition paper...,Self Help,1
4,41065,0312033567.jpg,Anatomy of a Murder,poster original classic comedy film anatomy mu...,"Mystery, Thriller, Suspense",3
...,...,...,...,...,...,...
6864,41644,0764306766.jpg,Third Reich Belt Buckles An Illustrated Handbo...,cover third reich belt buckles wearing ammo be...,"Crafts, Hobbies, Home",20
6865,4102,0983402159.jpg,The Art of Sensual Massage 40th Anniversary Ed...,art sensual massage 1 000 copies sold paradise...,Self Help,1
6866,8113,0813124204.jpg,James Archambeaults Historic Kentucky,book cover james archambeaus historic kentucky...,Travel,19
6867,65351,1853433748.jpg,The Lone Twin Understanding Twin Bereavement a...,close book cover picture two children paperbac...,"Parenting, Relationships",15


In [None]:
# Fixed token length: the Dataset classes pass this as max_length to the
# tokenizer, which truncates/pads every encoded pair to it.
MAX_LEN = 400

In [None]:
# Used for train and validation
class CompDataset(Dataset):
    """Dataset yielding ``(input_ids, attention_mask, label)`` for labelled rows.

    The 'Title' and 'prompt' of a row are encoded together as a sentence pair,
    truncated and padded to a fixed length.

    Args:
        df: DataFrame with 'Title', 'prompt' and 'label_num' columns.
        text_tokenizer: Optional tokenizer exposing ``encode_plus``. When
            omitted, the module-level ``tokenizer`` is looked up on each access.
        max_len: Optional fixed sequence length. When omitted, the module-level
            ``MAX_LEN`` is looked up on each access.
    """

    def __init__(self, df, text_tokenizer=None, max_len=None):
        self.df_data = df
        self._tokenizer = text_tokenizer
        self._max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sample at row position ``index`` (0-based)."""
        active_tokenizer = tokenizer if self._tokenizer is None else self._tokenizer
        seq_len = MAX_LEN if self._max_len is None else self._max_len

        # Positional access, so the frame does not need a 0..n-1 index
        sentence1 = self.df_data['Title'].iloc[index]
        sentence2 = self.df_data['prompt'].iloc[index]

        encoded_dict = active_tokenizer.encode_plus(
            sentence1, sentence2,
            add_special_tokens=True,
            max_length=seq_len,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # The tensors come back with a leading batch axis; keep the single row
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        # Convert the numeric label to a tensor
        target = torch.tensor(self.df_data['label_num'].iloc[index])

        # Bundle input_ids, attention_mask and label into one sample
        return (padded_token_list, att_mask, target)

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df_data)

In [None]:
# Used for test prediction
class TestDataset(Dataset):
    """Dataset yielding ``(input_ids, attention_mask)`` for unlabelled rows.

    The 'Title' and 'prompt' of a row are encoded together as a sentence pair,
    truncated and padded to a fixed length.

    Args:
        df: DataFrame with 'Title' and 'prompt' columns.
        text_tokenizer: Optional tokenizer exposing ``encode_plus``. When
            omitted, the module-level ``tokenizer`` is looked up on each access.
        max_len: Optional fixed sequence length. When omitted, the module-level
            ``MAX_LEN`` is looked up on each access.
    """

    def __init__(self, df, text_tokenizer=None, max_len=None):
        self.df_data = df
        self._tokenizer = text_tokenizer
        self._max_len = max_len

    def __getitem__(self, index):
        """Return the encoded sample at row position ``index`` (0-based)."""
        active_tokenizer = tokenizer if self._tokenizer is None else self._tokenizer
        seq_len = MAX_LEN if self._max_len is None else self._max_len

        # Positional access, so the frame does not need a 0..n-1 index
        sentence1 = self.df_data['Title'].iloc[index]
        sentence2 = self.df_data['prompt'].iloc[index]

        encoded_dict = active_tokenizer.encode_plus(
            sentence1, sentence2,
            add_special_tokens=True,
            max_length=seq_len,
            padding='max_length',  # replaces the deprecated pad_to_max_length=True
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt',
        )

        # The tensors come back with a leading batch axis; keep the single row
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        # Bundle input_ids and attention_mask into one sample
        return (padded_token_list, att_mask)

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df_data)

## 모델

In [None]:
# Learning rate handed to the optimizer
L_RATE = 1e-5

# Number of samples per batch for every DataLoader
BATCH_SIZE = 16
# CPU count reported by the runtime.
# NOTE(review): the DataLoaders in this notebook are built with num_workers=0, so
# this value is only displayed - confirm whether it was meant to be used.
NUM_CORES = os.cpu_count()

NUM_CORES

12

In [None]:
from transformers import XLMRobertaForSequenceClassification

# Build the sequence classifier on top of the pretrained encoder. The size of the
# output layer comes from num_labels (computed from the training labels) instead
# of a hard-coded count, so it always agrees with label_mapping.
model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_TYPE,
    num_labels = num_labels, # number of output labels
)

model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_fe

In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated.
# weight_decay=0.0 keeps the transformers default (torch's own default is 0.01).
optimizer = optim.AdamW(model.parameters(),
              lr = L_RATE,
              eps = 1e-8,
              weight_decay = 0.0
            )

In [None]:
# Wrap the three frames in their Dataset classes
train_data = CompDataset(train_dataset)
val_data = CompDataset(val_dataset)
test_data = TestDataset(test)


# Training batches are reshuffled every epoch
train_dataloader = DataLoader(train_data,
                                batch_size=BATCH_SIZE,
                                shuffle=True,
                                num_workers=0)

# Evaluation does not depend on sample order, so validation is not shuffled
val_dataloader = DataLoader(val_data,
                            batch_size=BATCH_SIZE,
                            shuffle=False,
                            num_workers=0)

# Test order must be kept so predictions line up with the rows of `test`
test_dataloader = DataLoader(test_data,
                                batch_size=BATCH_SIZE,
                                shuffle=False,
                                num_workers=0)



# Number of batches per loader
print(len(train_dataloader))
print(len(val_dataloader))
print(len(test_dataloader))

3864
430
1840


In [None]:
# Number of training epochs
NUM_EPOCHS=8

# Mean training loss per batch, one entry per completed training epoch
loss_values = []

# Start training
for epoch in range(NUM_EPOCHS):

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))

    # Validation logits (one array per batch) and labels, combined after the loop
    val_logit_batches = []
    targets_list = []

    print('Training...')

    # Switch to train mode
    model.train()
    torch.set_grad_enabled(True)
    total_train_loss = 0
    for i, batch in enumerate(train_dataloader):
        train_status = 'Batch ' + str(i) + ' of ' + str(len(train_dataloader))
        print(train_status, end='\r')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear the gradients of the previous step (the optimizer owns every
        # model parameter, so a single zero_grad is enough)
        optimizer.zero_grad()

        outputs = model(b_input_ids,
                    attention_mask=b_input_mask,
                    labels=b_labels)

        # With labels supplied, the first output is the loss
        loss = outputs[0]

        total_train_loss = total_train_loss + loss.item()
        loss.backward()
        # Clip the gradient norm to 1.0 before the update
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    # The printed value is the sum over all batches; the stored one is the mean
    print('Train loss:' ,total_train_loss)
    loss_values.append(total_train_loss / max(len(train_dataloader), 1))


    # ========================================
    #               Validation
    # ========================================

    print('\nValidation...')

    # Switch to evaluation mode. Gradient tracking is turned off globally here
    # and stays off until the next epoch turns it back on.
    model.eval()
    torch.set_grad_enabled(False)
    total_val_loss = 0

    for j, batch in enumerate(val_dataloader):

        val_status = 'Batch ' + str(j) + ' of ' + str(len(val_dataloader))
        print(val_status, end='\r')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        outputs = model(b_input_ids,
                attention_mask=b_input_mask,
                labels=b_labels)

        loss = outputs[0]

        total_val_loss = total_val_loss + loss.item()

        # Second output: per-class scores of the batch
        preds = outputs[1]
        val_logit_batches.append(preds.detach().cpu().numpy())
        targets_list.extend(b_labels.to('cpu').numpy())

    # Join the batches once instead of re-stacking the whole array every batch
    stacked_val_preds = np.concatenate(val_logit_batches, axis=0)

    # Compute validation accuracy
    y_true = targets_list
    y_pred = np.argmax(stacked_val_preds, axis=1)

    val_acc = accuracy_score(y_true, y_pred)


    print('Val loss:' ,total_val_loss)
    print('Val acc: ', val_acc)

    # One checkpoint per epoch; the file name carries the 0-based epoch number
    torch.save(model.state_dict(), 'epoch:{}_model.pt'.format(epoch))

    # Memory management
    gc.collect()


Training...
Train loss: 4926.647504881024

Validation...
Val loss: 437.2775624990463
Val acc:  0.7062163342553501

Training...
Train loss: 3537.8736435696483

Validation...
Val loss: 396.32707211375237
Val acc:  0.7308196243994759

Training...
Train loss: 2931.2428539171815

Validation...
Val loss: 407.07199585437775
Val acc:  0.7321298587858495

Training...
Train loss: 2416.196824517101

Validation...
Val loss: 421.8548582457006
Val acc:  0.7347503275585966

Training...
Train loss: 1961.1876691542566

Validation...
Val loss: 450.4310531914234
Val acc:  0.7357693987479983

Training...


KeyboardInterrupt: ignored

## 테스트

In [None]:
# Load a saved model (optional)
print('test')

from transformers import XLMRobertaForSequenceClassification

# Rebuild the architecture with the same number of outputs as label_mapping,
# then restore the fine-tuned weights from the chosen checkpoint.
model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_TYPE,
    num_labels = num_labels,
)
model.to(device)
# map_location places the checkpoint tensors on the current device, so a
# checkpoint written on a GPU can also be restored in a CPU-only session.
model.load_state_dict(torch.load('epoch:4_model.pt', map_location=device))

test


Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.den

<All keys matched successfully>

In [42]:
# Move the model to the CPU and reload 'epoch:4_model.pt' there.
# NOTE(review): presumably done to get the model off the GPU before the cache is
# emptied in the following cell - confirm.
device = torch.device('cpu')
model.to(device)
model.load_state_dict(torch.load('epoch:4_model.pt', map_location=device))

<All keys matched successfully>

In [44]:
# Release the GPU memory that PyTorch's caching allocator holds but no longer uses
torch.cuda.empty_cache() 

In [45]:
# Select the GPU again when one is available, move the model to it and reload
# 'epoch:4_model.pt' onto that device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
model.load_state_dict(torch.load('epoch:4_model.pt', map_location=device))

<All keys matched successfully>

In [46]:
# Start test inference
# Evaluation mode switches off dropout so predictions are deterministic.
model.eval()

# Per-class scores of each batch, joined once after the loop
test_logit_batches = []

# no_grad makes inference independent of whatever global gradient setting the
# earlier cells left behind, and avoids storing activations for backprop.
with torch.no_grad():
    for j, batch in enumerate(test_dataloader):

        inference_status = 'Batch ' + str(j+1) + ' of ' + str(len(test_dataloader))

        print(inference_status, end='\r')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)

        outputs = model(b_input_ids,
                attention_mask=b_input_mask)

        # Predictions: no labels are passed, and the first output is used as
        # the per-class scores
        preds = outputs[0]
        preds = preds.detach().cpu().numpy()

        test_logit_batches.append(preds)

# Rows follow the order of the test loader
stacked_preds = np.concatenate(test_logit_batches, axis=0)



In [47]:
# Predicted class id per test row: position of the highest score in each row
preds = stacked_preds.argmax(axis=1)

preds

array([23, 13,  9, ...,  5, 16, 11])

In [48]:
# Invert label_mapping once (numeric id -> label text) instead of rebuilding the
# key and value lists and searching them for every single prediction.
id_to_label = {label_id: label for label, label_id in label_mapping.items()}
predicted_genres = [id_to_label[int(pred)] for pred in preds]

In [49]:
# Store the predicted label text next to each test row
test['label'] = predicted_genres

In [50]:
# Show titles next to their predicted labels as a quick sanity check
print(test[['Title', 'label']])

                                                   Title  \
0      Elementary and Middle School Mathematics Teach...   
1      Making Color Sing 25th Anniversary Edition Pra...   
2      Nursing Fundamentals DeMYSTiFieD A SelfTeachin...   
3      Allen and Greenoughs New Latin Grammar Dover L...   
4                            The Encyclopedia of Fantasy   
...                                                  ...   
29431  Sterling MCAT 2015 Practice Tests  Chemical  P...   
29432  Nursing Diagnoses in Psychiatric Nursing Care ...   
29433  The New Awakening an improved bidding system i...   
29434                  Occidental Mythology Masks of God   
29435  Transit Maps of the World Expanded and Updated...   

                             label  
0              Education, Teaching  
1                Arts, Photography  
2                    Medical Books  
3                        Reference  
4                        Reference  
...                            ...  
29431             Te

In [51]:
# Write the submission file: only 'id' and the predicted 'label', without the index
submission = test[['id', 'label']]
submission.to_csv(f"{path}/submission_xlmrobert.csv", index=False)