In [1]:
!pip install transformers
!pip install sentencepiece

[0m

In [2]:
import pandas as pd
import numpy as np
import torch
from transformers import XLNetTokenizer, XLNetForSequenceClassification, XLNetConfig, AdamW
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [4]:
train_ori = pd.read_csv("/kaggle/input/jbnu-swuniv-ai/train_data.csv")
test_ori = pd.read_csv("/kaggle/input/jbnu-swuniv-ai/test_data.csv")
train_desc = pd.read_csv("/kaggle/input/jbnu-swuniv-ai/merged_desc.csv")
test_desc = pd.read_csv("/kaggle/input/jbnu-swuniv-ai/merged_test.csv")

In [5]:
train = pd.merge(train_ori, train_desc, left_on = 'Filename', right_on='image', how='inner')
train = train[['id', 'Filename', 'Title', 'label', 'prompt']]

test = pd.merge(test_ori, test_desc, left_on = 'Filename', right_on='File', how='inner')
test = test[['id', 'Filename', 'Title', 'Prompt']]
test.columns = ['id', 'Filename', 'Title', 'prompt']

In [7]:
import random
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [8]:
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased', do_lower_case=True)

max_length = 600 #입력 시퀀스 최대 길이 (조정 가능)
train_input_texts = (train["Title"] + ", "+ train["prompt"]).tolist()
test_input_texts = (test["Title"] + ", "+ test["prompt"]).tolist()

train_encoded_inputs = tokenizer.batch_encode_plus(
    train_input_texts,
    max_length=max_length,
    padding="longest",
    truncation=True,
    return_tensors="pt"
)

test_encoded_inputs = tokenizer.batch_encode_plus(
    test_input_texts,
    max_length=max_length,
    padding="longest",
    truncation=True,
    return_tensors="pt"
)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

In [9]:
# train
train_input_ids = train_encoded_inputs["input_ids"].to(device)
train_attention_mask = train_encoded_inputs["attention_mask"].to(device)

label_mapping = {label: i for i, label in enumerate(train["label"].unique())}
num_labels = len(label_mapping)
train_labels = train["label"].map(label_mapping).tolist()
train_labels = torch.tensor(train_labels).to(device)

# test
test_input_ids = test_encoded_inputs["input_ids"].to(device)
test_attention_mask = test_encoded_inputs["attention_mask"].to(device)

In [10]:
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=num_labels).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.weight', 'lm_loss.bias']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.weight', 'sequence_summary.summary.bias', 'logits_proj.weight', 'logits_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [11]:
batch_size = 16

train_data = torch.utils.data.TensorDataset(train_input_ids, train_attention_mask, train_labels)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)

test_data = torch.utils.data.TensorDataset(test_input_ids, test_attention_mask)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)

In [12]:
# 최적화 설정 (선택 사항)
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 10

In [16]:
import tqdm

model.train()

for epoch in tqdm.tqdm(range(num_epochs)):
    train_loss = 0.0
    for i, batch in enumerate(train_loader):

        inference_status = 'Batch ' + str(i+1) + ' of ' + str(len(train_loader))

        print(inference_status, end='\r')

        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()

        outputs = model(input_ids.to(device), attention_mask=attention_mask.to(device), labels=labels.to(device))
        
        loss = outputs.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

        avg_train_loss = train_loss / len(train_loader)

    print(f"Epoch: {epoch+1}, Average Training Loss: {avg_train_loss}")
    torch.save(model.state_dict(), f'/kaggle/working/epoch{epoch+1}_model.pt')

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch: 1, Average Training Loss: 1.2463618687766431


 10%|█         | 1/10 [29:04<4:21:38, 1744.31s/it]

Epoch: 2, Average Training Loss: 0.8820118180038128


 20%|██        | 2/10 [57:58<3:51:48, 1738.50s/it]

Epoch: 3, Average Training Loss: 0.6937840781488078


 30%|███       | 3/10 [1:26:53<3:22:37, 1736.72s/it]

Epoch: 4, Average Training Loss: 0.5394902899852658


 40%|████      | 4/10 [1:55:47<2:53:34, 1735.79s/it]

Epoch: 5, Average Training Loss: 0.41500884839028795


 50%|█████     | 5/10 [2:24:41<2:24:35, 1735.05s/it]

Epoch: 6, Average Training Loss: 0.31383246518806013


 60%|██████    | 6/10 [2:53:35<1:55:38, 1734.68s/it]

Epoch: 7, Average Training Loss: 0.24154075882921502


 70%|███████   | 7/10 [3:22:29<1:26:43, 1734.43s/it]

Epoch: 8, Average Training Loss: 0.1949392829842921


 80%|████████  | 8/10 [3:51:23<57:48, 1734.22s/it]  

Epoch: 9, Average Training Loss: 0.15932890277931017


 90%|█████████ | 9/10 [4:20:16<28:54, 1734.05s/it]

Epoch: 10, Average Training Loss: 0.13101203997120814


100%|██████████| 10/10 [4:49:09<00:00, 1734.99s/it]


In [26]:
result_df = pd.DataFrame()

for epoch in tqdm.tqdm(range(num_epochs)):
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, num_epochs))

    model.load_state_dict(torch.load(f'/kaggle/working/epoch{epoch+1}_model.pt'))
    for j, batch in enumerate(test_loader):

        inference_status = 'Batch ' + str(j+1) + ' of ' + str(len(test_loader))

        print(inference_status, end='\r')

        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)


        outputs = model(b_input_ids,
                attention_mask=b_input_mask)

        # 예측값
        preds = outputs[0]
        preds = preds.detach().cpu().numpy()


        if j == 0:
            stacked_preds = preds

        else:
            stacked_preds = np.vstack((stacked_preds, preds))

    preds = np.argmax(stacked_preds, axis=1)
    predicted_genres = [list(label_mapping.keys())[list(label_mapping.values()).index(pred)] for pred in preds]
    result_df[f'label_{epoch+1}'] = predicted_genres

  0%|          | 0/10 [00:00<?, ?it/s]

Batch 1840 of 1840

 10%|█         | 1/10 [05:31<49:39, 331.10s/it]

Batch 1840 of 1840

 20%|██        | 2/10 [11:01<44:05, 330.66s/it]

Batch 1840 of 1840

 30%|███       | 3/10 [16:33<38:37, 331.11s/it]

Batch 1840 of 1840

 40%|████      | 4/10 [22:05<33:09, 331.55s/it]

Batch 1840 of 1840

 50%|█████     | 5/10 [27:36<27:36, 331.36s/it]

Batch 1840 of 1840

 60%|██████    | 6/10 [33:06<22:04, 331.06s/it]

Batch 1840 of 1840

 70%|███████   | 7/10 [38:37<16:32, 330.95s/it]

Batch 1840 of 1840

 80%|████████  | 8/10 [44:08<11:01, 330.85s/it]

Batch 1840 of 1840

 90%|█████████ | 9/10 [49:39<05:30, 330.86s/it]

Batch 1840 of 1840

100%|██████████| 10/10 [55:09<00:00, 330.96s/it]


In [27]:
test = pd.read_csv("/kaggle/input/jbnu-swuniv-ai/test_data.csv")
print(len(result_df))
print(len(test))

29436
29436


In [28]:
result_df['id'] = result_df.index
result_df

Unnamed: 0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10,id
0,"Education, Teaching","Education, Teaching","Education, Teaching","Education, Teaching","Education, Teaching","Education, Teaching","Education, Teaching","Education, Teaching","Education, Teaching","Education, Teaching",0
1,"Arts, Photography","Arts, Photography","Arts, Photography","Arts, Photography","Arts, Photography","Arts, Photography","Arts, Photography","Arts, Photography","Arts, Photography","Arts, Photography",1
2,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,2
3,Reference,Reference,Reference,Reference,Reference,Reference,Reference,Travel,Reference,Travel,3
4,"Humor, Entertainment",Reference,Reference,Reference,Reference,Reference,Reference,Reference,Reference,Reference,4
...,...,...,...,...,...,...,...,...,...,...,...
29431,Test Preparation,Test Preparation,Test Preparation,Test Preparation,Test Preparation,Test Preparation,Test Preparation,Test Preparation,Test Preparation,Test Preparation,29431
29432,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,Medical Books,29432
29433,"Humor, Entertainment","Humor, Entertainment","Humor, Entertainment","Humor, Entertainment","Humor, Entertainment","Humor, Entertainment","Humor, Entertainment","Humor, Entertainment","Humor, Entertainment","Humor, Entertainment",29433
29434,"Religion, Spirituality","Religion, Spirituality","Religion, Spirituality","Religion, Spirituality","Literature, Fiction","Science Fiction, Fantasy","Religion, Spirituality","Religion, Spirituality","Religion, Spirituality","Religion, Spirituality",29434


In [29]:
result_df.to_csv("/kaggle/working/xlnet_results.csv", index=False)

In [30]:
result_df["label"] = result_df.filter(like='label').apply(lambda x: x.value_counts().idxmax(), axis=1)

In [31]:
result_df[["id", "label"]].to_csv("/kaggle/working/ensemble_xlnetmodels.csv", index=False)