In [5]:
import os
import random
from pathlib import Path
import sys

import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from scipy.special import softmax
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer
from matplotlib import pyplot as plt
import seaborn as sns
import torch
from torchvision.models import ResNet152_Weights

sys.path.append('../')
from src.data.prepare import load_base_df
from src.models.MMBT.dataset import (BokeTextImageDataset, collate_fn)
from src.models.MMBT.mmbt import load_model
from src.models.utils import fix_seed, plot_confusion_matrix

## データ読み込み

In [6]:
# Load the train / test / submission frames via load_base_df, pointing it at the
# CSV folder (first argument) and the image folder (second argument).
train_df, test_df, submission_df = load_base_df(
    '../dataset/csv/',
    '../dataset/imgs/',
)

In [7]:
# For each split, print its shape and then render its first rows.
for shape_line, preview in (
    (f'train_df.shape: {train_df.shape}', train_df.head()),
    (f'test_df.shape: {test_df.shape}', test_df.head()),
):
    print(shape_line)
    display(preview)

train_df.shape: (24962, 5)


Unnamed: 0,id,odai_photo_file_name,text,is_laugh,img_path
0,ge5kssftl,9fkys1gb2r.jpg,君しょっちゅうソレ自慢するけど、ツムジ２個ってそんなに嬉しいのかい？,0,../dataset/imgs/train/9fkys1gb2r.jpg
1,r7sm6tvkj,c6ag0m1lak.jpg,これでバレない？授業中寝てもバレない？,0,../dataset/imgs/train/c6ag0m1lak.jpg
2,yp5aze0bh,whtn6gb9ww.jpg,「あなたも感じる？」\n『ああ…、感じてる…』\n「後ろに幽霊いるよね…」\n『女のな…』,0,../dataset/imgs/train/whtn6gb9ww.jpg
3,ujaixzo56,6yk5cwmrsy.jpg,大塚愛聞いてたらお腹減った…さく、らんぼと牛タン食べたい…,0,../dataset/imgs/train/6yk5cwmrsy.jpg
4,7vkeveptl,0i9gsa2jsm.jpg,熊だと思ったら嫁だった,0,../dataset/imgs/train/0i9gsa2jsm.jpg


test_df.shape: (6000, 5)


Unnamed: 0,id,odai_photo_file_name,text,img_path,is_laugh
0,rfdjcfsqq,nc1kez326b.jpg,僕のママ、キャラ弁のゆでたまごに８時間かかったんだ,../dataset/imgs/test/nc1kez326b.jpg,0
1,tsgqmfpef,49xt2fmjw0.jpg,かわいいが作れた！,../dataset/imgs/test/49xt2fmjw0.jpg,0
2,owjcthkz2,9dtscjmyfh.jpg,来世の志茂田景樹,../dataset/imgs/test/9dtscjmyfh.jpg,0
3,rvgaocjyy,osa3n56tiv.jpg,ちょ、あの、オカン、これ水風呂やねんけど、なんの冗談??,../dataset/imgs/test/osa3n56tiv.jpg,0
4,uxtwu5i69,yb1yqs4pvb.jpg,「今日は皆さんにザリガニと消防車の違いを知ってもらいたいと思います」『どっちも同じだろ。両方...,../dataset/imgs/test/yb1yqs4pvb.jpg,0


## データ準備

In [8]:
# Length limit passed to both datasets (presumably the maximum number of text
# tokens per sample -- confirm against BokeTextImageDataset).
MAX_SEQENCE_LEN = 48
tokenizer = AutoTokenizer.from_pretrained(
    "cl-tohoku/bert-base-japanese-whole-word-masking"
)

# Build the train and test datasets with identical settings: same tokenizer,
# same length limit and the same ResNet-152 weights entry as image_transform.
train_ds, test_ds = (
    BokeTextImageDataset(
        frame,
        tokenizer,
        MAX_SEQENCE_LEN,
        image_transform=ResNet152_Weights.IMAGENET1K_V2,
    )
    for frame in (train_df, test_df)
)

## 特徴量抽出

In [9]:
def vectorize_ds(model, dataloader, device):
    """Extract one feature vector per sample from the model's last hidden layer.

    Every batch is moved to ``device`` and fed to ``model`` as keyword
    arguments. From the last entry of the output's ``hidden_states`` the
    vector at position 0 of each sample is kept as that sample's feature.

    The model is switched to evaluation mode for the duration of the pass so
    that layers such as dropout cannot add noise to the features; whatever
    mode the model was in beforehand is restored on exit.

    Args:
        model: Torch module whose forward accepts the batch keys as keyword
            arguments and whose output exposes ``hidden_states``.
        dataloader: Iterable of batches, each a dict mapping argument names
            to tensors.
        device: Device (e.g. ``'cuda'`` or ``'cpu'``) to run inference on.

    Returns:
        numpy.ndarray: The per-batch feature arrays concatenated along the
        first axis, in the order the dataloader produced them.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    model = model.to(device)
    # Remember the caller's mode so it can be handed back unchanged.
    was_training = model.training
    model.eval()

    vectors = []
    try:
        with torch.no_grad():
            for batch in tqdm(dataloader):
                batch = {key: value.to(device) for key, value in batch.items()}
                # Use the vector at the first position of the last hidden
                # layer as the feature for each sample.
                first_position = model(**batch).hidden_states[-1][:, 0]
                # detach().cpu().numpy() is valid on both CPU and GPU tensors.
                vectors.append(first_position.detach().cpu().numpy())
    finally:
        model.train(was_training)

    if not vectors:
        raise ValueError('dataloader yielded no batches; nothing to vectorize')
    return np.concatenate(vectors)

In [10]:
# Build the MMBT classifier so that its forward pass also returns hidden states.
model = load_model(output_hidden_states=True)
# Deserialize the checkpoint onto the CPU first. Without map_location, tensors
# saved from a GPU run cannot be loaded on a CPU-only machine, which the device
# fallback below is meant to support. load_state_dict copies the values into the
# model's own parameters, so the model can still be moved to the GPU afterwards.
model.load_state_dict(
    torch.load(
        '../model/mmbt_exp01/checkpoint-150/pytorch_model.bin',
        map_location='cpu',
    )
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)
# Evaluation mode for feature extraction; as the last expression of the cell
# this also displays the model structure.
model.eval()

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


MMBTForClassification(
  (mmbt): MMBTModel(
    (transformer): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(32000, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
   

In [11]:
# Make sure the output folder exists before the 2081-batch pass, so the
# features are not lost to a missing-directory error at save time.
train_vector_path = Path('../dataset/processed/train_vector.csv')
train_vector_path.parent.mkdir(parents=True, exist_ok=True)

# shuffle=False keeps the feature rows in the same order as train_ds.
train_dataloader = DataLoader(train_ds, batch_size=12, collate_fn=collate_fn, shuffle=False)
train_vector = vectorize_ds(model, train_dataloader, device)
# The CSV carries no id column, so rows must line up with the dataset by position.
assert len(train_vector) == len(train_ds), 'feature rows must align one-to-one with train_ds'

# Columns are named by prefixing the default integer labels: mmbt_vector0, mmbt_vector1, ...
train_vector_df = pd.DataFrame(train_vector).add_prefix('mmbt_vector')
train_vector_df.to_csv(train_vector_path, index=False)

  0%|          | 0/2081 [00:00<?, ?it/s]

In [12]:
# Make sure the output folder exists before the 500-batch pass, so the
# features are not lost to a missing-directory error at save time.
test_vector_path = Path('../dataset/processed/test_vector.csv')
test_vector_path.parent.mkdir(parents=True, exist_ok=True)

# shuffle=False keeps the feature rows in the same order as test_ds.
test_dataloader = DataLoader(test_ds, batch_size=12, collate_fn=collate_fn, shuffle=False)
test_vector = vectorize_ds(model, test_dataloader, device)
# The CSV carries no id column, so rows must line up with the dataset by position.
assert len(test_vector) == len(test_ds), 'feature rows must align one-to-one with test_ds'

# Columns are named by prefixing the default integer labels: mmbt_vector0, mmbt_vector1, ...
test_vector_df = pd.DataFrame(test_vector).add_prefix('mmbt_vector')
test_vector_df.to_csv(test_vector_path, index=False)

  0%|          | 0/500 [00:00<?, ?it/s]