In [7]:
from pathlib import Path
from typing import List, Dict, Union
import os
import sys
import json

import optuna
import pandas as pd
from tqdm import tqdm
import torch
from torchvision.models import ResNet152_Weights
import numpy as np
from sklearn.metrics import log_loss, accuracy_score 
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from scipy.special import softmax
from lightgbm import LGBMClassifier, early_stopping
from torch.utils.data import DataLoader
from transformers import AutoTokenizer


sys.path.append('../')
from src.models.utils import fix_seed
from src.data.submission import to_submission
from src.data.prepare import (
    Create5FoldDataFrame,
)
from src.data.prepare import load_base_df
from src.models.MMBT.dataset import (BokeTextImageDataset, collate_fn)
from src.models.MMBT.mmbt import load_model

In [2]:
# Load the three base tables. The arguments are the CSV directory and the image
# directory; the return types come from load_base_df (src.data.prepare) —
# presumably pandas DataFrames (train_df is indexed with .iloc later); confirm there.
train_df, test_df, submission_df = load_base_df('../dataset/csv/', '../dataset/imgs/')

In [3]:
# One processed CSV per feature group and per split. Both lists are generated
# from the same tuple so the train and test files always line up in order.
_FEATURE_GROUPS = (
    'has_person',
    'od_counts',
    'text_len',
    'tfidf_vector',
    'similarity',
)
train_feature_csv_path_list = [
    f'../dataset/processed/train_{group}.csv' for group in _FEATURE_GROUPS
]
test_feature_csv_path_list = [
    f'../dataset/processed/test_{group}.csv' for group in _FEATURE_GROUPS
]

## foldごとのMMBTの特徴量をlightGBMで学習させた結果とMMBTモデルの予測値で学習

In [4]:
# Features for LightGBM.
# Create5FoldDataFrame is configured with a processed 5-fold directory, the
# training CSV and the train/test feature CSV lists defined above. Calling the
# instance yields a mapping that later code indexes as
#   train_valid_test_dict[fold_name]['train' | 'valid' | 'test']['X']
# and, for 'train' and 'valid', ['y'].
# NOTE(review): the first path presumably holds per-fold MMBT outputs (judging
# by its name) — confirm in src.data.prepare.
create_train_valid_test_dict = Create5FoldDataFrame(
    '../dataset/processed/5fold_stratified_mmbt_seed_0/',
    '../dataset/csv/train.csv',
    train_feature_csv_path_list,
    test_feature_csv_path_list
)
train_valid_test_dict = create_train_valid_test_dict()

In [12]:
# Passed as the third positional argument of BokeTextImageDataset — presumably
# the maximum token length per text; confirm in src.models.MMBT.dataset.
# The spelling ("SEQENCE") is kept because other cells reference this exact name.
MAX_SEQENCE_LEN = 48


class LightGBMInterFace:
    """Adapter exposing ``LGBMClassifier`` through the fit/predict signature used by the fold helpers."""

    def __init__(self, args, stopping_rounds=100):
        """Create the underlying classifier.

        Args:
            args: Dict of keyword arguments forwarded unchanged to ``LGBMClassifier``.
            stopping_rounds: Value handed to the ``early_stopping`` callback in ``fit``.
        """
        self.stopping_rounds = stopping_rounds
        self.model = LGBMClassifier(**args)

    def fit(self, X_train, y_train, X_valid, y_valid):
        """Train on the training split, evaluating ``binary_logloss`` on the validation split."""
        stop_callback = early_stopping(stopping_rounds=self.stopping_rounds, verbose=True)
        fit_kwargs = {
            'X': X_train,
            'y': y_train,
            'eval_set': (X_valid, y_valid),
            'callbacks': [stop_callback],
            'eval_metric': 'binary_logloss',
        }
        self.model.fit(**fit_kwargs)

    def predict(self, X_test):
        """Return column 1 of ``predict_proba`` for every row of ``X_test``."""
        class_probabilities = self.model.predict_proba(X_test)
        return class_probabilities[:, 1]


class LogisticRegressionInterFace:
    """Adapter exposing ``LogisticRegression`` through the fit/predict signature used by the fold helpers."""

    def __init__(self):
        """Create a ``LogisticRegression`` with its default settings."""
        self.model = LogisticRegression()

    def fit(self, X_train, y_train, X_valid=None, y_valid=None):
        """Fit the model on the training split.

        Args:
            X_train: Training features.
            y_train: Training labels; a DataFrame, Series, ndarray or list.
                Flattened to 1-D before fitting.
            X_valid: Ignored; accepted so the signature matches the other
                model wrappers.
            y_valid: Ignored, same reason as ``X_valid``.
        """
        # np.ravel flattens pandas objects, arrays and lists alike, whereas
        # ``y_train.values.ravel()`` raised AttributeError for non-pandas input.
        self.model.fit(X_train, np.ravel(y_train))

    def predict(self, X_test):
        """Return column 1 of ``predict_proba`` for every row of ``X_test``."""
        return self.model.predict_proba(X_test)[:, 1]


def create_valid_feat_with_train_valid_test_dict(train_valid_test_dict, model_list):
    """Fit one model per fold and collect its validation and test predictions.

    Args:
        train_valid_test_dict: Mapping ``fold_name -> {'train': {'X', 'y'},
            'valid': {'X', 'y'}, 'test': {'X'}}``.
        model_list: One model per fold, each offering
            ``fit(X_train, y_train, X_valid, y_valid)`` and ``predict(X)``.
            Models are matched to folds by position, following the iteration
            order of ``train_valid_test_dict`` (not by fold name).

    Returns:
        A tuple of (validation predictions concatenated in fold iteration
        order, test predictions averaged over the folds, mean of the per-fold
        validation log losses).
    """
    assert len(train_valid_test_dict) == len(model_list)
    oof_chunks, test_chunks, fold_losses = [], [], []
    for (fold_name, splits), fold_model in zip(train_valid_test_dict.items(), model_list):
        print(f'fold: {fold_name}')
        train_part, valid_part, test_part = splits['train'], splits['valid'], splits['test']
        X_valid, y_valid, X_test = valid_part['X'], valid_part['y'], test_part['X']
        fold_model.fit(train_part['X'], train_part['y'], X_valid, y_valid)
        oof_pred = fold_model.predict(X_valid)
        test_pred = fold_model.predict(X_test)
        fold_losses.append(log_loss(y_valid, oof_pred))
        oof_chunks.append(oof_pred)
        test_chunks.append(test_pred)
    loss_mean, loss_std = np.mean(fold_losses), np.std(fold_losses)
    print(f'log loss mean:{loss_mean:.3f}, std:{loss_std:.3f}')
    return np.concatenate(oof_chunks), np.mean(test_chunks, axis=0), np.mean(fold_losses)



class MMBTInfer:
    """Runs a model over a data loader and returns the softmax probability of class index 1."""

    def __init__(self, model, device):
        """Store the model and the device that every batch is moved to.

        The model is used as given: it is neither moved to ``device`` nor
        switched to eval mode here, so the caller must have done both.
        """
        self.model = model
        self.device = device

    def predict(self, data_loader):
        """Predict every batch produced by ``data_loader``.

        Args:
            data_loader: Iterable of batches, each a dict whose values have a
                ``.to(device)`` method; each dict is unpacked as keyword
                arguments to the model, whose output must expose ``.logits``.

        Returns:
            1-D numpy array with one probability per sample, in loader order.
            An empty loader yields an empty array instead of raising.
        """
        preds = []
        with torch.no_grad():
            for batch in tqdm(data_loader):
                batch = {key: value.to(self.device) for key, value in batch.items()}
                logits = self.model(**batch).logits
                # detach().cpu() is valid on any device, so no CUDA-specific branch is needed.
                logits = logits.detach().cpu().numpy()
                preds.append(softmax(logits, axis=1)[:, 1])
        if not preds:
            # np.concatenate rejects an empty list; report "no predictions" explicitly.
            return np.empty((0,), dtype=np.float32)
        return np.concatenate(preds)


def load_pretrained_model(src, device):
    """Build the MMBT model, load trained weights from ``src`` and prepare it for inference.

    Args:
        src: Path of a state-dict file readable by ``torch.load``.
        device: Target device (for example ``'cuda'`` or ``'cpu'``).

    Returns:
        The model in eval mode, placed on ``device``.
    """
    model = load_model()
    # map_location remaps the stored tensors onto `device`; without it a
    # checkpoint saved on GPU cannot be deserialised on a CPU-only machine.
    state_dict = torch.load(src, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    model.to(device)
    return model


def create_mmbt_valid_feat_and_test_feat(
            train_df,
            test_df,
            log_json_and_model_path_dict,
            fold_name_list,
            tokenizer=None,
            batch_size=32,
            device=None
        ):
    """Predict validation and test rows with each fold's trained MMBT checkpoint.

    Args:
        train_df: Training frame. Each fold's validation rows are selected
            positionally with the ``valid_idx`` list stored in that fold's log
            JSON; the ``is_laugh`` column supplies the labels for the log loss.
        test_df: Test frame, predicted once per fold.
        log_json_and_model_path_dict: ``{fold_name: {'json': path, 'model': path}}``.
        fold_name_list: Fold names to process; validation predictions are
            concatenated in this order.
        tokenizer: Tokenizer handed to ``BokeTextImageDataset``. ``None`` (the
            default) loads ``cl-tohoku/bert-base-japanese-whole-word-masking``
            when the function is called, not when it is defined.
        batch_size: Batch size of the validation and test loaders.
        device: Device for the models and batches. ``None`` (the default)
            selects ``'cuda'`` when available and ``'cpu'`` otherwise, decided
            at call time.

    Returns:
        A tuple of (validation predictions concatenated over the folds, test
        predictions averaged over the folds, mean of the per-fold validation
        log losses).
    """
    # Resolve the defaults lazily: a default-argument expression runs once at
    # definition time, which would load the tokenizer even when one is passed in.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'

    valid_pred_list = []
    test_pred_list = []
    log_loss_list = []
    # NOTE(review): the weights enum itself (not `.transforms()`) is passed as
    # image_transform; presumably the dataset derives its preprocessing from it —
    # confirm in BokeTextImageDataset.
    test_ds = BokeTextImageDataset(test_df, tokenizer, MAX_SEQENCE_LEN, image_transform=ResNet152_Weights.IMAGENET1K_V2)
    test_dl = DataLoader(test_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)
    for fold_name in fold_name_list:
        fold_paths = log_json_and_model_path_dict[fold_name]
        # 1. Read the row positions used for validation from the fold's log JSON.
        with open(fold_paths['json'], 'r') as f:
            valid_idx = json.load(f)['valid_idx']
        # 2. Build a DataLoader over just those validation rows of train_df.
        valid_df = train_df.iloc[valid_idx]
        valid_ds = BokeTextImageDataset(
            valid_df,
            tokenizer,
            MAX_SEQENCE_LEN,
            image_transform=ResNet152_Weights.IMAGENET1K_V2
        )
        valid_dl = DataLoader(valid_ds, batch_size=batch_size, collate_fn=collate_fn, shuffle=False)
        # 3. Predict the validation rows and record the fold's log loss.
        model = load_pretrained_model(fold_paths['model'], device)
        infer = MMBTInfer(model, device)
        y_val_pred = infer.predict(valid_dl)
        log_loss_list.append(log_loss(valid_df['is_laugh'], y_val_pred))
        # 4. Predict the test set with the same fold model
        #    (reading fold_k_submission.csv directly would also work).
        y_test_pred = infer.predict(test_dl)
        valid_pred_list.append(y_val_pred)
        test_pred_list.append(y_test_pred)
    # 5. Average the per-fold test predictions
    #    (reading 5fold_mean_submission.csv would also work).
    return np.concatenate(valid_pred_list), np.mean(test_pred_list, axis=0), np.mean(log_loss_list)


def create_valid_feat_with_1st_feat(model, X_train, y_train, X_test, shuffle_seed=0):
    """Produce out-of-fold and test predictions for one model with 5-fold stratified CV.

    Args:
        model: Object offering ``fit(X_train, y_train, X_valid, y_valid)`` and
            ``predict(X)``; the same instance is re-fitted on every fold.
        X_train: Training features, indexed positionally with ``.iloc``.
        y_train: Training labels, indexed positionally with ``.iloc``.
        X_test: Test features, predicted after each fold's fit.
        shuffle_seed: ``random_state`` of the shuffled ``StratifiedKFold``.

    Returns:
        A tuple of (out-of-fold predictions reordered to the row order of
        ``X_train``, test predictions averaged over the folds, list of the
        per-fold validation log losses).
    """
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=shuffle_seed)
    oof_chunks, holdout_positions, test_chunks, log_loss_list = [], [], [], []
    for fit_rows, holdout_rows in splitter.split(X_train, y_train):
        X_fit, y_fit = X_train.iloc[fit_rows], y_train.iloc[fit_rows]
        X_holdout, y_holdout = X_train.iloc[holdout_rows], y_train.iloc[holdout_rows]
        model.fit(X_fit, y_fit, X_holdout, y_holdout)
        holdout_pred = model.predict(X_holdout)
        oof_chunks.append(holdout_pred)
        holdout_positions.append(holdout_rows)
        log_loss_list.append(log_loss(y_holdout, holdout_pred))
        test_chunks.append(model.predict(X_test))
    # Sorting by the original row position undoes the fold-wise ordering.
    restore_order = np.argsort(np.concatenate(holdout_positions))
    oof_pred = np.concatenate(oof_chunks)[restore_order]
    loss_mean, loss_std = np.mean(log_loss_list), np.std(log_loss_list)
    print(f'log loss mean:{loss_mean:.3f}, std:{loss_std:.3f}')
    return oof_pred, np.mean(test_chunks, axis=0), log_loss_list

In [18]:
# Stand-alone run of a single checkpoint (fold_2, checkpoint-175) over the test set.
# NOTE(review): `preds` is not read by any other cell visible in this notebook,
# and MAX_SEQENCE_LEN is re-assigned here to the value it already has — this
# looks like a sanity check rather than a pipeline step; confirm before removing.
MAX_SEQENCE_LEN = 48
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = load_pretrained_model(
    '../model/5fold_stratified_mmbt_seed_0/fold_2/checkpoint-175/pytorch_model.bin',
    device
)
tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-whole-word-masking")
# The test loader keeps row order (shuffle=False) so predictions align with test_df.
test_ds = BokeTextImageDataset(test_df, tokenizer, MAX_SEQENCE_LEN, image_transform=ResNet152_Weights.IMAGENET1K_V2)
test_dl = DataLoader(test_ds, batch_size=12, collate_fn=collate_fn, shuffle=False)
infer = MMBTInfer(model, device)
preds = infer.predict(test_dl)

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100% 500/500 [01:08<00:00,  7.26it/s]


In [13]:
SEED = 0
# Number in the `checkpoint-<n>` directory that is used for each fold.
_CHECKPOINT_NUMBER_BY_FOLD = {
    'fold_1': 225,
    'fold_2': 175,
    'fold_3': 250,
    'fold_4': 250,
    'fold_5': 175,
}
# Per fold: the training log JSON (holds `valid_idx`) and the saved model weights.
log_json_and_model_path_dict = {
    fold: {
        'json': f'../results/5fold_stratified_mmbt_seed_0/{fold}_log.json',
        'model': f'../model/5fold_stratified_mmbt_seed_0/{fold}/checkpoint-{number}/pytorch_model.bin',
    }
    for fold, number in _CHECKPOINT_NUMBER_BY_FOLD.items()
}
# Settings shared by every LightGBM model in the list.
_SHARED_LGM_PARAMS = {
    'learning_rate': 0.01,
    'random_state': SEED,
    'n_estimators': 20000,
}
# Model-specific settings, one dict per model (the 2nd and 3rd are identical).
# NOTE(review): create_valid_feat_with_train_valid_test_dict pairs models with
# folds by position in train_valid_test_dict's iteration order, and the recorded
# output of this cell visits fold_3, fold_1, fold_5, fold_4, fold_2 — confirm
# that this list is meant to follow that order rather than fold_1..fold_5.
_PER_MODEL_LGM_PARAMS = [
    {
        'num_leaves': 134,
        'subsample_freq': 2,
        'subsample': 0.5196796955706757,
        'colsample_bytree': 0.31998303280144247,
        'min_child_samples': 10,
        'max_depth': 4,
        'reg_alpha': 0.04220057397195014,
    },
    {
        'num_leaves': 162,
        'subsample_freq': 3,
        'subsample': 0.8024762586578099,
        'colsample_bytree': 0.20644698328203992,
        'min_child_samples': 39,
        'max_depth': 5,
        'reg_alpha': 0.6007249475906198,
    },
    {
        'num_leaves': 162,
        'subsample_freq': 3,
        'subsample': 0.8024762586578099,
        'colsample_bytree': 0.20644698328203992,
        'min_child_samples': 39,
        'max_depth': 5,
        'reg_alpha': 0.6007249475906198,
    },
    {
        'num_leaves': 27,
        'subsample_freq': 5,
        'subsample': 0.3764957033418272,
        'colsample_bytree': 0.3047369568241308,
        'min_child_samples': 53,
        'max_depth': 30,
        'reg_alpha': 0.4270897845895936,
    },
    {
        'num_leaves': 172,
        'subsample_freq': 4,
        'subsample': 0.3305356794741203,
        'colsample_bytree': 0.4523656800127492,
        'min_child_samples': 46,
        'max_depth': 4,
        'reg_alpha': 0.9263461787560893,
    },
]
# Model-specific keys first, shared keys last: the same key order as a literal dict.
lgm_model_list = [
    LightGBMInterFace({**model_params, **_SHARED_LGM_PARAMS})
    for model_params in _PER_MODEL_LGM_PARAMS
]
# Level-1 features: out-of-fold and test predictions from the per-fold LightGBM
# models and from the per-fold MMBT checkpoints. Both helpers walk the folds in
# train_valid_test_dict's key order, so the two validation vectors are
# concatenated fold by fold in the same order.
# NOTE(review): row order *within* a fold comes from two different sources (the
# fold dict vs. `valid_idx` in the log JSON) — assumed identical; confirm.
lgm_valid_features, lgm_test_features, lgm_log_loss_mean = create_valid_feat_with_train_valid_test_dict(train_valid_test_dict, lgm_model_list)
mmbt_valid_features, mmbt_test_features, mmbt_log_loss_mean = create_mmbt_valid_feat_and_test_feat(
    train_df,
    test_df,
    log_json_and_model_path_dict,
    list(train_valid_test_dict.keys())
)


# Level-2 (stacking) model: logistic regression on the two level-1 prediction columns.
lv2_model = LogisticRegressionInterFace()
train_lv1_feat = pd.DataFrame(
    {
        'mmbt': mmbt_valid_features,
        'lgm': lgm_valid_features
    }
)
test_lv1_feat = pd.DataFrame(
    {
        'mmbt': mmbt_test_features,
        'lgm': lgm_test_features
    }
)
# Targets are rebuilt by concatenating each fold's validation labels in the same
# fold order as the level-1 features above.
y_train = pd.DataFrame(np.concatenate([train_valid_test_dict[fold_name]['valid']['y'] for fold_name in train_valid_test_dict]))
# Only the fold-averaged test prediction is kept; it is passed to to_submission
# together with the sample-submission path and the output path.
_, y_pred, _ = create_valid_feat_with_1st_feat(lv2_model, train_lv1_feat, y_train, test_lv1_feat)
to_submission('../dataset/csv/sample_submission.csv', y_pred, '../results/lgm_and_mmbt_stacking.csv')

fold: fold_3
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[131]	valid_0's binary_logloss: 0.638928
fold: fold_1
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[103]	valid_0's binary_logloss: 0.644614
fold: fold_5
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[129]	valid_0's binary_logloss: 0.640103
fold: fold_4
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[124]	valid_0's binary_logloss: 0.643792
fold: fold_2
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[116]	valid_0's binary_logloss: 0.649502
log loss mean:0.643, std:0.004


Downloading pytorch_model.bin:   0%|          | 0.00/424M [00:00<?, ?B/s]

Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading: "https://download.pytorch.org/models/resnet152-f82ba261.pth" to /root/.cache/torch/hub/checkpoints/resnet152-f82

  0%|          | 0.00/230M [00:00<?, ?B/s]

100% 156/156 [00:59<00:00,  2.60it/s]
100% 188/188 [01:04<00:00,  2.91it/s]
Some weights of the model checkpoint at cl-tohoku/bert-base-japanese-whole-word-masking were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100% 157/157 [00:51<00:00,  3.04it/s]
100% 188/18

log loss mean:0.642, std:0.001
