In [1]:
!pip install pytorch-tabnet

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.8/41.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.0
[0m

In [1]:
from pathlib import Path
from typing import List, Dict, Union
import os
import sys

import torch
import optuna
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss, accuracy_score 
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier, early_stopping

sys.path.append('../')
from src.models.utils import fix_seed
from src.data.submission import to_submission
from src.data.prepare import Create5FoldDataFrame

## 学習用データ作成用処理

1. ./dataset/preocessed/5fold_stratified_mmbt_seed_0/ 配下のfoldごとのtrain.csv, valid.csv, test.csvを読み込み、辞書として保持(データ構造は下記)

    ```
    {
        "fold_1": {
            "train": 学習用データフレーム,
            "valid": 検証用データフレーム,
            "test": テストデータフレーム(提出用データ),
        },
        ...
        "fold_5": fold_1と同様の構成
    }
    ```
2. ./dataset/preocessed/配下の以下のcsvをidをキーとして手順1で取得したデータフレームと外部結合

    - train(test)_has_person.csv: 文章中に人名を表す単語が出現したか

    - train(test)_od_counts.csv: yolov5で画像ごとに抜き出した物体数

    - train(test)_tfidf_vector.csv: tfidfベクトルの統計量
    
    - train(test)_text_len.csv: 文章の長さ

    - train(test)_similarity.csv: CLIPを使用した画像と文章のcosine類似度(あまり関係なさそうなので外してもいいかも)

3. ./dataset/train.csvを使用して、学習用データフレーム, 検証用データフレームにラベルを付与

4. 学習用, 検証用, テスト用データフレームから, idを抜く。学習用, 検証用データフレームにおいては、X: 特徴量のみのデータフレーム, y: ラベル としておく。

最終的に、以下のようなデータが得られる。

```
{
    "fold_1": {
        "train": {
            "X": 特徴量データフレーム,
            "y": ラベル,
        },
        "valid": {
            "X": 特徴量データフレーム,
            "y": ラベル,
        },
        "test": {
            "X": 特徴量データフレーム
        },
    },
    ...
    "fold_5": fold_1と同様の構成
}
```

In [2]:
train_feature_csv_path_list = [
    '../dataset/processed/train_has_person.csv',
    '../dataset/processed/train_od_counts.csv',
    '../dataset/processed/train_text_len.csv',
    '../dataset/processed/train_tfidf_vector.csv',
    '../dataset/processed/train_similarity.csv'
]
test_feature_csv_path_list = [
    '../dataset/processed/test_has_person.csv',
    '../dataset/processed/test_od_counts.csv',
    '../dataset/processed/test_text_len.csv',
    '../dataset/processed/test_tfidf_vector.csv',
    '../dataset/processed/test_similarity.csv'
]
create_train_valid_test_dict = Create5FoldDataFrame(
    '../dataset/processed/5fold_stratified_mmbt_seed_0/',
    '../dataset/csv/train.csv',
    train_feature_csv_path_list,
    test_feature_csv_path_list
)
train_valid_test_dict = create_train_valid_test_dict()

In [3]:
assert ['fold_1', 'fold_2', 'fold_3', 'fold_4', 'fold_5'] == sorted(train_valid_test_dict.keys()), '第一階層のkeyが想定通りか'
column_list = train_valid_test_dict['fold_1']['train']['X'].columns.to_list()
for fold_name in train_valid_test_dict.keys():
    for phase in train_valid_test_dict[fold_name]:
        assert train_valid_test_dict[fold_name][phase]['X'].columns.to_list() == column_list, '全てのカラムの順番が同じか'
        assert np.all(np.sum(train_valid_test_dict[fold_name][phase]['X'].isna()) == 0)
        if phase != 'test':
            assert np.all(np.sum(train_valid_test_dict[fold_name][phase]['y'].isna()) == 0)
            

## 学習

- モデル: lightGBM

- 手順(下記をfold数分実施する。)

    1. 学習用データにおいて、cross validation結果からハイパーパラメータを求める(一旦固定)

    2. 学習用データ, 検証用データ(early stopping用)でモデル学習

    3. テストデータで予測して、submission用csv作成

In [5]:
class Objective:
    def __init__(
                self,
                model,
                X,
                y,
                cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=0),
                stopping_rounds=50
            ):
        self.model = model
        self.X = X
        self.y = y
        self.cv = cv
        self.stopping_rounds = stopping_rounds

    def __call__(self, trial):
        params = {
            'num_leaves': trial.suggest_int('num_leaves', 10, 200),
            'subsample_freq': trial.suggest_int('subsample_freq', 1, 5),
            'subsample': trial.suggest_float('subsample', 0.1, 1),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
            'min_child_samples': trial.suggest_int('min_child_samples', 1, 60),
            'max_depth': trial.suggest_int('max_depth', 1, 30),
            'reg_alpha': trial.suggest_float('reg_alpha', 0.0001, 1, log=True),
            'reg_lambda': trial.suggest_float('reg_alpha', 0.0001, 1, log=True),
        }
        self.model.set_params(**params)
        scores = []
        for (train_idx, valid_idx) in self.cv.split(self.X, self.y):
            self.model.fit(
                X=self.X.iloc[train_idx],
                y=self.y.iloc[train_idx],
                eval_set=(self.X.iloc[valid_idx], self.y.iloc[valid_idx]),
                callbacks=[early_stopping(stopping_rounds=self.stopping_rounds, verbose=False)],
                eval_metric='binary_logloss'
            )
            y_pred_proba = self.model.predict_proba(self.X.iloc[valid_idx])[:, 1]
            scores.append(log_loss(self.y.iloc[valid_idx], y_pred_proba)) 
        return np.mean(scores)
        

def train_and_eval_lgm(X_train, y_train, X_valid, y_valid, X_test, params, stopping_rounds=50):
    model = LGBMClassifier(**params)
    model.fit(
        X=X_train,
        y=y_train,
        eval_set=(X_valid, y_valid),
        callbacks=[early_stopping(stopping_rounds=stopping_rounds, verbose=True)],
        eval_metric='binary_logloss'
    )
    y_pred_val = model.predict(X_valid)
    y_pred_test = model.predict_proba(X_test)
    return y_pred_test[:, 1], model

In [6]:
SEED = 0
N_TRIAL = 50
base_params = {
    'learning_rate': 0.01,
    'random_state': SEED,
    'n_estimators': 20000,
}
stopping_rounds = 100
y_pred_result = []
models = []
log_loss_list = []
for fold_name in train_valid_test_dict:
    print(f'fold: {fold_name}')
    model = LGBMClassifier(**base_params)
    objective = Objective(
        model,
        train_valid_test_dict[fold_name]['train']['X'],
        train_valid_test_dict[fold_name]['train']['y'],
        stopping_rounds=stopping_rounds
    )
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=SEED)
    )
    study.optimize(objective, n_trials=N_TRIAL)
    best_params = study.best_trial.params
    best_score = study.best_trial.value
    print(f'最適パラメータ {best_params}\nスコア {best_score}')
    y_pred_proba, model = train_and_eval_lgm(
        train_valid_test_dict[fold_name]['train']['X'],
        train_valid_test_dict[fold_name]['train']['y'],
        train_valid_test_dict[fold_name]['valid']['X'],
        train_valid_test_dict[fold_name]['valid']['y'],
        train_valid_test_dict[fold_name]['test']['X'],
        {**best_params, **base_params},
        stopping_rounds=stopping_rounds
    )
    y_val_pred = model.predict_proba(train_valid_test_dict[fold_name]['valid']['X'])[:, 1]
    loss = log_loss(train_valid_test_dict[fold_name]['valid']['y'], y_val_pred)
    log_loss_list.append(loss)
    y_pred_result.append(y_pred_proba)
    models.append(model)
y_pred_result = np.mean(y_pred_result, axis=0)
print(np.mean(log_loss_list), np.std(log_loss_list))
to_submission('../dataset/csv/sample_submission.csv', y_pred_result, '../results/turned_lgm_with_features.csv')

[32m[I 2022-10-02 17:39:32,470][0m A new study created in memory with name: no-name-fc2a405f-5c38-4424-9414-bd321178bda6[0m


fold: fold_3


[32m[I 2022-10-02 17:41:45,325][0m Trial 0 finished with value: 0.5383986321000698 and parameters: {'num_leaves': 114, 'subsample_freq': 4, 'subsample': 0.6424870384644795, 'colsample_bytree': 0.5903948646972071, 'min_child_samples': 26, 'max_depth': 20, 'reg_alpha': 0.0056279320474151686}. Best is trial 0 with value: 0.5383986321000698.[0m
[32m[I 2022-10-02 17:43:57,465][0m Trial 1 finished with value: 0.5403374093959707 and parameters: {'num_leaves': 180, 'subsample_freq': 5, 'subsample': 0.4450973669431999, 'colsample_bytree': 0.8125525342743981, 'min_child_samples': 32, 'max_depth': 18, 'reg_alpha': 0.503948959867121}. Best is trial 0 with value: 0.5383986321000698.[0m
[32m[I 2022-10-02 17:44:36,652][0m Trial 2 finished with value: 0.5396645631801038 and parameters: {'num_leaves': 23, 'subsample_freq': 1, 'subsample': 0.11819655769629316, 'colsample_bytree': 0.8493578609931441, 'min_child_samples': 47, 'max_depth': 27, 'reg_alpha': 0.8212461922256864}. Best is trial 0 with 

最適パラメータ {'num_leaves': 134, 'subsample_freq': 2, 'subsample': 0.5196796955706757, 'colsample_bytree': 0.31998303280144247, 'min_child_samples': 10, 'max_depth': 4, 'reg_alpha': 0.04220057397195014}
スコア 0.5356982021110731
Training until validation scores don't improve for 100 rounds


[32m[I 2022-10-02 18:22:45,747][0m A new study created in memory with name: no-name-91923900-19aa-4762-bb74-aedfb752fa35[0m


Early stopping, best iteration is:
[131]	valid_0's binary_logloss: 0.638928
fold: fold_1


[32m[I 2022-10-02 18:24:42,890][0m Trial 0 finished with value: 0.5265608060279539 and parameters: {'num_leaves': 114, 'subsample_freq': 4, 'subsample': 0.6424870384644795, 'colsample_bytree': 0.5903948646972071, 'min_child_samples': 26, 'max_depth': 20, 'reg_alpha': 0.0056279320474151686}. Best is trial 0 with value: 0.5265608060279539.[0m
[32m[I 2022-10-02 18:26:48,675][0m Trial 1 finished with value: 0.5275463212819511 and parameters: {'num_leaves': 180, 'subsample_freq': 5, 'subsample': 0.4450973669431999, 'colsample_bytree': 0.8125525342743981, 'min_child_samples': 32, 'max_depth': 18, 'reg_alpha': 0.503948959867121}. Best is trial 0 with value: 0.5265608060279539.[0m
[32m[I 2022-10-02 18:27:28,879][0m Trial 2 finished with value: 0.5266850179372589 and parameters: {'num_leaves': 23, 'subsample_freq': 1, 'subsample': 0.11819655769629316, 'colsample_bytree': 0.8493578609931441, 'min_child_samples': 47, 'max_depth': 27, 'reg_alpha': 0.8212461922256864}. Best is trial 0 with 

最適パラメータ {'num_leaves': 162, 'subsample_freq': 3, 'subsample': 0.8024762586578099, 'colsample_bytree': 0.20644698328203992, 'min_child_samples': 39, 'max_depth': 5, 'reg_alpha': 0.6007249475906198}
スコア 0.523293682673183
Training until validation scores don't improve for 100 rounds


[32m[I 2022-10-02 19:02:41,633][0m A new study created in memory with name: no-name-fffd2507-8b29-42fc-9f4a-4bd0aacc5664[0m


Early stopping, best iteration is:
[103]	valid_0's binary_logloss: 0.644614
fold: fold_5


[32m[I 2022-10-02 19:04:32,672][0m Trial 0 finished with value: 0.5649268985524735 and parameters: {'num_leaves': 114, 'subsample_freq': 4, 'subsample': 0.6424870384644795, 'colsample_bytree': 0.5903948646972071, 'min_child_samples': 26, 'max_depth': 20, 'reg_alpha': 0.0056279320474151686}. Best is trial 0 with value: 0.5649268985524735.[0m
[32m[I 2022-10-02 19:06:33,702][0m Trial 1 finished with value: 0.5657360131340123 and parameters: {'num_leaves': 180, 'subsample_freq': 5, 'subsample': 0.4450973669431999, 'colsample_bytree': 0.8125525342743981, 'min_child_samples': 32, 'max_depth': 18, 'reg_alpha': 0.503948959867121}. Best is trial 0 with value: 0.5649268985524735.[0m
[32m[I 2022-10-02 19:07:12,985][0m Trial 2 finished with value: 0.5656027926244043 and parameters: {'num_leaves': 23, 'subsample_freq': 1, 'subsample': 0.11819655769629316, 'colsample_bytree': 0.8493578609931441, 'min_child_samples': 47, 'max_depth': 27, 'reg_alpha': 0.8212461922256864}. Best is trial 0 with 

最適パラメータ {'num_leaves': 162, 'subsample_freq': 3, 'subsample': 0.8024762586578099, 'colsample_bytree': 0.20644698328203992, 'min_child_samples': 39, 'max_depth': 5, 'reg_alpha': 0.6007249475906198}
スコア 0.5635374436165422
Training until validation scores don't improve for 100 rounds


[32m[I 2022-10-02 19:41:29,508][0m A new study created in memory with name: no-name-5a1a6076-c740-472c-802c-803cbad794b5[0m


Early stopping, best iteration is:
[129]	valid_0's binary_logloss: 0.640103
fold: fold_4


[32m[I 2022-10-02 19:43:37,039][0m Trial 0 finished with value: 0.5431096363996287 and parameters: {'num_leaves': 114, 'subsample_freq': 4, 'subsample': 0.6424870384644795, 'colsample_bytree': 0.5903948646972071, 'min_child_samples': 26, 'max_depth': 20, 'reg_alpha': 0.0056279320474151686}. Best is trial 0 with value: 0.5431096363996287.[0m
[32m[I 2022-10-02 19:45:37,830][0m Trial 1 finished with value: 0.5454282874440852 and parameters: {'num_leaves': 180, 'subsample_freq': 5, 'subsample': 0.4450973669431999, 'colsample_bytree': 0.8125525342743981, 'min_child_samples': 32, 'max_depth': 18, 'reg_alpha': 0.503948959867121}. Best is trial 0 with value: 0.5431096363996287.[0m
[32m[I 2022-10-02 19:46:18,715][0m Trial 2 finished with value: 0.5441424554561445 and parameters: {'num_leaves': 23, 'subsample_freq': 1, 'subsample': 0.11819655769629316, 'colsample_bytree': 0.8493578609931441, 'min_child_samples': 47, 'max_depth': 27, 'reg_alpha': 0.8212461922256864}. Best is trial 0 with 

最適パラメータ {'num_leaves': 27, 'subsample_freq': 5, 'subsample': 0.3764957033418272, 'colsample_bytree': 0.3047369568241308, 'min_child_samples': 53, 'max_depth': 30, 'reg_alpha': 0.4270897845895936}
スコア 0.5396641785880096
Training until validation scores don't improve for 100 rounds


[32m[I 2022-10-02 20:22:49,909][0m A new study created in memory with name: no-name-fd8dddf7-8f09-4f87-86cc-a1167e0edbad[0m


Early stopping, best iteration is:
[124]	valid_0's binary_logloss: 0.643792
fold: fold_2


[32m[I 2022-10-02 20:24:31,683][0m Trial 0 finished with value: 0.5790801114456676 and parameters: {'num_leaves': 114, 'subsample_freq': 4, 'subsample': 0.6424870384644795, 'colsample_bytree': 0.5903948646972071, 'min_child_samples': 26, 'max_depth': 20, 'reg_alpha': 0.0056279320474151686}. Best is trial 0 with value: 0.5790801114456676.[0m
[32m[I 2022-10-02 20:26:10,401][0m Trial 1 finished with value: 0.5807329707138803 and parameters: {'num_leaves': 180, 'subsample_freq': 5, 'subsample': 0.4450973669431999, 'colsample_bytree': 0.8125525342743981, 'min_child_samples': 32, 'max_depth': 18, 'reg_alpha': 0.503948959867121}. Best is trial 0 with value: 0.5790801114456676.[0m
[32m[I 2022-10-02 20:26:35,833][0m Trial 2 finished with value: 0.580974990440527 and parameters: {'num_leaves': 23, 'subsample_freq': 1, 'subsample': 0.11819655769629316, 'colsample_bytree': 0.8493578609931441, 'min_child_samples': 47, 'max_depth': 27, 'reg_alpha': 0.8212461922256864}. Best is trial 0 with v

最適パラメータ {'num_leaves': 172, 'subsample_freq': 4, 'subsample': 0.3305356794741203, 'colsample_bytree': 0.4523656800127492, 'min_child_samples': 46, 'max_depth': 4, 'reg_alpha': 0.9263461787560893}
スコア 0.5780633392641933
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[116]	valid_0's binary_logloss: 0.649502
0.6433877662986168 0.0037346401831336905


Unnamed: 0,id,is_laugh
0,rfdjcfsqq,0.448371
1,tsgqmfpef,0.354625
2,owjcthkz2,0.329739
3,rvgaocjyy,0.306742
4,uxtwu5i69,0.778133


### lightGBMチューニング無しで学習

In [None]:
params = {
    'num_leaves': 63,
    'learning_rate': 0.01,
    'colsample _bytree': 0.8,
    'subsample_freq': 1,
    'subsample': 0.8,
    'random_state': 0,
    'n_estimators': 20000,
}
y_pred_result = []
models = []
log_loss_list = []
for fold_name in train_valid_test_dict:
    print(f'fold: {fold_name}')
    y_pred_proba, model = train_and_eval_lgm(
        train_valid_test_dict[fold_name]['train']['X'],
        train_valid_test_dict[fold_name]['train']['y'],
        train_valid_test_dict[fold_name]['valid']['X'],
        train_valid_test_dict[fold_name]['valid']['y'],
        train_valid_test_dict[fold_name]['test']['X'],
        params,
        stopping_rounds=100
    )
    y_val_pred = model.predict_proba(train_valid_test_dict[fold_name]['valid']['X'])[:, 1]
    loss = log_loss(train_valid_test_dict[fold_name]['valid']['y'], y_val_pred)
    log_loss_list.append(loss)
    y_pred_result.append(y_pred_proba)
    models.append(model)
y_pred_result = np.mean(y_pred_result, axis=0)
print(np.mean(log_loss_list), np.std(log_loss_list))
to_submission('../dataset/csv/sample_submission.csv', y_pred_result, '../results/lgm_with_features.csv')