In [304]:
import os
import gc
import joblib
import random
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm
from argparse import Namespace
from collections import defaultdict
from scipy.signal import find_peaks

import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import TimeSeriesSplit, StratifiedKFold, GroupKFold, train_test_split, KFold

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')
# Use the fully qualified option key. The bare 'max_columns' pattern is
# ambiguous on pandas >= 1.4 (it also matches 'styler.render.max_columns')
# and raises OptionError there; 'display.max_columns' works on every version.
pd.set_option('display.max_columns', 64)

from sklearn import model_selection, metrics


In [356]:
# Competition tables: labelled training rows (`df`) and the test rows (`test_org`).
df = pd.read_csv('../input/we-are-all-alike-on-the-inside/train.csv')
test_org = pd.read_csv('../input/we-are-all-alike-on-the-inside/test.csv')

# Integer class index for every `category` label. A label that is not part of
# this mapping ends up as NaN in `target`.
category_to_index = {
    label: index
    for index, label in enumerate(('association', 'disagreement', 'unbiased'))
}
df = df.assign(target=df['category'].map(category_to_index))


In [339]:
# sakami data: for every model version, load the out-of-fold (OOF) and test
# predictions and expose them as stacking features named `f_<class>_<version>`.
sakami_folder_list = ['v1', 'v2', 'v3', 'v5']
# Class-score columns in class-index order (0, 1, 2). Defined once, before the
# loop, because later cells reuse `pred_cols` too.
pred_cols = ['association', 'disagreement', 'unbiased']
oof_dict = {}
test_dict = {}

for sakami_folder in sakami_folder_list:
    oof = pd.read_csv(f'../input/gcsdata/sakami/{sakami_folder}/valid.csv')
    test = pd.read_csv(f'../input/gcsdata/sakami/{sakami_folder}/test.csv').sort_values('id')
    # Attach the ground-truth label columns from the training table.
    oof = oof.merge(df[['id', 'category', 'target']]).sort_values('id')
    # Keep only rows whose class scores do not sum to zero (presumably rows
    # that never received an OOF prediction -- confirm). The explicit copy
    # makes the feature columns added below land in an independent frame
    # instead of being assigned onto a slice of the merged one.
    oof = oof.loc[oof[pred_cols].sum(axis=1) != 0].copy()

    # Hard label for the test rows, taken before the rescaling below.
    test['target'] = test[pred_cols].to_numpy().argmax(axis=1)
    # NOTE(review): the test scores are multiplied by 4 here, presumably to
    # bring them onto the same scale as the OOF scores -- confirm against how
    # test.csv was produced.
    test[pred_cols] *= 4
    test['category'] = test['target'].map({
        0: 'association',
        1: 'disagreement',
        2: 'unbiased',
    })

    # Duplicate the class scores under model-specific names so that the
    # frames of different models can later be merged side by side.
    for c in pred_cols:
        oof[f'f_{c}_{sakami_folder}'] = oof[c]
        test[f'f_{c}_{sakami_folder}'] = test[c]
    oof_dict[f'sakami{sakami_folder}'] = oof
    test_dict[f'sakami{sakami_folder}'] = test

    # Optional per-model submission:
    # test[['id', 'category']].to_csv(f'../output/sub_{sakami_folder}_sakami.csv', index=False)
    # OOF micro-F1 of this single model.
    print(metrics.f1_score(oof['target'], oof[pred_cols].to_numpy().argmax(axis=1), average='micro'))


0.7942130778333397
0.7918082223155323
0.7848417757758522
0.7891170744741764


In [341]:
# shimacos data: load the OOF / test predictions of the shima v0 model, then
# score and apply a fixed-weight blend of the sakami and shima predictions.

# Class-score columns in class-index order; defined here so this cell does not
# depend on a variable leaking out of an earlier loop.
pred_cols = ['association', 'disagreement', 'unbiased']
# category_pred_<i> -> class name, shared by the OOF and the test file.
shima_rename = {f'category_pred_{i}': c for i, c in enumerate(pred_cols)}

oof = (
    pd.read_csv('../input/gcsdata/shima/v0/valid_fold0.csv')
    .rename(columns=shima_rename)
    .drop_duplicates('id')
    .sort_values('id')
    # Drop the file's own label columns; the labels are re-attached from the
    # training table right below.
    .drop(columns=['category_pred', 'category'])
)
oof = oof.merge(df[['id', 'category', 'target']])
for c in pred_cols:
    oof[f'f_{c}_shima'] = oof[c]

test = (
    pd.read_csv('../input/gcsdata/shima/v0/test_fold0.csv')
    .rename(columns=shima_rename)
    .drop_duplicates('id')
    .sort_values('id')
)
for c in pred_cols:
    test[f'f_{c}_shima'] = test[c]

oof_dict['shima_v0'] = oof
test_dict['shima_v0'] = test
# OOF micro-F1 of the shima model alone.
print(metrics.f1_score(oof['target'], oof[pred_cols].to_numpy().argmax(axis=1), average='micro'))

# Models taking part in the blend and their weights (same for OOF and test).
ens_keys = ['sakamiv1', 'sakamiv2', 'sakamiv3', 'shima_v0']
ens_weights = [0.2, 0.1, 0.1, 0.75]


def _blend_squared(frames, ref_ids):
    """Return the weighted sum of squared class scores over `ens_keys`.

    Parameters
    ----------
    frames : dict
        Maps a model key to a DataFrame holding `id` and the `pred_cols`.
    ref_ids : numpy.ndarray
        Row ids, in order, that every frame must match.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(ref_ids), len(pred_cols)).

    Raises
    ------
    ValueError
        If a frame's `id` column differs from `ref_ids`. The score arrays are
        added purely by position, so differing ids would silently mix
        predictions that belong to different rows.
    """
    blended = np.zeros([len(ref_ids), len(pred_cols)])
    for key, w in zip(ens_keys, ens_weights):
        frame = frames[key]
        if not np.array_equal(frame['id'].to_numpy(), ref_ids):
            raise ValueError(f'ids of {key} are not aligned with the reference frame')
        blended += (frame[pred_cols].to_numpy() ** 2) * w
    return blended


# ensemble score
p = _blend_squared(oof_dict, oof['id'].to_numpy())
print(metrics.f1_score(oof['target'], p.argmax(axis=1), average='micro'))

# test ensemble
p = _blend_squared(test_dict, test['id'].to_numpy())
# sub
# `test` is the same object as test_dict['shima_v0'], so these two columns are
# added to (or overwritten in) that stored frame as well.
test['target'] = p.argmax(axis=1)
test['category'] = test['target'].map({
    0: 'association',
    1: 'disagreement',
    2: 'unbiased',
})
# test[['id', 'category']].to_csv('../output/sub_ens.csv', index=False)

0.8373096156048403
0.8420048097110356


In [359]:
import copy

# Models whose `f_*` prediction columns are used as stacking features.
stack_keys = ['sakamiv1', 'sakamiv2', 'sakamiv3', 'sakamiv5', 'shima_v0']


def _stack_features(frames):
    """Merge the `f_*` columns of every model in `stack_keys` onto the
    `id` / `target` columns of the sakamiv1 frame and return the result."""
    stacked = copy.deepcopy(frames['sakamiv1'])[['id', 'target']]
    for name in stack_keys:
        feature_cols = [col for col in frames[name].columns if col.startswith('f_')]
        stacked = stacked.merge(frames[name][['id'] + feature_cols])
    return stacked


# One row per id with the class scores of all base models side by side.
oof_df = _stack_features(oof_dict)
test_df = _stack_features(test_dict)

# Add the raw sim1..sim3 columns of the competition tables as extra features.
sim_cols = ['id', 'sim1', 'sim2', 'sim3']
tr = oof_df.copy().merge(df[sim_cols])
test_df = test_df.merge(test_org[sim_cols])

In [364]:
# Stacking model, variant 1: LightGBM multiclass classifier trained on the
# base-model class scores plus the sim1..sim3 columns, evaluated with 4-fold CV.
lgb_params = {
        'learning_rate':0.01,
        "objective": "multiclass",
        'num_class': 3,
        'boosting_type': "gbdt",
        'verbosity': -1,
        'n_jobs': -1,
        'seed': 19930820,
        "lambda_l1": 0,
        "lambda_l2": 0,
        "num_leaves": 8,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.9,
        "bagging_freq": 1,
        'max_depth': -1,
        'max_bin':512,
    }
# Every column except the row id and the label is a feature (sim1..sim3 included).
X_cols = [c for c in tr.columns if c not in ['id', 'target']]
target_col = 'target'
kf = model_selection.KFold(n_splits=4)
# Out-of-fold class probabilities, one row per training row.
oof = np.zeros([len(tr), 3])
# Size the accumulator from `test_df`, the frame that is actually predicted on
# below. It used to be sized from `test`, a frame left over from an earlier
# cell, which only works while both happen to have the same number of rows.
pred_stack_test = np.zeros([len(test_df), 3])
for tr_idx, va_idx in kf.split(tr):
    trn = tr.iloc[tr_idx]
    val = tr.iloc[va_idx]
    tr_D = lgb.Dataset(trn[X_cols], trn[target_col])
    va_D = lgb.Dataset(val[X_cols], val[target_col])
    # NOTE: the `early_stopping_rounds` / `verbose_eval` keyword arguments were
    # removed from lgb.train in LightGBM 4.0; switch to the
    # lgb.early_stopping / lgb.log_evaluation callbacks when upgrading.
    model = lgb.train(lgb_params, tr_D, valid_sets=[va_D], num_boost_round=2000, early_stopping_rounds=100, verbose_eval=10)
    oof[va_idx, :] = model.predict(val[X_cols])
    # Fold predictions are summed, not averaged; the argmax is unaffected.
    pred_stack_test += model.predict(test_df[X_cols])
p = oof.argmax(axis=1)
# OOF micro-F1 of the stacker (displayed as the cell output).
metrics.f1_score(tr['target'], p, average='micro')

Training until validation scores don't improve for 100 rounds
[10]	valid_0's multi_logloss: 0.983082
[20]	valid_0's multi_logloss: 0.890491
[30]	valid_0's multi_logloss: 0.815052
[40]	valid_0's multi_logloss: 0.753015
[50]	valid_0's multi_logloss: 0.700233
[60]	valid_0's multi_logloss: 0.655572
[70]	valid_0's multi_logloss: 0.618376
[80]	valid_0's multi_logloss: 0.586163
[90]	valid_0's multi_logloss: 0.558662
[100]	valid_0's multi_logloss: 0.535445
[110]	valid_0's multi_logloss: 0.515763
[120]	valid_0's multi_logloss: 0.499002
[130]	valid_0's multi_logloss: 0.484459
[140]	valid_0's multi_logloss: 0.472172
[150]	valid_0's multi_logloss: 0.461635
[160]	valid_0's multi_logloss: 0.452585
[170]	valid_0's multi_logloss: 0.444707
[180]	valid_0's multi_logloss: 0.437968
[190]	valid_0's multi_logloss: 0.432212
[200]	valid_0's multi_logloss: 0.42732
[210]	valid_0's multi_logloss: 0.423055
[220]	valid_0's multi_logloss: 0.419385
[230]	valid_0's multi_logloss: 0.41616
[240]	valid_0's multi_logloss

Training until validation scores don't improve for 100 rounds
[10]	valid_0's multi_logloss: 0.984237
[20]	valid_0's multi_logloss: 0.892456
[30]	valid_0's multi_logloss: 0.81782
[40]	valid_0's multi_logloss: 0.756425
[50]	valid_0's multi_logloss: 0.704373
[60]	valid_0's multi_logloss: 0.66047
[70]	valid_0's multi_logloss: 0.623845
[80]	valid_0's multi_logloss: 0.592308
[90]	valid_0's multi_logloss: 0.565336
[100]	valid_0's multi_logloss: 0.542707
[110]	valid_0's multi_logloss: 0.523472
[120]	valid_0's multi_logloss: 0.507212
[130]	valid_0's multi_logloss: 0.493139
[140]	valid_0's multi_logloss: 0.481271
[150]	valid_0's multi_logloss: 0.471064
[160]	valid_0's multi_logloss: 0.462305
[170]	valid_0's multi_logloss: 0.454775
[180]	valid_0's multi_logloss: 0.448269
[190]	valid_0's multi_logloss: 0.442739
[200]	valid_0's multi_logloss: 0.438028
[210]	valid_0's multi_logloss: 0.433933
[220]	valid_0's multi_logloss: 0.430425
[230]	valid_0's multi_logloss: 0.427317
[240]	valid_0's multi_logloss

0.8447913883269077

In [365]:
# Stacking model, variant 2: same LightGBM setup as the previous stacker, but
# trained on the base-model class scores only (sim1..sim3 are excluded).
# Running this cell overwrites `oof`, `pred_stack_test`, `model` and `p`.
lgb_params = {
        'learning_rate':0.01,
        "objective": "multiclass",
        'num_class': 3,
        'boosting_type': "gbdt",
        'verbosity': -1,
        'n_jobs': -1,
        'seed': 19930820,
        "lambda_l1": 0,
        "lambda_l2": 0,
        "num_leaves": 8,
        "feature_fraction": 0.8,
        "bagging_fraction": 0.9,
        "bagging_freq": 1,
        'max_depth': -1,
        'max_bin':512,
    }
# Features: everything except the row id, the label and the sim columns.
X_cols = [c for c in tr.columns if c not in ['id', 'target'] + ['sim1', 'sim2', 'sim3']]
target_col = 'target'
kf = model_selection.KFold(n_splits=4)
# Out-of-fold class probabilities, one row per training row.
oof = np.zeros([len(tr), 3])
# Size the accumulator from `test_df`, the frame that is actually predicted on
# below. It used to be sized from `test`, a frame left over from an earlier
# cell, which only works while both happen to have the same number of rows.
pred_stack_test = np.zeros([len(test_df), 3])
for tr_idx, va_idx in kf.split(tr):
    trn = tr.iloc[tr_idx]
    val = tr.iloc[va_idx]
    tr_D = lgb.Dataset(trn[X_cols], trn[target_col])
    va_D = lgb.Dataset(val[X_cols], val[target_col])
    # NOTE: the `early_stopping_rounds` / `verbose_eval` keyword arguments were
    # removed from lgb.train in LightGBM 4.0; switch to the
    # lgb.early_stopping / lgb.log_evaluation callbacks when upgrading.
    model = lgb.train(lgb_params, tr_D, valid_sets=[va_D], num_boost_round=2000, early_stopping_rounds=100, verbose_eval=10)
    oof[va_idx, :] = model.predict(val[X_cols])
    # Fold predictions are summed, not averaged; the argmax is unaffected.
    pred_stack_test += model.predict(test_df[X_cols])
p = oof.argmax(axis=1)
# OOF micro-F1 of the stacker (displayed as the cell output).
metrics.f1_score(tr['target'], p, average='micro')

Training until validation scores don't improve for 100 rounds
[10]	valid_0's multi_logloss: 0.982807
[20]	valid_0's multi_logloss: 0.889429
[30]	valid_0's multi_logloss: 0.813272
[40]	valid_0's multi_logloss: 0.750277
[50]	valid_0's multi_logloss: 0.697619
[60]	valid_0's multi_logloss: 0.653254
[70]	valid_0's multi_logloss: 0.615814
[80]	valid_0's multi_logloss: 0.58402
[90]	valid_0's multi_logloss: 0.557287
[100]	valid_0's multi_logloss: 0.534318
[110]	valid_0's multi_logloss: 0.515005
[120]	valid_0's multi_logloss: 0.49826
[130]	valid_0's multi_logloss: 0.483897
[140]	valid_0's multi_logloss: 0.471618
[150]	valid_0's multi_logloss: 0.461161
[160]	valid_0's multi_logloss: 0.452222
[170]	valid_0's multi_logloss: 0.444502
[180]	valid_0's multi_logloss: 0.437881
[190]	valid_0's multi_logloss: 0.432189
[200]	valid_0's multi_logloss: 0.427263
[210]	valid_0's multi_logloss: 0.423023
[220]	valid_0's multi_logloss: 0.419403
[230]	valid_0's multi_logloss: 0.41628
[240]	valid_0's multi_logloss:

[120]	valid_0's multi_logloss: 0.506817
[130]	valid_0's multi_logloss: 0.492866
[140]	valid_0's multi_logloss: 0.480987
[150]	valid_0's multi_logloss: 0.470852
[160]	valid_0's multi_logloss: 0.462164
[170]	valid_0's multi_logloss: 0.454746
[180]	valid_0's multi_logloss: 0.448365
[190]	valid_0's multi_logloss: 0.442904
[200]	valid_0's multi_logloss: 0.438164
[210]	valid_0's multi_logloss: 0.434046
[220]	valid_0's multi_logloss: 0.43055
[230]	valid_0's multi_logloss: 0.427515
[240]	valid_0's multi_logloss: 0.424866
[250]	valid_0's multi_logloss: 0.422599
[260]	valid_0's multi_logloss: 0.420635
[270]	valid_0's multi_logloss: 0.418936
[280]	valid_0's multi_logloss: 0.417425
[290]	valid_0's multi_logloss: 0.416119
[300]	valid_0's multi_logloss: 0.414964
[310]	valid_0's multi_logloss: 0.413986
[320]	valid_0's multi_logloss: 0.413124
[330]	valid_0's multi_logloss: 0.412338
[340]	valid_0's multi_logloss: 0.411677
[350]	valid_0's multi_logloss: 0.411068
[360]	valid_0's multi_logloss: 0.410528
[

0.844409665228843

In [353]:
# Build the submission: hard label from the stacked test predictions held in
# `pred_stack_test` (whichever stacking cell ran last), written as id + class name.
index_to_category = {
    0: 'association',
    1: 'disagreement',
    2: 'unbiased',
}
sub = test_df[['id']].assign(target=pred_stack_test.argmax(axis=1))
sub = sub.assign(category=sub['target'].map(index_to_category))
sub.loc[:, ['id', 'category']].to_csv('../output/sub.csv', index=False)