In [52]:
import numpy as np
import pandas as pd
import lightgbm as lgb

import random
from collections import Counter, defaultdict

from sklearn.model_selection import GroupKFold

In [2]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    np.random.seed(seed)
    # ラベルの数をカウント
    labels_num = np.max(y) + 1
    # 各グループのラベルの数をカウントする
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1
    # 各フォールドのラベルの数をカウント
    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)

    for i in range(k):
        test_k = i
        train_groups = all_groups - groups_per_fold[test_k]  #  - groups_per_fold[val_k]
        test_groups = groups_per_fold[test_k]

        def choice_ind(group):
            indices = []
            n_g = None
            list_ = []
            for i, g in enumerate(groups):
                if g in group:
                    if n_g is not None and n_g != g:
                        indices.append(list_)
                        list_ = []
                    list_.append(i)
                    n_g = g

            indices = [np.random.choice(i) for i in indices]
            return indices

        train_indices = choice_ind(train_groups)
        test_indices = choice_ind(test_groups)
        # print(train_indices)
        yield train_indices, test_indices  # val_indices,

In [174]:
train_path = '/code/data/processed/proceeded_train_20200119_120438.csv'
test_path = '/code/data/processed/proceeded_test_20200119_120438.csv'

train = pd.read_csv(train_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)

In [175]:
train['target'] = 0
test['target'] = 1

In [183]:
train_df = pd.concat([train, test])
train_df.columns = train_df.columns.str.replace(',', '')
train_df = train_df.reset_index(drop=True)

In [184]:
y = train_df['target']
x = train_df.drop('target', axis=1)
lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
    }
x = x.drop(['accuracy_group'], axis=1)

In [185]:
from sklearn.model_selection import KFold

def lgb_regression(x, y, lgb_params) -> pd.DataFrame:

    num_fold = 3
    install_id = x['installation_id']
    unique_install_id = install_id.unique()

    kf = KFold(num_fold, shuffle=True, random_state=77)
    pred = np.zeros(y.shape)
    all_importance = []
    all_train_ind = []
    all_test_ind = []
    for fold_ind, (train_ind, test_ind) in enumerate(
            kf.split(unique_install_id)):
        tr_groups = unique_install_id[train_ind]
        test_groups = unique_install_id[test_ind]

        def choice_ind(groups):
            ind = []
            for g in groups:
                id = np.random.choice(x[x['installation_id']==g].index)
                ind.append(id)
            return ind

        train_ind = choice_ind(tr_groups)
        test_ind = choice_ind(test_groups)
        all_train_ind.append(train_ind)
        all_test_ind.append(test_ind)

        x_train = x.iloc[train_ind].drop('installation_id', axis=1)
        print(x_train)
        y_train = y.iloc[train_ind]
        x_test = x.iloc[test_ind].drop('installation_id', axis=1)
        print(x_test)
        y_test = y.iloc[test_ind]

        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_val = lgb.Dataset(x_test, y_test, reference=lgb_train)

        model = lgb.train(params=lgb_params,
                          train_set=lgb_train,
                          valid_sets=lgb_val)

        pred[test_ind] = model.predict(x_test, num_iteration=model.best_iteration)

        all_importance.append(pd.DataFrame(model.feature_importance('gain'), index=x_train.columns))

    all_importance = pd.concat(all_importance, axis=1)
    return pred, all_importance, all_train_ind, all_test_ind


In [186]:
pred, importance, all_train_ind, all_test_ind = lgb_regression(x, y, lgb_params)

     0.500000   
8                           0                         0        0.000000   
14                          0                         0        0.000000   
32                          1                         0        0.250000   
36                          1                         0        1.000000   
...                       ...                       ...             ...   
18677                       0                         0        0.000000   
18680                       2                         6        0.666667   
18681                       6                         2        0.833333   
18686                       1                         0        1.000000   
18688                       2                         4        0.250000   

       acc_Mushroom Sorter (Assessment)  acc_Chest Sorter (Assessment)  ...  \
4                                   0.5                           -1.0  ...   
8                                   0.0                           -1.0  ..

In [190]:
pred

array([0.        , 0.        , 0.        , ..., 0.12459382, 0.05883467,
       0.05577431])

In [189]:
importance.mean(axis=1).sort_values()

coordinates                  0.000000
0ce40006                     0.000000
info_9                       0.000000
58a0de5c                     0.000000
1c178d24                     0.000000
                             ...     
total_duration             260.312218
frequency                  415.926528
count_correct_attempts     468.441702
Assessment                 808.742301
acc_0                     1171.284232
Length: 558, dtype: float64

In [18]:
pred, importance = lgb_regression(x, y, groups, lgb_params)

24, 15332, 15333, 15348, 15357, 15361, 15362, 15365, 15378, 15383, 15386, 15390, 15393, 15397, 15399, 15403, 15405, 15410, 15417, 15425, 15426, 15427, 15448, 15451, 15455, 15459, 15466, 15479, 15485, 15496, 15501, 15512, 15516, 15519, 15529, 15531, 15533, 15537, 15539, 15542, 15546, 15547, 15549, 15551, 15555, 15561, 15564, 15569, 15574, 15578, 15581, 15582, 15590, 15592, 15593, 15595, 15602, 15607, 15620, 15625, 15635, 15649, 15655, 15660, 15665, 15671, 15683, 15687, 15691, 15692, 15702, 15709, 15713, 15716, 15717, 15733, 15740, 15743, 15768, 15776, 15780, 15783, 15792, 15794, 15797, 15807, 15811, 15813, 15819, 15822, 15823, 15827, 15830, 15831, 15832, 15838, 15852, 15854, 15859, 15861, 15866, 15889, 15890, 15900, 15901, 15903, 15913, 15928, 15931, 15943, 15944, 15946, 15948, 15950, 15952, 15962, 15972, 15977, 15982, 16002, 16003, 16006, 16007, 16010, 16013, 16017, 16021, 16023, 16024, 16025, 16038, 16040, 16050, 16054, 16058, 16061, 16068, 16078, 16095, 16110, 16124, 16128, 16140, 16

In [192]:
importance.mean(axis=1).sort_values(ascending=False).head(20)

acc_0                       1171.284232
Assessment                   808.742301
count_correct_attempts       468.441702
frequency                    415.926528
total_duration               260.312218
count_uncorrect_attempts     211.994156
mean_accuracy_group          164.933594
count_accuracy               156.780484
Clip                         153.792881
2000                         146.253296
args_1                       142.065855
Welcome to Lost Lagoon!      133.394773
info_8                       131.348084
dayofweek_3                  131.319402
good_comment                 125.733654
game_mean_event_count        120.312932
dayofweek_1                  116.907198
dayofweek_6                  115.161496
4100                         112.224122
4070                         111.209331
dtype: float64

In [20]:
x.drop(['frequency'], axis=1, inplace=True)

In [21]:
pred, importance = lgb_regression(x, y, groups, lgb_params)

 15324, 15332, 15333, 15348, 15357, 15361, 15362, 15365, 15378, 15383, 15386, 15390, 15393, 15397, 15399, 15403, 15405, 15410, 15417, 15425, 15426, 15427, 15448, 15451, 15455, 15459, 15466, 15479, 15485, 15496, 15501, 15512, 15516, 15519, 15529, 15531, 15533, 15537, 15539, 15542, 15546, 15547, 15549, 15551, 15555, 15561, 15564, 15569, 15574, 15578, 15581, 15582, 15590, 15592, 15593, 15595, 15602, 15607, 15620, 15625, 15635, 15649, 15655, 15660, 15665, 15671, 15683, 15687, 15691, 15692, 15702, 15709, 15713, 15716, 15717, 15733, 15740, 15743, 15768, 15776, 15780, 15783, 15792, 15794, 15797, 15807, 15811, 15813, 15819, 15822, 15823, 15827, 15830, 15831, 15832, 15838, 15852, 15854, 15859, 15861, 15866, 15889, 15890, 15900, 15901, 15903, 15913, 15928, 15931, 15943, 15944, 15946, 15948, 15950, 15952, 15962, 15972, 15977, 15982, 16002, 16003, 16006, 16007, 16010, 16013, 16017, 16021, 16023, 16024, 16025, 16038, 16040, 16050, 16054, 16058, 16061, 16068, 16078, 16095, 16110, 16124, 16128, 16140

In [23]:
importance.mean(axis=1).sort_values(ascending=False).head(20)

total_duration                      639.633032
Clip                                470.432223
acc_Cart Balancer (Assessment)      327.845625
acc_Cauldron Filler (Assessment)    315.261216
count_uncorrect_attempts            232.683823
3bfd1a65                            226.402568
7ad3efc6                            215.972835
acc_Mushroom Sorter (Assessment)    186.146940
2000                                157.962033
good_comment                        153.532875
90d848e0                            152.358725
args_1                              149.386477
dayofweek_1                         149.254494
4070                                144.308858
dayofweek_0                         130.031789
info_8                              126.118112
game_mean_event_count               123.153610
info_7                              117.321724
count_actions                       117.056071
dayofweek_6                         116.520521
dtype: float64

In [24]:
x.drop(['total_duration'], axis=1, inplace=True)

In [25]:
pred, importance = lgb_regression(x, y, groups, lgb_params)

312, 15324, 15332, 15333, 15348, 15357, 15361, 15362, 15365, 15378, 15383, 15386, 15390, 15393, 15397, 15399, 15403, 15405, 15410, 15417, 15425, 15426, 15427, 15448, 15451, 15455, 15459, 15466, 15479, 15485, 15496, 15501, 15512, 15516, 15519, 15529, 15531, 15533, 15537, 15539, 15542, 15546, 15547, 15549, 15551, 15555, 15561, 15564, 15569, 15574, 15578, 15581, 15582, 15590, 15592, 15593, 15595, 15602, 15607, 15620, 15625, 15635, 15649, 15655, 15660, 15665, 15671, 15683, 15687, 15691, 15692, 15702, 15709, 15713, 15716, 15717, 15733, 15740, 15743, 15768, 15776, 15780, 15783, 15792, 15794, 15797, 15807, 15811, 15813, 15819, 15822, 15823, 15827, 15830, 15831, 15832, 15838, 15852, 15854, 15859, 15861, 15866, 15889, 15890, 15900, 15901, 15903, 15913, 15928, 15931, 15943, 15944, 15946, 15948, 15950, 15952, 15962, 15972, 15977, 15982, 16002, 16003, 16006, 16007, 16010, 16013, 16017, 16021, 16023, 16024, 16025, 16038, 16040, 16050, 16054, 16058, 16061, 16068, 16078, 16095, 16110, 16124, 16128, 1

In [26]:
importance.mean(axis=1).sort_values(ascending=False).head(20)

acc_Cauldron Filler (Assessment)    490.727896
Clip                                407.299976
3bfd1a65                            370.854152
acc_Cart Balancer (Assessment)      339.073003
7ad3efc6                            336.623069
acc_Mushroom Sorter (Assessment)    242.524746
count_uncorrect_attempts            229.816326
2000                                185.070773
4070                                163.558103
dayofweek_0                         156.874173
args_1                              154.543166
good_comment                        151.112981
info_8                              146.596503
dayofweek_6                         140.880771
dayofweek_1                         137.529724
count_correct_attempts              135.541106
90d848e0                            135.427507
dayofweek_4                         121.331728
game_mean_event_count               119.528539
args_19                             118.971213
dtype: float64