In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

import random
from collections import Counter, defaultdict

In [25]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    np.random.seed(seed)
    # ラベルの数をカウント
    labels_num = np.max(y) + 1
    # 各グループのラベルの数をカウントする
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1
    # 各フォールドのラベルの数をカウント
    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)

    for i in range(k):
        test_k = i
        # val_k = i+1 if i+1 != k else 0
        # print(val_k)
        train_groups = all_groups - groups_per_fold[test_k]  #  - groups_per_fold[val_k]
        # val_groups = groups_per_fold[val_k]
        test_groups = groups_per_fold[test_k]
        # print(test_groups)
        # train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        # val_indices = [i for i, g in enumerate(groups) if g in val_groups]
        # test_indices = {str(g): [i for i, g in enumerate(groups) if g in test_groups]}

        def choice_ind(group):
            indices = []
            n_g = None
            list_ = []
            for i, g in enumerate(groups):
                if g in group:
                    if n_g is not None and n_g != g:
                        indices.append(list_)
                        list_ = []
                    list_.append(i)
                    n_g = g

            indices = [np.random.choice(i) for i in indices]
            return indices

        train_indices = choice_ind(train_groups)
        test_indices = choice_ind(test_groups)
        # print(train_indices)
        yield train_indices, test_indices  # val_indices,

In [3]:
train_path = '/code/data/processed/proceeded_train_20200116_102618.csv'
test_path = '/code/data/processed/proceeded_test_20200116_102618.csv'

train = pd.read_csv(train_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)

In [4]:
train.head()

Unnamed: 0,Clip,Activity,Assessment,Game,session_title,count_correct_attempts,count_uncorrect_attempts,count_accuracy,acc_Cart Balancer (Assessment),acc_Cauldron Filler (Assessment),...,label_count_action_label_description_val,count_label_count_action_label_description_val,label_session_title_description_val,count_label_session_title_description_val,label_session_title_count_action_label,count_label_session_title_count_action_label,label_session_title_count_accuracy_label,count_label_session_title_count_accuracy_label,label_session_title_mean_accuracy_group_label,count_label_session_title_mean_accuracy_group_label
0,11,3,0,4,42,0,0,0.0,-1.0,-1.0,...,99,337,158,763,46,747,20,1115,20,1115
1,14,4,1,6,30,1,0,1.0,-1.0,-1.0,...,104,488,76,959,27,814,14,463,14,463
2,14,4,2,6,42,1,11,0.5,-1.0,-1.0,...,129,290,154,370,47,1006,22,1192,22,1036
3,24,9,4,10,42,2,11,0.5,-1.0,-1.0,...,130,522,155,534,47,1006,22,1192,22,1036
4,28,10,5,13,30,3,12,0.5,-1.0,-1.0,...,104,488,76,959,27,814,12,1012,12,883


In [5]:
train['target'] = 0
test['target'] = 1

In [6]:
test['target']

0      1
1      1
2      1
3      1
4      1
      ..
995    1
996    1
997    1
998    1
999    1
Name: target, Length: 1000, dtype: int64

In [7]:
train_df = pd.concat([train, test])
train_df.columns = train_df.columns.str.replace(',', '')

In [15]:
train_df

Unnamed: 0,Clip,Activity,Assessment,Game,session_title,count_correct_attempts,count_uncorrect_attempts,count_accuracy,acc_Cart Balancer (Assessment),acc_Cauldron Filler (Assessment),...,count_label_count_action_label_description_val,label_session_title_description_val,count_label_session_title_description_val,label_session_title_count_action_label,count_label_session_title_count_action_label,label_session_title_count_accuracy_label,count_label_session_title_count_accuracy_label,label_session_title_mean_accuracy_group_label,count_label_session_title_mean_accuracy_group_label,target
0,11,3,0,4,42,0,0,0.000000,-1.0,-1.0,...,337.0,158,763.0,46,747,20,1115,20,1115,0
1,14,4,1,6,30,1,0,1.000000,-1.0,-1.0,...,488.0,76,959.0,27,814,14,463,14,463,0
2,14,4,2,6,42,1,11,0.500000,-1.0,-1.0,...,290.0,154,370.0,47,1006,22,1192,22,1036,0
3,24,9,4,10,42,2,11,0.500000,-1.0,-1.0,...,522.0,155,534.0,47,1006,22,1192,22,1036,0
4,28,10,5,13,30,3,12,0.500000,-1.0,-1.0,...,488.0,76,959.0,27,814,12,1012,12,883,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,4,0,3,1,19,2,0,0.666667,-1.0,-1.0,...,357.0,68,380.0,15,453,7,1256,8,870,1
996,11,2,1,2,39,1,0,1.000000,1.0,-1.0,...,357.0,125,913.0,35,351,19,532,19,532,1
997,32,2,4,0,13,3,6,0.583333,-1.0,1.0,...,572.0,29,416.0,6,963,2,1308,2,1163,1
998,11,3,3,1,13,2,4,0.250000,-1.0,0.0,...,87.0,21,135.0,5,654,1,557,2,1163,1


In [8]:
train_df.shape

(18690, 702)

In [22]:
y = train_df['target']
x = train_df.drop('target', axis=1)
groups = np.array(x['installation_id'])
lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
    }
x = x.drop(['accuracy_group', 'installation_id'], axis=1)

In [26]:
def lgb_regression(x, y, groups, lgb_params) -> pd.DataFrame:

    num_fold = 3
    pred = np.zeros(y.shape)
    all_importance = []

    for fold_ind, (train_ind, test_ind) in enumerate(
            stratified_group_k_fold(X=x, y=y, groups=groups, k=num_fold, seed=77)):
        print(train_ind)
        print(test_ind)
        x_train = x.iloc[train_ind]
        y_train = y.iloc[train_ind]
        x_test = x.iloc[test_ind]
        y_test = y.iloc[test_ind]
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_val = lgb.Dataset(x_test, y_test, reference=lgb_train)

        model = lgb.train(params=lgb_params,
                          train_set=lgb_train,
                          valid_sets=lgb_val)

        pred[test_ind] = model.predict(x_test, num_iteration=model.best_iteration)

        all_importance.append(pd.DataFrame(model.feature_importance('gain'), index=x_train.columns))

    all_importance = pd.concat(all_importance, axis=1)
    return pred, all_importance


In [27]:
pred, importance = lgb_regression(x, y, groups, lgb_params)

2, 15324, 15332, 15333, 15348, 15357, 15361, 15362, 15365, 15378, 15383, 15386, 15390, 15393, 15397, 15399, 15403, 15405, 15410, 15417, 15425, 15426, 15427, 15448, 15451, 15455, 15459, 15466, 15479, 15485, 15496, 15501, 15512, 15516, 15519, 15529, 15531, 15533, 15537, 15539, 15542, 15546, 15547, 15549, 15551, 15555, 15561, 15564, 15569, 15574, 15578, 15581, 15582, 15590, 15592, 15593, 15595, 15602, 15607, 15620, 15625, 15635, 15649, 15655, 15660, 15665, 15671, 15683, 15687, 15691, 15692, 15702, 15709, 15713, 15716, 15717, 15733, 15740, 15743, 15768, 15776, 15780, 15783, 15792, 15794, 15797, 15807, 15811, 15813, 15819, 15822, 15823, 15827, 15830, 15831, 15832, 15838, 15852, 15854, 15859, 15861, 15866, 15889, 15890, 15900, 15901, 15903, 15913, 15928, 15931, 15943, 15944, 15946, 15948, 15950, 15952, 15962, 15972, 15977, 15982, 16002, 16003, 16006, 16007, 16010, 16013, 16017, 16021, 16023, 16024, 16025, 16038, 16040, 16050, 16054, 16058, 16061, 16068, 16078, 16095, 16110, 16124, 16128, 161

In [28]:
pred.max()

0.9958722886817871

In [29]:
importance.mean(axis=1).sort_values(ascending=False).head(20)

duration_mean                                  275.415255
ratio_Assessment                               240.351286
std_coordinates_x                              228.805673
frequency                                      213.725680
mean_coordinates_x                             205.292363
duration_std                                   203.323524
good_comment_ratio                             201.963101
duration_median                                190.458946
total_duration                                 182.752400
mean_coordinates_y                             171.651708
std_coordinates_y                              143.276855
ratio_event_code_2000                          141.358897
ratio_event_code_3000                          134.675560
ratio_Game                                     129.695904
ratio_Clip                                     127.490226
count_label_frequency_label_description_val    121.370596
Clip                                           120.305007
Assessment    