In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb

import random
from collections import Counter, defaultdict

In [2]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    np.random.seed(seed)
    # ラベルの数をカウント
    labels_num = np.max(y) + 1
    # 各グループのラベルの数をカウントする
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1
    # 各フォールドのラベルの数をカウント
    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)

    for i in range(k):
        test_k = i
        # val_k = i+1 if i+1 != k else 0
        # print(val_k)
        train_groups = all_groups - groups_per_fold[test_k]  #  - groups_per_fold[val_k]
        # val_groups = groups_per_fold[val_k]
        test_groups = groups_per_fold[test_k]
        # print(test_groups)
        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        # val_indices = [i for i, g in enumerate(groups) if g in val_groups]
        # test_indices = {str(g): [i for i, g in enumerate(groups) if g in test_groups]}

        test_indices = []
        n_g = None
        test_list = []
        for i, g in enumerate(groups):
            if g in test_groups:
                if n_g is not None and n_g != g:
                    test_indices.append(test_list)
                    test_list = []
                test_list.append(i)
                n_g = g

        test_indices = [np.random.choice(i) for i in test_indices]
        yield train_indices, test_indices  # val_indices,

In [3]:
train_path = '/code/data/processed/proceeded_train_20200116_102618.csv'
test_path = '/code/data/processed/proceeded_test_20200116_102618.csv'

train = pd.read_csv(train_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)

In [4]:
train.head()

Unnamed: 0,Clip,Activity,Assessment,Game,session_title,count_correct_attempts,count_uncorrect_attempts,count_accuracy,acc_Cart Balancer (Assessment),acc_Cauldron Filler (Assessment),...,label_count_action_label_description_val,count_label_count_action_label_description_val,label_session_title_description_val,count_label_session_title_description_val,label_session_title_count_action_label,count_label_session_title_count_action_label,label_session_title_count_accuracy_label,count_label_session_title_count_accuracy_label,label_session_title_mean_accuracy_group_label,count_label_session_title_mean_accuracy_group_label
0,11,3,0,4,42,0,0,0.0,-1.0,-1.0,...,99,337,158,763,46,747,20,1115,20,1115
1,14,4,1,6,30,1,0,1.0,-1.0,-1.0,...,104,488,76,959,27,814,14,463,14,463
2,14,4,2,6,42,1,11,0.5,-1.0,-1.0,...,129,290,154,370,47,1006,22,1192,22,1036
3,24,9,4,10,42,2,11,0.5,-1.0,-1.0,...,130,522,155,534,47,1006,22,1192,22,1036
4,28,10,5,13,30,3,12,0.5,-1.0,-1.0,...,104,488,76,959,27,814,12,1012,12,883


In [5]:
train['target'] = 0
test['target'] = 1

In [6]:
test['target']

0      1
1      1
2      1
3      1
4      1
      ..
995    1
996    1
997    1
998    1
999    1
Name: target, Length: 1000, dtype: int64

In [7]:
train_df = pd.concat([train, test])
train_df.columns = train_df.columns.str.replace(',', '')

In [8]:
train_df.shape

(18690, 702)

In [9]:
y = train_df['target']
x = train_df.drop('target', axis=1)
groups = np.array(x['installation_id'])
lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
    }
x = x.drop(['accuracy_group', 'installation_id'], axis=1)

In [10]:
def lgb_regression(x, y, groups, lgb_params) -> pd.DataFrame:

    num_fold = 3
    pred = np.zeros(y.shape)
    all_importance = []

    for fold_ind, (train_ind, test_ind) in enumerate(
            stratified_group_k_fold(X=x, y=y, groups=groups, k=num_fold, seed=77)):
        x_train = x.iloc[train_ind]
        y_train = y.iloc[train_ind]
        x_test = x.iloc[test_ind]
        y_test = y.iloc[test_ind]
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_val = lgb.Dataset(x_test, y_test, reference=lgb_train)

        model = lgb.train(params=lgb_params,
                          train_set=lgb_train,
                          valid_sets=lgb_val)

        pred[test_ind] = model.predict(x_test, num_iteration=model.best_iteration)

        all_importance.append(pd.DataFrame(model.feature_importance('gain'), index=x_train.columns))

    all_importance = pd.concat(all_importance, axis=1)
    return pred, all_importance


In [11]:
pred, importance = lgb_regression(x, y, groups, lgb_params)

[1]	valid_0's auc: 0.530549
[2]	valid_0's auc: 0.568
[3]	valid_0's auc: 0.578914
[4]	valid_0's auc: 0.584547
[5]	valid_0's auc: 0.582844
[6]	valid_0's auc: 0.582641
[7]	valid_0's auc: 0.580566
[8]	valid_0's auc: 0.581874
[9]	valid_0's auc: 0.578889
[10]	valid_0's auc: 0.573098
[11]	valid_0's auc: 0.574261
[12]	valid_0's auc: 0.576296
[13]	valid_0's auc: 0.580833
[14]	valid_0's auc: 0.580249
[15]	valid_0's auc: 0.580892
[16]	valid_0's auc: 0.581141
[17]	valid_0's auc: 0.57895
[18]	valid_0's auc: 0.577672
[19]	valid_0's auc: 0.577922
[20]	valid_0's auc: 0.578021
[21]	valid_0's auc: 0.580347
[22]	valid_0's auc: 0.585092
[23]	valid_0's auc: 0.585072
[24]	valid_0's auc: 0.588135
[25]	valid_0's auc: 0.590367
[26]	valid_0's auc: 0.591797
[27]	valid_0's auc: 0.59528
[28]	valid_0's auc: 0.596011
[29]	valid_0's auc: 0.597525
[30]	valid_0's auc: 0.598491
[31]	valid_0's auc: 0.599039
[32]	valid_0's auc: 0.600118
[33]	valid_0's auc: 0.602339
[34]	valid_0's auc: 0.601359
[35]	valid_0's auc: 0.600436

In [12]:
importance.mean(axis=1).sort_values(ascending=False).head(20)

Assessment                  858.139293
count_correct_attempts      589.396695
ratio_Assessment            566.918432
duration_std                565.764501
2010                        389.923489
acc_0                       290.556852
mean_coordinates_x          288.921900
count_uncorrect_attempts    281.240407
std_coordinates_x           258.746443
std_coordinates_y           240.005289
ratio_event_code_4000       223.524323
duration_mean               222.061683
ratio_event_code_3000       207.564758
ratio_Clip                  201.282815
frequency                   200.469653
mean_size                   199.890797
good_comment_ratio          195.405997
month_10                    190.501958
mean_coordinates_y          181.264334
total_duration              180.878907
dtype: float64