In [37]:
import numpy as np
import pandas as pd
import lightgbm as lgb

import random
from collections import Counter, defaultdict

In [38]:
def stratified_group_k_fold(X, y, groups, k, seed=None):
    np.random.seed(seed)
    # ラベルの数をカウント
    labels_num = np.max(y) + 1
    # 各グループのラベルの数をカウントする
    y_counts_per_group = defaultdict(lambda: np.zeros(labels_num))
    y_distr = Counter()
    for label, g in zip(y, groups):
        y_counts_per_group[g][label] += 1
        y_distr[label] += 1
    # 各フォールドのラベルの数をカウント
    y_counts_per_fold = defaultdict(lambda: np.zeros(labels_num))
    groups_per_fold = defaultdict(set)

    def eval_y_counts_per_fold(y_counts, fold):
        y_counts_per_fold[fold] += y_counts
        std_per_label = []
        for label in range(labels_num):
            label_std = np.std([y_counts_per_fold[i][label] / y_distr[label] for i in range(k)])
            std_per_label.append(label_std)
        y_counts_per_fold[fold] -= y_counts
        return np.mean(std_per_label)

    groups_and_y_counts = list(y_counts_per_group.items())
    random.Random(seed).shuffle(groups_and_y_counts)

    for g, y_counts in sorted(groups_and_y_counts, key=lambda x: -np.std(x[1])):
        best_fold = None
        min_eval = None
        for i in range(k):
            fold_eval = eval_y_counts_per_fold(y_counts, i)
            if min_eval is None or fold_eval < min_eval:
                min_eval = fold_eval
                best_fold = i
        y_counts_per_fold[best_fold] += y_counts
        groups_per_fold[best_fold].add(g)

    all_groups = set(groups)

    for i in range(k):
        test_k = i
        # val_k = i+1 if i+1 != k else 0
        # print(val_k)
        train_groups = all_groups - groups_per_fold[test_k]  #  - groups_per_fold[val_k]
        # val_groups = groups_per_fold[val_k]
        test_groups = groups_per_fold[test_k]
        # print(test_groups)
        train_indices = [i for i, g in enumerate(groups) if g in train_groups]
        # val_indices = [i for i, g in enumerate(groups) if g in val_groups]
        # test_indices = {str(g): [i for i, g in enumerate(groups) if g in test_groups]}

        test_indices = []
        n_g = None
        test_list = []
        for i, g in enumerate(groups):
            if g in test_groups:
                if n_g is not None and n_g != g:
                    test_indices.append(test_list)
                    test_list = []
                test_list.append(i)
                n_g = g

        test_indices = [np.random.choice(i) for i in test_indices]
        yield train_indices, test_indices  # val_indices,

In [22]:
train_path = '/code/data/processed/proceeded_train_20200113_143512.csv'
test_path = '/code/data/processed/proceeded_test_20200113_143512.csv'

train = pd.read_csv(train_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)

In [23]:
train.head()

Unnamed: 0,Clip,Activity,Assessment,Game,session_title,count_correct_attempts,count_uncorrect_attempts,count_accuracy,acc_Mushroom Sorter (Assessment),acc_Chest Sorter (Assessment),...,mean_game_level,so_cool,greatjob,installation_session_count,installation_duration_mean,installation_title_nunique,sum_event_code_2000,sum_event_code_3000,sum_event_code_4000,installation_event_code_count_mean
0,11,3,0,4,0,0,0,0.0,-1.0,-1.0,...,6.0,20,6,5,36.99,2,79,188,380,1553.0
1,14,4,1,6,41,1,0,1.0,1.0,-1.0,...,6.0,20,6,5,36.99,2,101,503,539,1553.0
2,14,4,2,6,0,1,11,0.5,1.0,-1.0,...,6.0,20,6,5,36.99,2,103,529,598,1553.0
3,24,9,4,10,0,2,11,0.5,0.0,-1.0,...,5.0,36,12,5,36.99,2,196,807,1146,1553.0
4,28,10,5,13,41,3,12,0.5,0.5,-1.0,...,5.0,36,12,5,36.99,2,232,1086,1258,1553.0


In [73]:
test.Assessment.value_counts()/len(test.Assessment)


0     0.215
1     0.192
2     0.139
3     0.107
4     0.068
6     0.049
5     0.045
7     0.035
9     0.026
8     0.020
10    0.016
12    0.016
14    0.012
11    0.010
16    0.006
13    0.006
17    0.006
18    0.005
15    0.004
21    0.002
34    0.002
27    0.002
26    0.002
20    0.002
19    0.002
22    0.001
23    0.001
25    0.001
44    0.001
29    0.001
30    0.001
31    0.001
32    0.001
35    0.001
43    0.001
56    0.001
Name: Assessment, dtype: float64

In [74]:
train.Assessment.value_counts()/len(train.Assessment)

0      0.174901
1      0.129847
2      0.103222
3      0.081967
4      0.067326
         ...   
175    0.000057
136    0.000057
152    0.000057
168    0.000057
183    0.000057
Name: Assessment, Length: 187, dtype: float64

In [82]:
sum(train.Assessment>56)

288

In [24]:
test.shape

(1000, 542)

In [25]:
train['target'] = 0
test['target'] = 1

In [26]:
test['target']

0      1
1      1
2      1
3      1
4      1
      ..
995    1
996    1
997    1
998    1
999    1
Name: target, Length: 1000, dtype: int64

In [41]:
train_df =pd.concat([train, test])
train_df.columns = train_df.columns.str.replace(',', '')

In [42]:
train_df.shape

(18690, 543)

In [43]:
y = train_df['target']
x = train_df.drop('target', axis=1)
groups = np.array(x['installation_id'])
lgb_params = {
        'objective': 'binary',
        'metric': 'auc',
    }
x = x.drop('installation_id', axis=1)

In [44]:
def lgb_regression(x, y, groups, lgb_params) -> pd.DataFrame:

    num_fold = 8
    pred = np.zeros(y.shape)
    all_importance = []

    for fold_ind, (train_ind, test_ind) in enumerate(
            stratified_group_k_fold(X=x, y=y, groups=groups, k=num_fold, seed=77)):
        x_train = x.iloc[train_ind]
        y_train = y.iloc[train_ind]
        x_test = x.iloc[test_ind]
        y_test = y.iloc[test_ind]
        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_val = lgb.Dataset(x_test, y_test, reference=lgb_train)

        model = lgb.train(params=lgb_params,
                          train_set=lgb_train,
                          valid_sets=lgb_val)

        pred[test_ind] = model.predict(x_test, num_iteration=model.best_iteration)

        all_importance.append(pd.DataFrame(model.feature_importance('gain'), index=x_train.columns))

    all_importance = pd.concat(all_importance, axis=1)
    return pred, all_importance


In [45]:
pred, importance = lgb_regression(x, y, groups, lgb_params)

[7]	valid_0's auc: 0.988824
[8]	valid_0's auc: 0.988574
[9]	valid_0's auc: 0.988753
[10]	valid_0's auc: 0.988806
[11]	valid_0's auc: 0.988806
[12]	valid_0's auc: 0.989074
[13]	valid_0's auc: 0.988931
[14]	valid_0's auc: 0.98827
[15]	valid_0's auc: 0.988127
[16]	valid_0's auc: 0.988109
[17]	valid_0's auc: 0.988824
[18]	valid_0's auc: 0.989003
[19]	valid_0's auc: 0.988699
[20]	valid_0's auc: 0.98877
[21]	valid_0's auc: 0.988896
[22]	valid_0's auc: 0.988735
[23]	valid_0's auc: 0.988717
[24]	valid_0's auc: 0.988699
[25]	valid_0's auc: 0.989521
[26]	valid_0's auc: 0.989378
[27]	valid_0's auc: 0.989665
[28]	valid_0's auc: 0.989718
[29]	valid_0's auc: 0.98979
[30]	valid_0's auc: 0.989504
[31]	valid_0's auc: 0.98945
[32]	valid_0's auc: 0.989718
[33]	valid_0's auc: 0.989933
[34]	valid_0's auc: 0.989915
[35]	valid_0's auc: 0.989718
[36]	valid_0's auc: 0.989754
[37]	valid_0's auc: 0.989808
[38]	valid_0's auc: 0.989772
[39]	valid_0's auc: 0.98979
[40]	valid_0's auc: 0.989861
[41]	valid_0's auc: 0.

In [52]:
importance.mean(axis=1).sort_values(ascending=False).head(20)

installation_session_count    23250.888813
accuracy_group                 6608.824734
Assessment                     1398.674609
count_correct_attempts          568.599501
session_title                   297.789299
count_uncorrect_attempts        208.559019
total_duration                  173.250146
frequency                       162.320610
4020                            109.943887
4070                            109.749494
4035                             96.760540
game_mean_event_count            90.888499
Clip                             89.127022
args_1                           84.723639
mean_game_round                  79.380634
sum_event_code_2000              75.029718
args_6                           72.252024
0                                68.284594
info_7                           65.225129
Magma Peak - Level 1             63.580822
dtype: float64

In [53]:
x = x.drop(['installation_session_count', 'accuracy_group'], axis=1)

In [54]:
pred, importance = lgb_regression(x, y, groups, lgb_params)

: 0.94505
[7]	valid_0's auc: 0.945819
[8]	valid_0's auc: 0.946231
[9]	valid_0's auc: 0.947214
[10]	valid_0's auc: 0.9475
[11]	valid_0's auc: 0.948448
[12]	valid_0's auc: 0.948001
[13]	valid_0's auc: 0.949583
[14]	valid_0's auc: 0.949869
[15]	valid_0's auc: 0.950674
[16]	valid_0's auc: 0.951175
[17]	valid_0's auc: 0.951801
[18]	valid_0's auc: 0.952069
[19]	valid_0's auc: 0.951854
[20]	valid_0's auc: 0.95164
[21]	valid_0's auc: 0.951658
[22]	valid_0's auc: 0.951747
[23]	valid_0's auc: 0.951532
[24]	valid_0's auc: 0.95164
[25]	valid_0's auc: 0.951693
[26]	valid_0's auc: 0.95105
[27]	valid_0's auc: 0.951193
[28]	valid_0's auc: 0.951675
[29]	valid_0's auc: 0.95164
[30]	valid_0's auc: 0.951675
[31]	valid_0's auc: 0.951997
[32]	valid_0's auc: 0.951872
[33]	valid_0's auc: 0.951872
[34]	valid_0's auc: 0.951783
[35]	valid_0's auc: 0.951935
[36]	valid_0's auc: 0.952042
[37]	valid_0's auc: 0.95231
[38]	valid_0's auc: 0.952471
[39]	valid_0's auc: 0.952382
[40]	valid_0's auc: 0.952909
[41]	valid_0's

In [55]:
importance.mean(axis=1).sort_values(ascending=False).head(20)

installation_title_nunique        19824.908691
Assessment                         3279.361196
installation_duration_mean         1506.882978
duration_mean                       716.780994
4025                                302.229632
frequency                           256.426395
session_title                       229.204865
acc_Cart Balancer (Assessment)      175.137729
count_correct_attempts              166.448591
total_duration                      162.473571
4070                                158.329737
info_8                              141.169509
args_1                              137.224695
Sandcastle Builder (Activity)       130.561635
count_accuracy                      128.665204
4030                                126.942328
mean_accuracy_group                 110.601671
Magma Peak - Level 1                110.109623
duration_max                        108.832744
count_uncorrect_attempts            104.788689
dtype: float64

In [56]:
x = x.drop(['installation_title_nunique'], axis=1) 

In [57]:
pred, importance = lgb_regression(x, y, groups, lgb_params)

705
[7]	valid_0's auc: 0.78385
[8]	valid_0's auc: 0.788114
[9]	valid_0's auc: 0.786487
[10]	valid_0's auc: 0.778995
[11]	valid_0's auc: 0.788615
[12]	valid_0's auc: 0.789634
[13]	valid_0's auc: 0.788838
[14]	valid_0's auc: 0.790716
[15]	valid_0's auc: 0.796903
[16]	valid_0's auc: 0.793854
[17]	valid_0's auc: 0.796098
[18]	valid_0's auc: 0.792906
[19]	valid_0's auc: 0.790171
[20]	valid_0's auc: 0.787935
[21]	valid_0's auc: 0.793961
[22]	valid_0's auc: 0.795106
[23]	valid_0's auc: 0.795267
[24]	valid_0's auc: 0.794489
[25]	valid_0's auc: 0.794417
[26]	valid_0's auc: 0.793327
[27]	valid_0's auc: 0.794864
[28]	valid_0's auc: 0.795991
[29]	valid_0's auc: 0.795794
[30]	valid_0's auc: 0.798396
[31]	valid_0's auc: 0.798494
[32]	valid_0's auc: 0.799076
[33]	valid_0's auc: 0.797913
[34]	valid_0's auc: 0.797681
[35]	valid_0's auc: 0.795338
[36]	valid_0's auc: 0.794444
[37]	valid_0's auc: 0.795016
[38]	valid_0's auc: 0.80233
[39]	valid_0's auc: 0.801901
[40]	valid_0's auc: 0.801668
[41]	valid_0's 

In [58]:
importance.mean(axis=1).sort_values(ascending=False).head(20)

installation_duration_mean            6450.532554
duration_mean                         2818.935551
Assessment                            1432.000039
installation_event_code_count_mean    1071.202923
duration_max                           691.965703
count_correct_attempts                 554.516523
count_actions                          474.887160
0                                      369.437510
4070                                   253.700206
frequency                              210.035845
count_uncorrect_attempts               195.482103
2010                                   195.219121
total_duration                         192.310235
4050                                   177.476826
mean_game_duration                     177.025606
587b5989                               152.540137
sum_event_code_4000                    151.808507
565a3990                               149.455313
4030                                   147.736911
game_mean_event_count                  137.089797


In [60]:
x = x.drop(['installation_duration_mean'], axis=1) 

In [61]:
pred, importance = lgb_regression(x, y, groups, lgb_params)

[7]	valid_0's auc: 0.692422
[8]	valid_0's auc: 0.692198
[9]	valid_0's auc: 0.696159
[10]	valid_0's auc: 0.699485
[11]	valid_0's auc: 0.699494
[12]	valid_0's auc: 0.697384
[13]	valid_0's auc: 0.703473
[14]	valid_0's auc: 0.704742
[15]	valid_0's auc: 0.704769
[16]	valid_0's auc: 0.702623
[17]	valid_0's auc: 0.702024
[18]	valid_0's auc: 0.7029
[19]	valid_0's auc: 0.701631
[20]	valid_0's auc: 0.705788
[21]	valid_0's auc: 0.705824
[22]	valid_0's auc: 0.704823
[23]	valid_0's auc: 0.707693
[24]	valid_0's auc: 0.712091
[25]	valid_0's auc: 0.714988
[26]	valid_0's auc: 0.715829
[27]	valid_0's auc: 0.717188
[28]	valid_0's auc: 0.719262
[29]	valid_0's auc: 0.722749
[30]	valid_0's auc: 0.723875
[31]	valid_0's auc: 0.72738
[32]	valid_0's auc: 0.726468
[33]	valid_0's auc: 0.728739
[34]	valid_0's auc: 0.730313
[35]	valid_0's auc: 0.732494
[36]	valid_0's auc: 0.733174
[37]	valid_0's auc: 0.734711
[38]	valid_0's auc: 0.733317
[39]	valid_0's auc: 0.731314
[40]	valid_0's auc: 0.732262
[41]	valid_0's auc: 

In [62]:
importance.mean(axis=1).sort_values(ascending=False).head(20)

installation_event_code_count_mean    2414.344731
count_actions                         1862.044062
Assessment                            1183.334565
count_correct_attempts                1093.162348
duration_mean                          740.911715
0                                      601.416505
duration_max                           464.674686
count_uncorrect_attempts               383.755275
frequency                              230.261658
4020                                   228.919536
4030                                   211.674159
mean_game_duration                     211.437108
sum_event_code_4000                    211.111909
mean_game_round                        195.895013
4050                                   194.546577
2010                                   185.269307
game_mean_event_count                  185.194587
sum_game_duration                      182.130796
total_duration                         181.858096
db02c830                               177.678108


In [63]:
x = x.drop(['installation_event_code_count_mean'], axis=1) 
pred, importance = lgb_regression(x, y, groups, lgb_params)


id_0's auc: 0.646788
[8]	valid_0's auc: 0.650499
[9]	valid_0's auc: 0.659019
[10]	valid_0's auc: 0.65606
[11]	valid_0's auc: 0.649435
[12]	valid_0's auc: 0.659466
[13]	valid_0's auc: 0.658984
[14]	valid_0's auc: 0.654907
[15]	valid_0's auc: 0.654433
[16]	valid_0's auc: 0.653557
[17]	valid_0's auc: 0.649954
[18]	valid_0's auc: 0.651098
[19]	valid_0's auc: 0.648416
[20]	valid_0's auc: 0.646574
[21]	valid_0's auc: 0.645894
[22]	valid_0's auc: 0.645492
[23]	valid_0's auc: 0.641218
[24]	valid_0's auc: 0.63884
[25]	valid_0's auc: 0.637982
[26]	valid_0's auc: 0.641978
[27]	valid_0's auc: 0.643436
[28]	valid_0's auc: 0.641809
[29]	valid_0's auc: 0.639967
[30]	valid_0's auc: 0.642041
[31]	valid_0's auc: 0.641236
[32]	valid_0's auc: 0.639144
[33]	valid_0's auc: 0.642086
[34]	valid_0's auc: 0.643284
[35]	valid_0's auc: 0.644392
[36]	valid_0's auc: 0.641898
[37]	valid_0's auc: 0.64213
[38]	valid_0's auc: 0.644285
[39]	valid_0's auc: 0.647611
[40]	valid_0's auc: 0.647987
[41]	valid_0's auc: 0.64788

In [65]:
importance.mean(axis=1).sort_values(ascending=False).head(50)

Assessment                       1410.063828
count_correct_attempts           1150.061852
duration_mean                     893.226619
0                                 554.050847
duration_max                      457.286662
4070                              401.657602
count_uncorrect_attempts          335.615326
frequency                         301.235413
total_duration                    300.062503
2010                              260.566189
4100                              242.654864
mean_game_duration                238.478051
db02c830                          228.520066
mean_game_round                   220.015048
args_1                            209.364742
4050                              207.966627
4030                              194.849952
info_8                            187.210440
587b5989                          184.361248
game_mean_event_count             177.222922
4035                              176.079355
args_6                            173.059398
sum_game_d

In [116]:
importance.loc['session_title']

0    16.259590
0    18.399180
0    32.779990
0     2.790970
0    26.130560
0    40.678490
0    34.685639
0    10.600580
Name: session_title, dtype: float64

In [117]:
importance.loc['Clip']


0     74.243039
0    146.971561
0     75.992951
0    107.774130
0    139.258680
0    117.598090
0    124.780860
0     90.552450
Name: Clip, dtype: float64

In [83]:
a = {'a': 1, 'b': 4}

In [111]:
{i: j/sum(a.values()) for i, j in zip(a.keys(), a.values())}

{'a': 0.2, 'b': 0.8}

In [108]:
sum(a.values())

5

In [121]:
x = x.reset_index()

In [124]:
x = x.drop('index', axis=1)

In [132]:
x[['Clip_ratio', 'Assessment_ratio', 'Game_ratio', 'Activity_ratio']] = x[['Clip', 'Assessment', 'Game', 'Activity']].apply(lambda x: x/sum(x), axis=1)

In [133]:
x = x.drop(['Clip', 'Assessment', 'Game', 'Activity'], axis=1)

In [134]:
pred, importance = lgb_regression(x, y, groups, lgb_params)


[7]	valid_0's auc: 0.620735
[8]	valid_0's auc: 0.62576
[9]	valid_0's auc: 0.623856
[10]	valid_0's auc: 0.622702
[11]	valid_0's auc: 0.617186
[12]	valid_0's auc: 0.623972
[13]	valid_0's auc: 0.621799
[14]	valid_0's auc: 0.627208
[15]	valid_0's auc: 0.629274
[16]	valid_0's auc: 0.631357
[17]	valid_0's auc: 0.627486
[18]	valid_0's auc: 0.62711
[19]	valid_0's auc: 0.628147
[20]	valid_0's auc: 0.627119
[21]	valid_0's auc: 0.621951
[22]	valid_0's auc: 0.620574
[23]	valid_0's auc: 0.622774
[24]	valid_0's auc: 0.626064
[25]	valid_0's auc: 0.629086
[26]	valid_0's auc: 0.631643
[27]	valid_0's auc: 0.636498
[28]	valid_0's auc: 0.638322
[29]	valid_0's auc: 0.638214
[30]	valid_0's auc: 0.640047
[31]	valid_0's auc: 0.640011
[32]	valid_0's auc: 0.641782
[33]	valid_0's auc: 0.642408
[34]	valid_0's auc: 0.645787
[35]	valid_0's auc: 0.646333
[36]	valid_0's auc: 0.64754
[37]	valid_0's auc: 0.648291
[38]	valid_0's auc: 0.647879
[39]	valid_0's auc: 0.64754
[40]	valid_0's auc: 0.6472
[41]	valid_0's auc: 0.

In [137]:
importance.mean(axis=1).sort_values(ascending=False).head(50)

Assessment_ratio            1480.854546
count_correct_attempts      1096.795032
duration_mean                948.112836
duration_max                 480.998904
4070                         370.616316
0                            350.842691
count_uncorrect_attempts     325.618108
frequency                    284.531168
Clip_ratio                   276.962983
total_duration               265.301160
mean_game_duration           257.839239
Game_ratio                   223.347436
Activity_ratio               204.585007
game_mean_event_count        199.310560
4050                         196.770041
args_1                       195.220088
4100                         192.968505
4035                         186.359278
db02c830                     180.189819
4030                         173.707126
max_game_duration            173.672345
2010                         166.034482
587b5989                     165.656602
mean_game_round              162.261674
args_6                       156.680235


In [149]:
train.Clip.describe()

count    17690.000000
mean        37.454607
std         61.083332
min          0.000000
25%          9.000000
50%         19.000000
75%         42.000000
max        772.000000
Name: Clip, dtype: float64

In [150]:
test.Clip.describe()

count    1000.000000
mean       22.711000
std        35.490662
min         0.000000
25%         7.000000
50%        14.000000
75%        26.000000
max       466.000000
Name: Clip, dtype: float64

In [154]:
a = 0
for i in range(1000):
    a = a * 0.95 + 1
print(a)

19.99999999999995


In [155]:
a = 0
for i in range(10000):
    a = a * 0.95 + 1
print(a)

19.99999999999995


In [156]:
a = 0
for i in range(100):
    a = a * 0.95 + 1
print(a)

19.8815894155933
