In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV

from sklearn.cluster import KMeans

warnings.filterwarnings('ignore')




基本変数定義

In [2]:
sampling_flg=0 #サンプリング有無をコントロール
submit_flg=1 #保存するかをコントロール（サンプリングしない時のみ）

SEED=12345
sample_num=10000
fold_num=5

#train関連
train_dir='../../02_feature/101_train.csv'
train_drop_col=['ID_code', 'target']
train_label='target'

#test関連
test_dir='../../02_feature/101_test.csv'
test_drop_col=['ID_code']

#結果ファイル関連　nameは自分の名前に変更する
train_preds_dir='../../03_predict_train/oka_243_LightGBM_train.csv'
test_preds_dir='../../04_predict_test/oka_243_LightGBM_submission.csv'
save_col_name='oof_xgb'

sample_submission_dir='../../01_input/sample_submission.csv'
submission_target_col_name='target'
submission_id_col_name='ID_code'

前処理

In [3]:
#ロード
train_df=pd.read_csv(train_dir)
test_df=pd.read_csv(test_dir)

In [4]:
#サンプリング
if sampling_flg ==1:
    train_df=train_df.sample(n=sample_num,random_state=SEED)
    test_df=test_df.sample(n=sample_num,random_state=SEED)

In [5]:
#x,y作成
x_train=train_df.drop(train_drop_col,axis=1)
y_train=train_df[train_label]
x_test=test_df.drop(test_drop_col,axis=1)

モデル実行

In [6]:
#model_param
param = {
    "objective" : "binary", 
    "boost":"gbdt",
    "metric":"auc",
    "boost_from_average":"false",
    "num_threads":28,
    "learning_rate" : 0.01,
    "num_leaves" : 13,
    "max_depth":-1,
    "tree_learner" : "serial",
    "feature_fraction" : 0.05,
    "bagging_freq" : 5,
    "bagging_fraction" : 0.4,
    "min_data_in_leaf" : 80,
    "min_sum_hessian_in_leaf" : 10.0,
    "verbosity" : 1,
    'seed': 44000,
    }

In [7]:
clf=KMeans(n_clusters=2, init='k-means++', n_init=10, max_iter=300,
                               tol=0.0001,precompute_distances='auto', verbose=0,
                               random_state=11111, copy_x=True, n_jobs=1)
clf.fit(x_train)
pred=clf.predict(x_test)
type(pred)

numpy.ndarray

In [8]:
def create_kmean(in_trn_x,in_val_x,in_test_x):
    print(in_test_x.shape)
    new_trn=in_trn_x
    new_val=in_val_x
    new_test=in_test_x
    for i in range(2,10):
        print(i)
        clf=KMeans(n_clusters=i, init='k-means++', n_init=10, max_iter=300,
                               tol=0.0001,precompute_distances='auto', verbose=0,
                               random_state=11111, copy_x=True, n_jobs=1)
        clf.fit(in_trn_x)
        pred=clf.predict(in_trn_x)
        new_trn['kmeans_'+str(i)]=pred
        pred=clf.predict(in_val_x)
        new_val['kmeans_'+str(i)]=pred
        pred=clf.predict(in_test_x)
        new_test['kmeans_'+str(i)]=pred
        
    return new_trn,new_val,new_test

In [9]:
x_test.head

<bound method NDFrame.head of         Unnamed: 0    var_0    var_1    var_2    var_3    var_4    var_5  \
0                0  11.0656   7.7798  12.9536   9.4292  11.4327  -2.3805   
1                1   8.5304   1.2543  11.3047   5.1858   9.1974  -4.0117   
2                2   5.4827 -10.3581  10.1407   7.0479  10.2628   9.8052   
3                3   8.5374  -1.3222  12.0220   6.5749   8.8458   3.1744   
4                4  11.7058  -0.1327  14.1295   7.7506   9.1035  -8.5848   
5                5   5.9862  -2.2913   8.6058   7.0685  14.2465  -8.6761   
6                6   8.4624  -6.1065   7.3603   8.2627  12.0104  -7.2073   
7                7  17.3035  -2.4212  13.3989   8.3998  11.0777   9.6449   
8                8   6.9856   0.8402  13.7161   4.7749   8.6784 -13.7607   
9                9  10.3811  -6.9348  14.6690   9.0941  11.9058 -10.8018   
10              10   8.3431  -4.1427   9.1985   9.8229  11.2494   2.9678   
11              11  10.6137  -2.1898   8.9090   3.8014  13

In [10]:
folds = StratifiedKFold(n_splits=fold_num, shuffle=True, random_state=SEED)
oof_preds = np.zeros((len(x_train), 1))
test_preds = np.zeros((len(x_test), 1))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(x_train.values, y_train.values)):
    x_test=test_df.drop(test_drop_col,axis=1)
    print("\n")
    print("Fold {}".format(fold_))
    trn_x,trn_y = x_train.iloc[trn_idx], y_train.iloc[trn_idx]
    val_x,val_y = x_train.iloc[val_idx], y_train.iloc[val_idx]
    
    new_trn_x,new_val_x,new_x_test=create_kmean(trn_x,val_x,x_test)
    #print(new_trn_x)
    
    trn_data = lgb.Dataset(new_trn_x,trn_y)
    val_data = lgb.Dataset(new_val_x,val_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    val_pred = clf.predict(new_val_x, num_iteration=clf.best_iteration)
    test_pred = clf.predict(new_x_test, num_iteration=clf.best_iteration)
    
    print("AUC = {}".format(roc_auc_score(val_y, val_pred)))
    #print("val = {}".format(val_pred)
    oof_preds[val_idx, :] = val_pred.reshape((-1, 1))
    test_preds += test_pred.reshape((-1, 1))
    
test_preds /= fold_num
roc_score = roc_auc_score(y_train, oof_preds.ravel())
print("Overall AUC = {}".format(roc_score))



Fold 0
(200000, 201)
2
3
4
5
6
7
8
9
        Unnamed: 0    var_0   var_1    var_2   var_3    var_4    var_5  \
0                0   8.9255 -6.7863  11.9081  5.0930  11.4607  -9.2834   
1                1  11.5006 -4.1473  13.8588  5.3890  12.3622   7.0433   
2                2   8.6093 -2.7457  12.0805  7.8928  10.5825  -9.0837   
3                3  11.0604 -2.1518   8.9522  7.1957  12.5846  -1.8361   
4                4   9.8369 -1.4834  12.8746  6.6375  12.2772   2.4486   
5                5  11.4763 -2.3182  12.6080  8.6264  10.9621   3.5609   
6                6  11.8091 -0.0832   9.3494  4.2916  11.1355  -8.0198   
7                7  13.5580 -7.9881  13.8776  7.5985   8.6543   0.8310   
8                8  16.1071  2.4426  13.9307  5.6327   8.8014   6.1630   
9                9  12.5088  1.9743   8.8960  5.4508  13.6043 -16.2859   
10              10   5.0702 -0.5447   9.5900  4.2987  12.3910 -18.8687   
11              11  12.7188 -7.9750  10.3757  9.0101  12.8570 -12.0852   

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.903114	valid_1's auc: 0.883708
[2000]	training's auc: 0.913775	valid_1's auc: 0.891206
[3000]	training's auc: 0.921725	valid_1's auc: 0.895077
[4000]	training's auc: 0.92774	valid_1's auc: 0.897146
[5000]	training's auc: 0.932845	valid_1's auc: 0.898144
[6000]	training's auc: 0.937331	valid_1's auc: 0.898855
[7000]	training's auc: 0.94155	valid_1's auc: 0.899262
[8000]	training's auc: 0.945385	valid_1's auc: 0.899379
[9000]	training's auc: 0.949055	valid_1's auc: 0.899377
[10000]	training's auc: 0.952582	valid_1's auc: 0.89944
[11000]	training's auc: 0.95589	valid_1's auc: 0.899416
[12000]	training's auc: 0.959105	valid_1's auc: 0.899379
Early stopping, best iteration is:
[9628]	training's auc: 0.95132	valid_1's auc: 0.899486
AUC = 0.8994863859187153


Fold 1
(200000, 201)
2
3
4
5
6
7
8
9
        Unnamed: 0    var_0   var_1    var_2    var_3    var_4    var_5  \
0                0   8.9255 -6.7863 

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.903107	valid_1's auc: 0.882778
[2000]	training's auc: 0.91399	valid_1's auc: 0.890019
[3000]	training's auc: 0.921638	valid_1's auc: 0.894066
[4000]	training's auc: 0.927712	valid_1's auc: 0.896472
[5000]	training's auc: 0.932881	valid_1's auc: 0.897705
[6000]	training's auc: 0.937312	valid_1's auc: 0.898245
[7000]	training's auc: 0.941498	valid_1's auc: 0.898398
[8000]	training's auc: 0.945357	valid_1's auc: 0.89856
[9000]	training's auc: 0.949106	valid_1's auc: 0.898578
[10000]	training's auc: 0.952633	valid_1's auc: 0.898635
[11000]	training's auc: 0.956018	valid_1's auc: 0.898586
[12000]	training's auc: 0.959098	valid_1's auc: 0.898505
Early stopping, best iteration is:
[9584]	training's auc: 0.951217	valid_1's auc: 0.898693
AUC = 0.89869333331121


Fold 2
(200000, 201)
2
3
4
5
6
7
8
9
        Unnamed: 0    var_0   var_1    var_2    var_3    var_4    var_5  \
0                0   8.9255 -6.7863

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.90306	valid_1's auc: 0.882615
[2000]	training's auc: 0.913945	valid_1's auc: 0.889488
[3000]	training's auc: 0.921589	valid_1's auc: 0.893235
[4000]	training's auc: 0.927764	valid_1's auc: 0.895509
[5000]	training's auc: 0.932758	valid_1's auc: 0.896803
[6000]	training's auc: 0.937125	valid_1's auc: 0.897605
[7000]	training's auc: 0.941322	valid_1's auc: 0.897991
[8000]	training's auc: 0.945219	valid_1's auc: 0.898288
[9000]	training's auc: 0.948947	valid_1's auc: 0.898331
[10000]	training's auc: 0.952513	valid_1's auc: 0.898299
[11000]	training's auc: 0.955846	valid_1's auc: 0.898318
[12000]	training's auc: 0.959021	valid_1's auc: 0.898245
[13000]	training's auc: 0.962063	valid_1's auc: 0.89813
Early stopping, best iteration is:
[10853]	training's auc: 0.955367	valid_1's auc: 0.898392
AUC = 0.8983919549003179


Fold 3
(200000, 201)
2
3
4
5
6
7
8
9
        Unnamed: 0    var_0   var_1    var_2    va

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.902552	valid_1's auc: 0.88402
[2000]	training's auc: 0.913666	valid_1's auc: 0.890676
[3000]	training's auc: 0.921582	valid_1's auc: 0.894773
[4000]	training's auc: 0.927626	valid_1's auc: 0.896833
[5000]	training's auc: 0.932742	valid_1's auc: 0.898158
[6000]	training's auc: 0.937287	valid_1's auc: 0.898659
[7000]	training's auc: 0.941409	valid_1's auc: 0.899018
[8000]	training's auc: 0.945323	valid_1's auc: 0.89913
[9000]	training's auc: 0.948998	valid_1's auc: 0.899209
[10000]	training's auc: 0.952546	valid_1's auc: 0.899202
[11000]	training's auc: 0.95596	valid_1's auc: 0.899021
[12000]	training's auc: 0.959118	valid_1's auc: 0.898854
Early stopping, best iteration is:
[9326]	training's auc: 0.950205	valid_1's auc: 0.899245
AUC = 0.8992453439270747


Fold 4
(200000, 201)
2
3
4
5
6
7
8
9
        Unnamed: 0    var_0   var_1    var_2    var_3    var_4    var_5  \
1                1  11.5006 -4.147

Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.901427	valid_1's auc: 0.887172
[2000]	training's auc: 0.911923	valid_1's auc: 0.895363
[3000]	training's auc: 0.919907	valid_1's auc: 0.900313
[4000]	training's auc: 0.92612	valid_1's auc: 0.902664
[5000]	training's auc: 0.931411	valid_1's auc: 0.904368
[6000]	training's auc: 0.935938	valid_1's auc: 0.905346
[7000]	training's auc: 0.940299	valid_1's auc: 0.905639
[8000]	training's auc: 0.944246	valid_1's auc: 0.905822
[9000]	training's auc: 0.948016	valid_1's auc: 0.905934
[10000]	training's auc: 0.95159	valid_1's auc: 0.90582
[11000]	training's auc: 0.955014	valid_1's auc: 0.905783
Early stopping, best iteration is:
[8874]	training's auc: 0.947534	valid_1's auc: 0.905954
AUC = 0.905953924251689
Overall AUC = 0.9002894012687543


In [11]:
#結果保存
if (submit_flg ==1 and sampling_flg==0):
    series_oof_preds = pd.Series(data=oof_preds[:,0], name=save_col_name, dtype='float')
    series_oof_preds.to_csv(train_preds_dir,header=True, index=False)

    sample = pd.read_csv(sample_submission_dir)
    sample.target = test_preds[:,0].astype(float)
    sample.ID_code = test_df['ID_code']
    sample.to_csv(test_preds_dir, index=False)
    