In [None]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV

from sklearn.cluster import KMeans

warnings.filterwarnings('ignore')


基本変数定義

In [None]:
sampling_flg=0 #サンプリング有無をコントロール
submit_flg=1 #保存するかをコントロール（サンプリングしない時のみ）

SEED=12345
sample_num=1000
fold_num=5

#train関連
train_dir='../../01_input/train.csv'
train_drop_col=['ID_code', 'target']
train_label='target'

#test関連
test_dir='../../01_input/test.csv'
test_drop_col=['ID_code']

#結果ファイル関連　nameは自分の名前に変更する
train_preds_dir='../../03_predict_train/name_200_LightGBM_train.csv'
test_preds_dir='../../04_predict_test/name_200_LightGBM_submission.csv'
save_col_name='oof_xgb'

sample_submission_dir='../../01_input/sample_submission.csv'
submission_target_col_name='target'
submission_id_col_name='ID_code'

前処理

In [None]:
#ロード
train_df=pd.read_csv(train_dir)
test_df=pd.read_csv(test_dir)

In [None]:
#サンプリング
if sampling_flg ==1:
    train_df=train_df.sample(n=sample_num,random_state=SEED)
    test_df=test_df.sample(n=sample_num,random_state=SEED)
    train_df=train_df.reset_index()
    test_df=test_df.reset_index()

In [None]:
#x,y作成
x_train=train_df.drop(train_drop_col,axis=1)
y_train=train_df[train_label]
x_test=test_df.drop(test_drop_col,axis=1)

In [None]:
predict_col=x_train.columns

モデル実行

In [None]:
clf=KMeans(n_clusters=10, init='k-means++', n_init=10, max_iter=300,
                               tol=0.0001,precompute_distances='auto', verbose=0,
                               random_state=11111, copy_x=True, n_jobs=1)
clf.fit(x_train)
pred=clf.predict(x_test)
x_test['kmeans']=pred
pred=clf.predict(x_train)
x_train['kmeans']=pred

各クラスタの状況確認

In [None]:
x_check=x_train['kmeans']
x_check['target']=y_train


In [9]:
def lgbm(train_x,train_y,test_x,test_y):
    #model_param
    param = {
        "objective" : "binary", 
        "boost":"gbdt",
        "metric":"auc",
        "boost_from_average":"false",
        "num_threads":28,
        "learning_rate" : 0.01,
        "num_leaves" : 13,
        "max_depth":-1,
        "tree_learner" : "serial",
        "feature_fraction" : 0.05,
        "bagging_freq" : 5,
        "bagging_fraction" : 0.4,
        "min_data_in_leaf" : 80,
        "min_sum_hessian_in_leaf" : 10.0,
        "verbosity" : 1,
        'seed': i,
        }

    trn_data = lgb.Dataset(train_x,train_y)
    val_data = lgb.Dataset(test_x,test_y)
    
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)
    
    val_pred = clf.predict(test_x, num_iteration=clf.best_iteration)
    auc_score=roc_auc_score(test_y, val_pred)
    print("")
    print("AUC = {}".format(auc_score))
    
    return auc_score,clf

In [10]:
def best_clf(pred_x,pred_y,other_x,other_y):
    print("")
    print('pred_num='+str(len(pred_x)))
    print('other_num='+str(len(other_x)))
    max_score=0
    best_i=0
    print("")
    for i in range(0,5):
        other_x['target']=other_y
        other_x_1=other_x[other_x['target']==1]
        other_x_0=other_x[other_x['target']==0]
        other_sample_x_0=other_x_0.sample(n=int(len(other_x_0)*0.9),random_state=i)
        sample_x_concat=pd.concat([other_sample_x_0,other_x_1])
        
        other_sample_y=sample_x_concat['target']
        other_sample_x=sample_x_concat.drop(['target'],axis=1)
        
        print("")
        print("0:"+str(len(other_sample_x_0)))
        print("1:"+str(len(other_x_1)))
        print('sample_num='+str(len(other_sample_x)))
        score,model=lgbm(other_sample_x,other_sample_y,pred_x,pred_y)
        if max_score < score :
            max_score=score
            pred_model=model
            best_i=i
    print("")
    print('Best Model:'+str(i)+" Best Score:"+str(max_score))
    return pred_model

In [11]:
x_train['target']=y_train
pred_x=x_train.query('kmeans == 0')
other_x=x_train.query('not kmeans == 0')
pred_y=pred_x['target']
other_y=other_x['target']
pred_x=pred_x.drop('target',axis=1)
other_x=other_x.drop('target',axis=1)

In [None]:
y_test = np.zeros((len(x_test), 1))
y_train_preds = np.zeros((len(x_train), 1))
for i in range(0,10):
    print("")
    print("kmeans:"+str(i))
    x_train['target']=y_train
    pred_x=x_train.query('kmeans =='+str(i))
    other_x=x_train.query('not kmeans =='+str(i))
    pred_y=pred_x['target']
    other_y=other_x['target']
    pred_x=pred_x.drop('target',axis=1)
    other_x=other_x.drop('target',axis=1)
    
    best_model=best_clf(pred_x,pred_y,other_x,other_y)
    
    x_test_pred=x_test[x_test['kmeans']==i]
    
    y_test_pred=best_model.predict(x_test_pred)
    y_train_pred=best_model.predict(pred_x)
    
    x_test_pred_index=x_test_pred.index
    x_train_pred_index=pred_x.index
    
    y_test[x_test_pred_index, :]=y_test_pred.reshape((-1, 1))
    y_train_preds[x_train_pred_index, :]=y_train_pred.reshape((-1, 1))

print("Over All AUC = {}".format(roc_auc_score(y_train, y_train_preds)))
    


kmeans:0

pred_num=20763
other_num=179237


0:145259
1:17838
sample_num=163097
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.900043	valid_1's auc: 0.881877
[2000]	training's auc: 0.911375	valid_1's auc: 0.890559
[3000]	training's auc: 0.919272	valid_1's auc: 0.895603
[4000]	training's auc: 0.925117	valid_1's auc: 0.898303
[5000]	training's auc: 0.930205	valid_1's auc: 0.900126
[6000]	training's auc: 0.934552	valid_1's auc: 0.901154
[7000]	training's auc: 0.938476	valid_1's auc: 0.901554
[8000]	training's auc: 0.942221	valid_1's auc: 0.901723
[9000]	training's auc: 0.945865	valid_1's auc: 0.901856
[10000]	training's auc: 0.949324	valid_1's auc: 0.901915
[11000]	training's auc: 0.95259	valid_1's auc: 0.90194
[12000]	training's auc: 0.955718	valid_1's auc: 0.901912
[13000]	training's auc: 0.95875	valid_1's auc: 0.901787
[14000]	training's auc: 0.961633	valid_1's auc: 0.90179
Early stopping, best iteration is:
[11221]	training's auc: 0.953294	val

[7000]	training's auc: 0.938916	valid_1's auc: 0.897014
[8000]	training's auc: 0.942706	valid_1's auc: 0.897291
[9000]	training's auc: 0.946282	valid_1's auc: 0.897339
[10000]	training's auc: 0.949701	valid_1's auc: 0.897485
[11000]	training's auc: 0.952947	valid_1's auc: 0.897412
[12000]	training's auc: 0.956047	valid_1's auc: 0.897188
Early stopping, best iteration is:
[9762]	training's auc: 0.948901	valid_1's auc: 0.897532

AUC = 0.8975322882587885

0:146794
1:17827
sample_num=164621
Training until validation scores don't improve for 3000 rounds.
[1000]	training's auc: 0.900527	valid_1's auc: 0.88079
[2000]	training's auc: 0.911921	valid_1's auc: 0.888593
[3000]	training's auc: 0.919629	valid_1's auc: 0.892766
[4000]	training's auc: 0.925521	valid_1's auc: 0.895025
[5000]	training's auc: 0.930522	valid_1's auc: 0.896236
[6000]	training's auc: 0.934936	valid_1's auc: 0.896953
[7000]	training's auc: 0.938924	valid_1's auc: 0.897229
[8000]	training's auc: 0.94268	valid_1's auc: 0.89737

In [None]:
y_test

In [None]:
#結果保存
if (submit_flg ==1 and sampling_flg==0):
    sample = pd.read_csv(sample_submission_dir)
    sample.target = y_test
    sample.ID_code = test_df['ID_code']
    sample.to_csv(test_preds_dir, index=False)
    