In [1]:
import gc
import os
import logging
import datetime
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import ShuffleSplit

from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV

from sklearn.cluster import KMeans

warnings.filterwarnings('ignore')



基本変数定義

In [2]:
# --- Run switches -----------------------------------------------------------
sampling_flg = 0  # 1 = work on a random subsample, 0 = use the full data
submit_flg = 1    # 1 = write the submission file (only honoured when not sampling)

SEED = 12345
sample_num = 1000
fold_num = 5

# --- Training data ----------------------------------------------------------
train_dir = "../../01_input/train.csv"
train_drop_col = ["ID_code", "target"]
train_label = "target"

# --- Test data --------------------------------------------------------------
test_dir = "../../01_input/test.csv"
test_drop_col = ["ID_code"]

# --- Output files (replace "name" in the file names with your own name) ------
train_preds_dir = "../../03_predict_train/name_200_LightGBM_train.csv"
test_preds_dir = "../../04_predict_test/name_200_LightGBM_submission.csv"
# NOTE(review): the value says "xgb" although the output paths say LightGBM;
# it is not referenced by the cells visible here -- confirm before relying on it.
save_col_name = "oof_xgb"

# --- Submission template ----------------------------------------------------
sample_submission_dir = "../../01_input/sample_submission.csv"
submission_target_col_name = "target"
submission_id_col_name = "ID_code"

前処理

In [3]:
# Load the raw train / test tables from the paths set in the config cell.
train_df = pd.read_csv(filepath_or_buffer=train_dir)
test_df = pd.read_csv(filepath_or_buffer=test_dir)

In [4]:
# Optional down-sampling for quick experiments (active when sampling_flg == 1).
if sampling_flg == 1:
    # drop=True discards the old row labels. A plain reset_index() would keep
    # them as an extra 'index' column, and since only the ID / label columns
    # are dropped when x_train / x_test are built, that column would be fed
    # to the models as a spurious feature.
    train_df = train_df.sample(n=sample_num, random_state=SEED).reset_index(drop=True)
    test_df = test_df.sample(n=sample_num, random_state=SEED).reset_index(drop=True)

In [5]:
# Split the loaded frames into label (y_train) and feature matrices
# (x_train / x_test) by removing the configured non-feature columns.
y_train = train_df[train_label]
x_train = train_df.drop(labels=train_drop_col, axis=1)
x_test = test_df.drop(labels=test_drop_col, axis=1)

In [6]:
# Column labels of x_train as it stands at this point in the notebook.
predict_col=x_train.columns

モデル実行

In [None]:
# Cluster the training rows with k-means and attach the cluster id of every
# train / test row as a new 'kmeans' column.
#
# * `precompute_distances` and `n_jobs` are no longer passed: the values used
#   before ('auto' and 1) were the library defaults, and current scikit-learn
#   releases reject both keywords.
# * Fitting / predicting is restricted to `predict_col` (the feature columns
#   captured before any column was added), so re-running this cell does not
#   cluster on a previously added 'kmeans' or 'target' column.
clf = KMeans(n_clusters=20, init='k-means++', n_init=100, max_iter=10000,
             tol=0.0001, verbose=0, random_state=11111, copy_x=True)
clf.fit(x_train[predict_col])
x_test['kmeans'] = clf.predict(x_test[predict_col])
x_train['kmeans'] = clf.predict(x_train[predict_col])

各クラスタの状況確認

In [None]:
# Side-by-side table of cluster id and label, limited to the first 200000 rows.
x_check = pd.concat(
    [x_train['kmeans'], train_df['target']],
    axis=1,
).head(200000)

In [None]:
# Mean of the non-key column(s) of x_check for each 'kmeans' cluster id.
x_check.groupby(['kmeans']).mean()

In [None]:
def lgbm(train_x, train_y, test_x, test_y, seed=None):
    """Train a LightGBM binary classifier with early stopping and report AUC.

    Parameters
    ----------
    train_x, train_y
        Features and labels the booster is fitted on.
    test_x, test_y
        Features and labels used as the early-stopping validation set and
        for the AUC that is printed and returned.
    seed : int, optional
        Value for LightGBM's ``seed`` parameter. When omitted, the
        notebook-wide ``SEED`` constant is used, so the function no longer
        depends on a loop variable that happens to exist at module level.

    Returns
    -------
    tuple
        ``(auc_score, clf)`` -- the validation AUC and the trained booster.
    """
    if seed is None:
        seed = SEED

    # model_param
    param = {
        "objective" : "binary",
        "boost":"gbdt",
        "metric":"auc",
        "boost_from_average":"false",
        "num_threads":28,
        "learning_rate" : 0.01,
        "num_leaves" : 13,
        "max_depth":-1,
        "tree_learner" : "serial",
        "feature_fraction" : 0.05,
        "bagging_freq" : 5,
        "bagging_fraction" : 0.4,
        "min_data_in_leaf" : 80,
        "min_sum_hessian_in_leaf" : 10.0,
        "verbosity" : 1,
        'seed': seed,
        }

    trn_data = lgb.Dataset(train_x, train_y)
    val_data = lgb.Dataset(test_x, test_y)

    # Upper bound on boosting rounds; early stopping is expected to end
    # training long before this is reached.
    num_round = 1000000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=1000, early_stopping_rounds = 3000)

    # Score the validation rows at the best iteration found by early stopping.
    val_pred = clf.predict(test_x, num_iteration=clf.best_iteration)
    auc_score = roc_auc_score(test_y, val_pred)
    print("")
    print("AUC = {}".format(auc_score))

    return auc_score, clf

In [None]:
def best_clf(pred_x, pred_y, other_x, other_y, n_trials=5, neg_frac=0.9):
    """Train several models on ``other_*`` and keep the one scoring best on ``pred_*``.

    In every round a random ``neg_frac`` share of the rows labelled 0 is
    drawn from ``other_x`` (with the round number as ``random_state``),
    combined with all rows labelled 1, and handed to ``lgbm`` together with
    ``pred_x`` / ``pred_y`` as the validation set.

    Parameters
    ----------
    pred_x, pred_y
        Features / labels of the rows the model will be scored on.
    other_x, other_y
        Features / labels of the rows used for training. ``other_x`` is not
        modified.
    n_trials : int, default 5
        Number of sampling rounds (one model per round).
    neg_frac : float, default 0.9
        Share of label-0 rows kept in each round.

    Returns
    -------
    The model returned by ``lgbm`` for the round with the highest score.

    Raises
    ------
    ValueError
        If ``n_trials`` is smaller than 1, because no model would be trained.
    """
    if n_trials < 1:
        raise ValueError("n_trials must be at least 1")

    print("")
    print('pred_num='+str(len(pred_x)))
    print('other_num='+str(len(other_x)))
    print("")

    # Work on a labelled copy so the caller's frame does not gain a 'target'
    # column; the class split does not change between rounds, so do it once.
    labelled = other_x.assign(target=other_y)
    other_x_1 = labelled[labelled['target'] == 1]
    other_x_0 = labelled[labelled['target'] == 0]
    n_keep_0 = int(len(other_x_0) * neg_frac)

    max_score = float('-inf')
    pred_model = None
    best_i = 0
    for i in range(n_trials):
        other_sample_x_0 = other_x_0.sample(n=n_keep_0, random_state=i)
        sample_x_concat = pd.concat([other_sample_x_0, other_x_1])

        other_sample_y = sample_x_concat['target']
        other_sample_x = sample_x_concat.drop(['target'], axis=1)

        print("")
        print("0:"+str(len(other_sample_x_0)))
        print("1:"+str(len(other_x_1)))
        print('sample_num='+str(len(other_sample_x)))
        score, model = lgbm(other_sample_x, other_sample_y, pred_x, pred_y)
        # The first round always seeds the result so a model is returned even
        # when no later score compares greater.
        if pred_model is None or max_score < score:
            max_score = score
            pred_model = model
            best_i = i
    print("")
    # Report the round that actually won, not the last loop index.
    print('Best Model:'+str(best_i)+" Best Score:"+str(max_score))
    return pred_model

In [None]:
# Manual split for cluster 0: rows inside the cluster (pred_*) versus all
# remaining rows (other_*). The labels are taken from y_train through a
# boolean mask, so the label column is never written into the feature frame
# x_train.
cluster0_mask = (x_train['kmeans'] == 0).values
# errors='ignore': tolerate an x_train that already carries a 'target'
# column from an earlier run of the notebook.
cluster0_features = x_train.drop('target', axis=1, errors='ignore')
pred_x = cluster0_features[cluster0_mask]
other_x = cluster0_features[~cluster0_mask]
pred_y = y_train[cluster0_mask]
other_y = y_train[~cluster0_mask]

In [None]:
# For every k-means cluster: train on the rows outside the cluster, pick the
# best model via best_clf, and predict the rows inside the cluster for both
# the train and the test set.
y_test = np.zeros((len(x_test), 1))
y_train_preds = np.zeros((len(x_train), 1))

# Feature frame without the label column (errors='ignore' tolerates an
# x_train that already carries 'target' from an earlier run).
train_features = x_train.drop('target', axis=1, errors='ignore')

# Iterate over every cluster id present in the training data rather than a
# fixed range, so no cluster is left with all-zero predictions. Test rows
# whose cluster id never occurs in the training data keep a prediction of 0.
# Keep the loop variable named `i`: any helper that still reads a
# module-level `i` depends on it.
for i in sorted(int(c) for c in x_train['kmeans'].unique()):
    print("")
    print("kmeans:"+str(i))

    # Boolean position masks: safe for any DataFrame index, unlike using the
    # index labels as numpy row positions.
    in_cluster_train = (train_features['kmeans'] == i).values
    in_cluster_test = (x_test['kmeans'] == i).values

    pred_x = train_features[in_cluster_train]
    other_x = train_features[~in_cluster_train]
    pred_y = y_train[in_cluster_train]
    other_y = y_train[~in_cluster_train]

    best_model = best_clf(pred_x, pred_y, other_x, other_y)

    y_train_preds[in_cluster_train, 0] = best_model.predict(pred_x)
    # Skip prediction when no test row fell into this cluster.
    if in_cluster_test.any():
        y_test[in_cluster_test, 0] = best_model.predict(x_test[in_cluster_test])

print("Over All AUC = {}".format(roc_auc_score(y_train, y_train_preds)))
    

In [None]:
# Display the array of test-set predictions.
y_test

In [None]:
# Save the submission file (only for a full, non-sampled run).
if (submit_flg == 1 and sampling_flg == 0):
    sample = pd.read_csv(sample_submission_dir)
    # Item assignment with the configured column names: unlike attribute
    # assignment it always writes a real column, even if the template is
    # missing it. np.ravel flattens the (n, 1) prediction array to one value
    # per row.
    sample[submission_target_col_name] = np.ravel(y_test)
    sample[submission_id_col_name] = test_df[submission_id_col_name]
    sample.to_csv(test_preds_dir, index=False)
    