In [20]:
# 라이브러리 

import numpy as np
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm
from collections import Counter
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.model_selection import train_test_split


import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)

In [4]:
# 데이터 불러오기
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [22]:
# train data의 feature 변수와 target 변수 분리
X=train.drop(['target','ID_code'],axis=1)
y=train['target']

In [23]:
train_set=train.drop(['ID_code'],axis=1)
test_set=test.drop(['ID_code'],axis=1)

In [24]:
#feature selection을 위해 target class 0,1의 분포가 동일한지 KS TEST을 진행
train_zero=train_set[train_set['target']==0]
train_one=train_set[train_set['target']==1]

In [25]:
from scipy.stats import ks_2samp
result=pd.DataFrame(columns=['stat','pvalue'])
for i in range(1,train_set.shape[1]):
    ks_result=list(ks_2samp(train_zero.iloc[:,i],train_one.iloc[:,i]))
    result.loc[i-1]=ks_result

In [26]:
result.head()

Unnamed: 0,stat,pvalue
0,0.081059,4.17806e-83
1,0.060928,4.1159960000000003e-47
2,0.086265,4.4020889999999997e-94
3,0.015641,0.001672915
4,0.016583,0.0006946824


In [27]:
#p-value가 0.05보다 큰 것들 삭제
idx=result[result['pvalue']>0.05].index
#train data는 target 열이 있으니까 index가 하나씩 밀려서 
idx=idx+1
ks_train_set=train_set.drop(train_set.columns[idx],axis=1)

In [28]:
ks_train_set.head()

Unnamed: 0,target,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_8,var_9,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,0,15.414,-2.1016,10.4773,4.8941,12.6506,-3.7205,5.1426,4.2444,8.6555,...,-2.881,8.1647,1.0927,2.1215,17.6536,3.2253,-2.1234,8.9516,13.3485,-16.0178
1,0,12.3576,-8.1666,11.7785,2.8869,12.3183,-6.9847,4.2671,3.0662,4.8252,...,0.2397,8.1569,-1.0753,5.4679,23.6376,-0.5022,9.2414,8.2427,10.7546,-3.4394
2,0,9.4142,-8.6132,7.2196,3.2496,10.655,-3.3245,5.101,1.8721,9.5959,...,8.1638,9.2399,1.016,7.4548,17.0933,0.0715,-4.0455,9.4586,17.8789,-13.9784
3,0,13.0647,-0.7917,13.027,8.7865,10.2252,-2.9311,6.7299,-2.1274,8.6047,...,9.3914,7.5576,0.4784,1.2138,19.0498,-1.5634,7.3092,8.4344,18.1104,-7.7668
4,0,9.5222,-0.2727,8.2173,8.4071,12.7732,-10.3113,4.7486,-1.1586,6.3487,...,8.4802,0.9951,3.9973,1.937,24.4786,-2.0294,-0.5454,8.7461,21.3832,14.1786


In [29]:
ks_feature=ks_train_set.drop(['target'],axis=1)

In [30]:
#test set까지 train에서 제거했던 변수 똑같이 제거 (전처리)
ks_test_set=test_set.drop(train_set.columns[idx],axis=1)
ks_test_set.head()

Unnamed: 0,var_0,var_1,var_2,var_3,var_4,var_5,var_6,var_8,var_9,var_11,...,var_190,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199
0,8.7757,6.7905,9.292,4.8443,10.8877,3.5858,4.7267,1.9131,7.1682,0.3893,...,4.8996,4.496,2.0201,5.0578,18.7864,-0.255,4.0958,7.9291,17.6055,-7.4019
1,7.5491,-8.4764,15.4626,3.6803,12.1527,-7.1858,5.1952,2.5632,8.1126,-5.0243,...,10.2822,11.9053,3.0869,-0.6879,22.3453,-2.1595,4.1147,9.2685,12.2069,1.3086
2,5.8385,-6.9177,8.1589,6.7591,9.0635,4.9003,4.7126,-3.5385,8.3261,0.436,...,0.0732,10.2627,-0.3105,6.8481,15.2119,-0.505,1.8873,8.6977,18.6569,11.5001
3,15.6901,-7.7904,8.3676,6.3246,9.8144,6.9361,4.5284,3.3775,8.8897,-6.0086,...,11.8331,4.3828,2.973,2.4989,18.6336,1.0621,-3.8953,8.795,12.9313,-3.1393
4,15.5726,-6.6387,12.0251,7.2093,10.3838,-8.9961,4.8868,-5.2734,7.3476,0.1738,...,7.9042,10.4118,3.3438,0.4358,16.0511,0.7758,-3.3334,7.7918,18.2619,-3.2074


In [31]:
# 클래스 불균형을 해소하기 위한 oversampling (1:0.35)
from imblearn.over_sampling import RandomOverSampler

oversample = RandomOverSampler(sampling_strategy=0.35,random_state= 42)
X_over, y_over = oversample.fit_resample(ks_feature, ks_train_set['target'])

# 샘플링 후 클래스 0과 클래스 1의 갯수 확인 

print(Counter(y_over))

Counter({0: 143922, 1: 50372})


In [32]:
# train - test로 데이터 split

from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(X_over, y_over, test_size=0.2, random_state=42)

In [38]:
# RandomSearchCV()를 통해 최적 파라미터 설정 후 모델 생성

clf = lgb.LGBMClassifier(bagging_fraction=0.85, bagging_freq=1, boost='gbdt',
               boosting_type='gbdt', class_weight=None, colsample_bytree=1,
               feature_fraction=1, gamma=1, importance_type='split',
               learning_rate=0.1, max_bin=256, max_depth=1,
               min_child_samples=153, min_child_weight=0.1, min_split_gain=0.0,
               n_estimators=4000, n_jobs=-1, num_leaves=4, num_threads=8,
               objective='binary', random_state=1, reg_alpha=0.1,
               reg_lambda=0, seed=500, silent=True, subsample=0.8,
               subsample_for_bin=200000, subsample_freq=0,boost_from_average = False)

#모델 적합

clf.fit(train_x, train_y)
pred = clf.predict(test_x)


# validation set으로 계산한 accuracy와 f1 score

print(clf.score(train_x, train_y))
print(f1_score(test_y, pred, average='macro'))

0.8649338951973494
0.801773348409434


In [35]:
#prediction code에서 변수 선택한 데이터를 불러오기 위해 따로 csv 파일로 저장
ks_train_set.to_csv('ks_train_set.csv', index = False)

In [36]:
ks_feature.to_csv('ks_feature.csv', index = False)

In [37]:
ks_test_set.to_csv('ks_test_set.csv', index = False)