In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import os, glob, warnings
from itertools import combinations
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score
from sklearn.feature_selection import VarianceThreshold
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
warnings.filterwarnings('ignore')
sns.set(style = 'whitegrid')
pd.set_option('display.max_columns', 50)
np.random.seed(2018)

In [2]:
def load_data(train_path = 'train.csv', test_path = 'test.csv'):
    """Read the train/test CSV files and apply the shared basic cleaning.

    Parameters
    ----------
    train_path, test_path : str
        Locations of the two CSV files. The defaults are the file names the
        notebook has always used, so ``load_data()`` behaves as before.

    Returns
    -------
    (trn, tst) : tuple of pandas.DataFrame
        Both frames with the columns 'FLAG_MOBIL', 'index' and 'child_num'
        removed and missing 'occyp_type' values replaced by the string 'BLANK'.
    """
    # Columns that are not used as features.
    drop_cols = ['FLAG_MOBIL', 'index', 'child_num']

    cleaned = []
    for path in (train_path, test_path):
        frame = pd.read_csv(path).drop(columns = drop_cols)
        # Assign the filled column back explicitly: `frame[col].fillna(..., inplace=True)`
        # operates on a temporary under pandas Copy-on-Write and would leave the NaNs in place.
        frame['occyp_type'] = frame['occyp_type'].fillna('BLANK')
        cleaned.append(frame)

    trn, tst = cleaned
    return trn, tst

def freq_encoding(df, col, normalize = True):
    """Add a frequency-encoded copy of `col` to `df` as the column `<col>_FE`.

    Each value is replaced by how often it occurs in `df[col]`: a relative
    frequency when `normalize` is True, otherwise a raw count. The frame is
    modified in place and the same object is returned.
    """
    frequency_by_value = df[col].value_counts(normalize = normalize).to_dict()
    df[col + '_FE'] = df[col].map(frequency_by_value)
    return df

# Keyword arguments for XGBClassifier (unpacked as XGBClassifier(**xgb_params)).
xgb_params = {
    # Booster type and device selection (the gpu_* values need a GPU-enabled XGBoost build).
    'booster': 'gbtree',
    'tree_method': 'gpu_hist',
    'predictor': 'gpu_predictor',

    # Multi-class probabilities, evaluated with multi-class log loss.
    'objective': 'multi:softprob',
    'eval_metric': 'mlogloss',

    # Upper bound on boosting rounds; train_model stops earlier via early stopping.
    'n_estimators': 5000,

    # Tree shape and step size.
    'max_depth': 9,
    'min_child_weight': 5,
    'learning_rate': 0.012727,

    # Row / column subsampling.
    'subsample': 0.91020,
    'colsample_bytree': 0.77959,
    'colsample_bylevel': 0.64898,

    # Regularisation strengths.
    'lambda': 0.05,
    'alpha': 1,

    'seed': 2018,
}

# Keyword arguments for LGBMClassifier (unpacked as LGBMClassifier(**lgbm_params)).
lgbm_params = {
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    # LightGBM's parameter is `metric`; `eval_metric` is only an argument of the
    # sklearn wrapper's fit() and is ignored when given to the constructor.
    # 'multi_logloss' is the metric the training log already reports.
    'metric': 'multi_logloss',
    # Upper bound on boosting rounds; early stopping decides the actual count.
    'n_estimators': 10000,
    # NOTE: train_model also passes early_stopping_rounds=30 to fit(), but the
    # recorded LightGBM log reports a patience of 100 rounds, i.e. this value.
    'early_stopping_round': 100,
    'max_depth': -1,
    'max_bin': 255,
    'boost_from_average': False,
    'bagging_freq': 1,
    'min_data_in_leaf': 40,
    'learning_rate': 0.02272,
    'num_leaves': 64,
    'feature_fraction': 0.89387,
    'bagging_fraction': 0.76326,
    'seed': 2018,
    'verbose': -1,
    'n_jobs': -1,
}

def train_model(model, trn, tst, cv = 5):
    """Fit `model` with stratified K-fold cross-validation and predict the test set per fold.

    Parameters
    ----------
    model : estimator
        Must accept ``fit(X, y, eval_set=..., verbose=..., early_stopping_rounds=...)``
        and expose ``predict_proba`` and ``feature_importances_``. The same object
        is re-fitted on every fold.
    trn : pandas.DataFrame
        Training rows: the feature columns plus the target column 'credit'.
    tst : pandas.DataFrame
        Test rows; its columns are used to label the feature importances, so they
        are expected to match the training features (trn without 'credit').
    cv : int
        Number of stratified folds.

    Returns
    -------
    tst_preds : list
        One ``predict_proba(tst)`` result per fold.
    feats_importance : pandas.Series
        ``feature_importances_`` averaged over the folds, indexed by ``tst.columns``.
    """
    # Split features and target once instead of on every fold.
    X = trn.drop('credit', axis = 1)
    y = trn['credit'].values

    tst_preds = []
    vld_losses = []
    feats_importance = np.zeros(tst.shape[1])

    # Stratification only needs y; a placeholder X avoids materialising the
    # whole (potentially very wide) feature matrix just to count rows.
    folds = StratifiedKFold(cv).split(np.zeros(len(y)), y)
    for n, (trn_idx, vld_idx) in enumerate(folds):
        print(f"{n+1}/{cv}번째 폴드 시작..........")
        # The splitter yields positions, so index positionally (.iloc); label-based
        # .loc is only equivalent when trn happens to carry a 0..n-1 index.
        X_trn, y_trn = X.iloc[trn_idx], y[trn_idx]
        X_vld, y_vld = X.iloc[vld_idx], y[vld_idx]

        model.fit(
            X_trn, y_trn,
            eval_set = [(X_trn, y_trn), (X_vld, y_vld)],
            verbose = 500, early_stopping_rounds = 30
        )
        vld_losses.append(log_loss(y_vld, model.predict_proba(X_vld)))

        tst_preds.append(model.predict_proba(tst))
        feats_importance += model.feature_importances_

    feats_importance = pd.Series(data = feats_importance / cv, index = tst.columns)

    # Report the real fold count; the message used to say "5" whatever cv was.
    print(f'{cv}폴더 평균 mlogloss: ', np.mean(vld_losses))
    return tst_preds, feats_importance

In [3]:
trn, tst = load_data()

# Raw feature names (everything but the target), captured before any engineered column exists.
features = trn.drop('credit', axis = 1).columns
# Categorical columns that are frequency/label encoded and used as group keys,
# but that do not take part in the pairwise combinations below.
sole_cols = ['gender', 'phone', 'work_phone', 'email', 'house_type']
# Columns treated as categorical; the second half is created further down.
cat_cols = ['car', 'reality', 'income_type', 'edu_type', 'family_type', 'occyp_type',
            'YEARS_EMPLOYED', 'YEARS_BIRTH',
            'MONTHS_EMPLOYED', 'MONTHS_BIRTH',
            'WEEKS_EMPLOYED', 'WEEKS_BIRTH',
            'income_per_family', 'EMPLOYED_RATIO',
            'income_per_days_birth',
            'income_per_weeks_birth',
            'income_per_years_birth',
           ]

# Train and test are stacked so every encoding/statistic is computed over both.
df = pd.concat([trn, tst]).reset_index(drop = True)

# --- Numeric preprocessing -------------------------------------------------
# 365243 is treated as a sentinel in DAYS_EMPLOYED and replaced by the mean of
# the remaining rows. Cast to float first so that writing the (fractional) mean
# into the column is an explicit, well-defined operation.
df['DAYS_EMPLOYED'] = df['DAYS_EMPLOYED'].astype('float64')
employed_is_sentinel = df['DAYS_EMPLOYED'] == 365243
df.loc[employed_is_sentinel, 'DAYS_EMPLOYED'] = df.loc[~employed_is_sentinel, 'DAYS_EMPLOYED'].mean()

# Day counts are flipped to non-negative magnitudes.
for days_col in ('DAYS_EMPLOYED', 'DAYS_BIRTH'):
    df[days_col] = df[days_col].abs()

# Whole years / months / weeks, truncated exactly like int(x / unit).
for unit_name, days_per_unit in (('YEARS', 365), ('MONTHS', 30), ('WEEKS', 7)):
    for base in ('EMPLOYED', 'BIRTH'):
        df[unit_name + '_' + base] = (df['DAYS_' + base] / days_per_unit).astype('int64')

df['EMPLOYED_RATIO'] = df['DAYS_EMPLOYED'] / df['DAYS_BIRTH']
df['income_per_family'] = df['income_total'] / df['family_size']
df['income_per_days_birth'] = df['income_total'] / df['DAYS_BIRTH']
df['income_per_weeks_birth'] = df['income_total'] / df['WEEKS_BIRTH']
df['income_per_years_birth'] = df['income_total'] / df['YEARS_BIRTH']

# Raw features that are not treated as categorical.
num_cols = [col for col in features if col not in cat_cols + sole_cols]

# Categorical columns become strings and get a '<col>_FE' frequency feature.
for col in sole_cols + cat_cols:
    df[col] = df[col].astype(str)
    df = freq_encoding(df, col)

# --- Combination features --------------------------------------------------
comb_num = 2
# The combinations are materialised up front because cat_cols grows inside the loop.
for combo in list(combinations(cat_cols, comb_num)):
    new_col = "_".join(combo)
    combined = df[combo[0]].astype(str)
    for part in combo[1:]:
        combined = combined + "_" + df[part].astype(str)
    df[new_col] = combined
    cat_cols.append(new_col)

# --- Per-category statistics of the numeric columns ------------------------
def _group_percentile(q):
    """Return an aggregator computing the q-th percentile within each group."""
    return lambda grouped: grouped.agg(lambda x: np.percentile(x, q))

# (name infix, aggregator) in output-column order. The infixes from '#median'
# onwards have no trailing '#'; that is kept as is so that column names stay
# identical to the ones produced so far (e.g. in an already saved data.pkl).
group_stats = [
    ("#mean#", lambda grouped: grouped.mean()),
    ("#std#", lambda grouped: grouped.std(ddof = 1)),
    ("#var#", lambda grouped: grouped.var(ddof = 1)),
    ("#max#", lambda grouped: grouped.max()),
    ("#min#", lambda grouped: grouped.min()),
    ("#ptp#", lambda grouped: grouped.agg(np.ptp)),
    ("#median", lambda grouped: grouped.median()),
    ("#skew", lambda grouped: grouped.skew()),
    ("#percentile_10", _group_percentile(10)),
    ("#percentile_60", _group_percentile(60)),
    ("#percentile_90", _group_percentile(90)),
]

for cat_col in sole_cols + cat_cols:
    for num_col in num_cols:
        # Build the group-by once per (category, numeric) pair and reuse it for
        # all statistics instead of re-grouping for every single one.
        grouped = df.groupby(cat_col)[num_col]
        for infix, stat in group_stats:
            df[cat_col + infix + num_col] = df[cat_col].map(stat(grouped))

# --- Label encoding --------------------------------------------------------
le_dict = {}  # fitted LabelEncoder per categorical column
for col in sole_cols + cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].values)
    le_dict[col] = le

print(df.shape)
df.head(2)

(36457, 8876)


Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,occyp_type,family_size,begin_month,credit,YEARS_EMPLOYED,YEARS_BIRTH,MONTHS_EMPLOYED,MONTHS_BIRTH,WEEKS_EMPLOYED,WEEKS_BIRTH,EMPLOYED_RATIO,income_per_family,...,income_per_weeks_birth_income_per_years_birth#percentile_10DAYS_EMPLOYED,income_per_weeks_birth_income_per_years_birth#percentile_60DAYS_EMPLOYED,income_per_weeks_birth_income_per_years_birth#percentile_90DAYS_EMPLOYED,income_per_weeks_birth_income_per_years_birth#mean#family_size,income_per_weeks_birth_income_per_years_birth#std#family_size,income_per_weeks_birth_income_per_years_birth#var#family_size,income_per_weeks_birth_income_per_years_birth#max#family_size,income_per_weeks_birth_income_per_years_birth#min#family_size,income_per_weeks_birth_income_per_years_birth#ptp#family_size,income_per_weeks_birth_income_per_years_birth#medianfamily_size,income_per_weeks_birth_income_per_years_birth#skewfamily_size,income_per_weeks_birth_income_per_years_birth#percentile_10family_size,income_per_weeks_birth_income_per_years_birth#percentile_60family_size,income_per_weeks_birth_income_per_years_birth#percentile_90family_size,income_per_weeks_birth_income_per_years_birth#mean#begin_month,income_per_weeks_birth_income_per_years_birth#std#begin_month,income_per_weeks_birth_income_per_years_birth#var#begin_month,income_per_weeks_birth_income_per_years_birth#max#begin_month,income_per_weeks_birth_income_per_years_birth#min#begin_month,income_per_weeks_birth_income_per_years_birth#ptp#begin_month,income_per_weeks_birth_income_per_years_birth#medianbegin_month,income_per_weeks_birth_income_per_years_birth#skewbegin_month,income_per_weeks_birth_income_per_years_birth#percentile_10begin_month,income_per_weeks_birth_income_per_years_birth#percentile_60begin_month,income_per_weeks_birth_income_per_years_birth#percentile_90begin_month
0,0,0,0,202500.0,0,1,1,2,13899,4709.0,0,0,0,1,2.0,-6.0,1.0,4,18,64,201,940,797,8510,0,...,4709.0,4709.0,4709.0,2.0,0.0,0.0,2.0,2.0,0.0,2.0,0.0,2.0,2.0,2.0,-33.2,19.942417,397.7,-6.0,-58.0,52.0,-31.0,0.175234,-53.2,-28.6,-13.6
1,0,0,1,247500.0,0,4,0,1,11380,1540.0,0,0,1,9,3.0,-5.0,1.0,34,11,358,117,452,441,5432,339,...,1540.0,1540.0,1540.0,3.0,0.0,0.0,3.0,3.0,0.0,3.0,0.0,3.0,3.0,3.0,-7.181818,3.311138,10.963636,-3.0,-13.0,10.0,-6.0,-0.593003,-11.0,-5.0,-4.0


In [4]:
# Persist the engineered feature frame to disk; the training cell reloads it from 'data.pkl'.
df.to_pickle('data.pkl')

In [3]:
%time
df = pd.read_pickle('data.pkl').reset_index(drop = True)

# learning
trn = df[df.credit.notna()]
tst = df[df.credit.isna()].drop('credit', axis = 1)
print(trn.shape, tst.shape)

tst_preds_lgbm, feat_im_lgbm = train_model(LGBMClassifier(**lgbm_params), trn, tst, cv = 20)
tst_preds_xgb, feat_im_xgb = train_model(XGBClassifier(**xgb_params), trn, tst, cv = 20)

Wall time: 0 ns
(26457, 8876) (10000, 8875)
1/20번째 폴드 시작..........
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[272]	training's multi_logloss: 0.430961	valid_1's multi_logloss: 0.672956
2/20번째 폴드 시작..........
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[311]	training's multi_logloss: 0.410859	valid_1's multi_logloss: 0.651845
3/20번째 폴드 시작..........
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[305]	training's multi_logloss: 0.414168	valid_1's multi_logloss: 0.637326
4/20번째 폴드 시작..........
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[261]	training's multi_logloss: 0.437836	valid_1's multi_logloss: 0.677995
5/20번째 폴드 시작..........
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[274]	training's multi_logloss: 0.43019	valid_1's multi_

Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[256]	training's multi_logloss: 0.439377	valid_1's multi_logloss: 0.680238
13/20번째 폴드 시작..........
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[300]	training's multi_logloss: 0.416791	valid_1's multi_logloss: 0.644675
14/20번째 폴드 시작..........
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[280]	training's multi_logloss: 0.426952	valid_1's multi_logloss: 0.657352
15/20번째 폴드 시작..........
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[287]	training's multi_logloss: 0.422429	valid_1's multi_logloss: 0.665549
16/20번째 폴드 시작..........
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[262]	training's multi_logloss: 0.437436	valid_1's multi_logloss: 0.663197
17/20번째 폴드 시작..........
Training until valid

Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 30 rounds.
[500]	validation_0-mlogloss:0.45497	validation_1-mlogloss:0.69125
Stopping. Best iteration:
[650]	validation_0-mlogloss:0.41959	validation_1-mlogloss:0.68857

7/20번째 폴드 시작..........
[0]	validation_0-mlogloss:1.09146	validation_1-mlogloss:1.09192
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 30 rounds.
[500]	validation_0-mlogloss:0.45842	validation_1-mlogloss:0.64426
Stopping. Best iteration:
[703]	validation_0-mlogloss:0.41258	validation_1-mlogloss:0.63891

8/20번째 폴드 시작..........
[0]	validation_0-mlogloss:1.09143	validation_1-mlogloss:1.09217
Multiple eval metrics have been passed: 'validation_1-mlogloss' will be used for early stopping.

Will train until validation_1-mlogloss hasn't improved in 30 rounds.
[500]	v

In [4]:
def make_submission(xgb_preds, lgbm_preds, xgb_ratio=0.5, cv=5):
    """Blend per-fold XGBoost and LightGBM predictions and write 'submission.csv'.

    Parameters
    ----------
    xgb_preds, lgbm_preds : sequence of array-like
        One probability array per fold, all of the same shape
        (rows x classes). Each sequence is averaged over its folds.
    xgb_ratio : float
        Weight of the XGBoost average; LightGBM gets ``1 - xgb_ratio``.
    cv : int
        Kept for backward compatibility only. The number of folds is taken
        from the prediction sequences themselves, so a value that disagrees
        with the real fold count can no longer mis-scale the probabilities.

    Raises
    ------
    ValueError
        If either prediction sequence is empty.

    Side effects
    ------------
    Reads 'sample_submission.csv' (for the index and column labels) and writes
    the blended probabilities to 'submission.csv'.
    """
    def _fold_mean(fold_preds, label):
        # Average over the folds actually supplied; the output shape follows
        # the predictions instead of a hard-coded (10000, 3).
        folds = [np.asarray(pred, dtype=float) for pred in fold_preds]
        if not folds:
            raise ValueError(f"{label} is empty; there is nothing to average")
        return np.sum(folds, axis=0) / len(folds)

    xgb_result = _fold_mean(xgb_preds, 'xgb_preds')
    lgbm_result = _fold_mean(lgbm_preds, 'lgbm_preds')
    result = xgb_result * xgb_ratio + lgbm_result * (1 - xgb_ratio)

    # The sample file only provides the index and column labels.
    submission = pd.read_csv('sample_submission.csv', index_col = 'index')
    submission = pd.DataFrame(
        index = submission.index,
        columns = submission.columns,
        data = result
    )
    submission.to_csv('submission.csv')
# Blend the 20-fold XGBoost and LightGBM test predictions with equal weight and write the submission file.
make_submission(tst_preds_xgb, tst_preds_lgbm, 0.5, cv=20)