In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings(action = 'ignore')

In [2]:
## 데이터 불러오기
trn = pd.read_csv('data/train.csv')
tst = pd.read_csv('data/test.csv')

## 변수 생성
- 종성형 변수와 내 변수
- 연봉까지 포함하여 만든 카드 개수 변수가 begin_month 제외한 카드 개수 변수와 얼마나 일치하는지 보기
- 내가 만들었던 변수 넣기

In [3]:
## 나이 변환
def days_to_age(x):
    return (x*-1)/365

## 신용카드 발급월 양수로 전환
def begin_plus_month(x):
    return (x*-1)

## 신용카드 발급월 연도로 전환
def begin_plus_year(x):
    return (x)//12

## 업무 시작일 변환
def plus(x):
    if x<0:
        return x*(-1)
    else:
        return 0


## 나이로 변환
trn['birth'] = days_to_age(trn.DAYS_BIRTH).round()

## 나이 구간화
trn['age']=(trn.DAYS_BIRTH*(-1)/365).apply(lambda x:str(x)[0]+'0')
# trn.drop('DAYS_BIRTH', axis=1, inplace = True)

## 신용카드 발급월 변수의 변환
trn['begin_month_plus'] = begin_plus_month(trn.begin_month).round()
# trn.drop('begin_month', axis=1, inplace = True)

## 신용카드 발급월 연차로 변환
trn['begin_month_year'] = begin_plus_year(trn.begin_month_plus)

## 신용카드 발급월 구간화
trn['begin_month_bin'] = (trn.begin_month_plus*10).apply(lambda x:str(x)[0]+'0')

## 업무시작일 양수로 변환
trn['employed_plus'] = trn.DAYS_EMPLOYED.map(plus)

## 0과 1로 범주화 (업무를 하는 사람과 아닌 사람)
trn['employed_bin'] = trn.employed_plus.map(lambda x : 0 if x==0 else 1)

## 업무 연차로 변환
trn['employed_bin2'] = trn['employed_plus']//365


#### test 데이터
## 나이로 변환
tst['birth'] = days_to_age(tst.DAYS_BIRTH).round()

## 나이 구간화
tst['age']=(tst.DAYS_BIRTH*(-1)/365).apply(lambda x:str(x)[0]+'0')
# trn.drop('DAYS_BIRTH', axis=1, inplace = True)

## 신용카드 발급월 변수의 변환
tst['begin_month_plus'] = begin_plus_month(tst.begin_month).round()
# trn.drop('begin_month', axis=1, inplace = True)

## 신용카드 발급월 연차로 변환
tst['begin_month_year'] = begin_plus_year(tst.begin_month_plus)

## 신용카드 발급월 구간화
tst['begin_month_bin'] = (tst.begin_month_plus*10).apply(lambda x:str(x)[0]+'0')

## 업무시작일 양수로 변환
tst['employed_plus'] = tst.DAYS_EMPLOYED.map(plus)

## 0과 1로 범주화 (업무를 하는 사람과 아닌 사람)
tst['employed_bin'] = tst.employed_plus.map(lambda x : 0 if x==0 else 1)

## 업무 연차로 변환
tst['employed_bin2'] = tst['employed_plus']//365

In [4]:
### train 데이터 조정

#식별자 변수-1
trn['ident']=''
for i in range(trn.shape[1]-4):
    trn['ident']=trn['ident']+trn.drop(['index','begin_month','credit'],axis=1).iloc[:,i].astype(str)

#카드 순위
trn['rank_card']=trn.groupby('ident').begin_month.rank()
trn=pd.merge(trn,trn.groupby('ident').index.count(),how='left',on='ident')
trn=pd.merge(trn,trn.groupby('ident').begin_month.std(),how='left',on='ident')
trn=pd.merge(trn,trn.groupby('ident').begin_month_x.max(),how='left',on='ident')
trn=pd.merge(trn,trn.groupby('ident').begin_month_x_x.min(),how='left',on='ident')

# 카드 개수 : begin_month 제외
trn['num_card']=trn['index_y']

# 카드 발급일 표준편차, 최소, 최대값
trn['begin_month_std']=trn['begin_month_y']
trn['begin_month_max']=trn['begin_month_x_y']
trn['begin_month_min']=trn['begin_month_x_x_y']
trn['begin_month_std'].fillna(100,inplace=True)
trn['begin_month']=trn.begin_month_x_x_x

## 중복된 관측치 = 1 // 아니면 = 0
trn['same_card_human1'] = (trn['num_card']>1).astype(int)

trn=trn.drop(['ident','index_y','begin_month_y','begin_month_x_y','begin_month_x_x_y','begin_month_x_x_x'],axis=1)

# 카드 발급일 포함 식별자 변수-2
trn['ident']=''
for i in range(trn.shape[1]-3):
    trn['ident']=trn['ident']+trn.drop(['index_x','credit'],axis=1).iloc[:,i].astype(str)

trn=pd.merge(trn,trn.groupby('ident').gender.count(),how='left',on='ident')

# 중복 카드 개수 : begin_month 포함(같은 날에 여러개 만든 사람)
trn['dup_card']=trn.gender_y
trn['gender']=trn.gender_x

## 중복된 관측치 = 1 // 아니면 = 0
trn['same_card_human2'] = (trn['dup_card']>1).astype(int)

trn.drop(['gender_y','gender_x','ident'],axis=1,inplace=True)

# 고용 당시 연령
trn['age_employed']=trn['DAYS_BIRTH']-trn['DAYS_EMPLOYED']

# 카드 발급 당시 연령
trn['age_begin']=trn['DAYS_BIRTH']/30-trn['begin_month']

# 카드 발급 당시 회사 근속 연차(월)
trn['begin_employed']=trn['DAYS_EMPLOYED']/30-trn['begin_month']

# 자산 수
trn['asset']=trn['car'].replace(['Y','N'],[1,0])+trn['reality'].replace(['Y','N'],[1,0])

# 카드 발급 월 카테고리
trn['begin_month_cate']=(trn.begin_month%12).astype('category')

# 생월 카테고리
trn['birth_month']=((trn.DAYS_BIRTH/30).astype(int)%12).astype('category')

# 고용 월 카테고리
trn['employed_month']=((trn.DAYS_EMPLOYED/30).astype(int)%12).astype('category')


### test 데이터 조정
tst['ident']=''
for i in range(tst.shape[1]-3):
    tst['ident']=tst['ident']+tst.drop(['index','begin_month'],axis=1).iloc[:,i].astype(str)

tst['rank_card']=tst.groupby('ident').begin_month.rank()  
tst=pd.merge(tst,tst.groupby('ident').index.count(),how='left',on='ident')
tst=pd.merge(tst,tst.groupby('ident').begin_month.std(),how='left',on='ident')
tst=pd.merge(tst,tst.groupby('ident').begin_month_x.max(),how='left',on='ident')
tst=pd.merge(tst,tst.groupby('ident').begin_month_x_x.min(),how='left',on='ident')


tst['num_card']=tst['index_y']
tst['begin_month_std']=tst['begin_month_y']
tst['begin_month_max']=tst['begin_month_x_y']
tst['begin_month_min']=tst['begin_month_x_x_y']
tst['begin_month_std'].fillna(100,inplace=True)
tst['begin_month']=tst.begin_month_x_x_x

## 중복된 관측치 = 1 // 아니면 = 0
tst['same_card_human1'] = (tst['num_card']>1).astype(int)

tst=tst.drop(['ident','index_y','begin_month_y','begin_month_x_y','begin_month_x_x_y','begin_month_x_x_x'],axis=1)

tst['ident']=''
for i in range(tst.shape[1]-2):
    tst['ident']=tst['ident']+tst.drop(['index_x'],axis=1).iloc[:,i].astype(str)

tst=pd.merge(tst,tst.groupby('ident').gender.count(),how='left',on='ident')
tst['dup_card']=tst.gender_y
tst['gender']=tst.gender_x

## 중복된 관측치 = 1 // 아니면 = 0
tst['same_card_human2'] = (tst['dup_card']>1).astype(int)

tst.drop(['gender_y','gender_x','ident'],axis=1,inplace=True)

tst['age_employed']=tst['DAYS_BIRTH']-tst['DAYS_EMPLOYED']
tst['age_begin']=tst['DAYS_BIRTH']/30-tst['begin_month']
tst['begin_employed']=tst['DAYS_EMPLOYED']/30-tst['begin_month']

tst['asset']=tst['car'].replace(['Y','N'],[1,0])+tst['reality'].replace(['Y','N'],[1,0])
tst['begin_month_cate']=(tst.begin_month%12).astype('category')

tst['birth_month']=((tst.DAYS_BIRTH/30).astype(int)%12).astype('category')

tst['employed_month']=((tst.DAYS_EMPLOYED/30).astype(int)%12).astype('category')

trn.drop(['index_x'],axis=1,inplace=True)
tst.drop(['index_x'],axis=1,inplace=True)

In [5]:
## 변수 포함 여부 확인
trn.columns[trn.columns.isin(tst.columns)==False]

Index(['credit'], dtype='object')

In [6]:
print(trn.shape)
print(tst.shape)

(26457, 42)
(10000, 41)


In [7]:
## 독립변수 중 category 변수 int 형으로 전환
trn.begin_month_bin = trn.begin_month_bin.astype(int)
trn.age = trn.begin_month_bin.astype(int)
trn.begin_month_cate = trn.begin_month_cate.astype(int)
trn.birth_month = trn.birth_month.astype(int)
trn.employed_month = trn.employed_month.astype(int)

tst.begin_month_bin = tst.begin_month_bin.astype(int)
tst.age = tst.begin_month_bin.astype(int)
tst.begin_month_cate = tst.begin_month_cate.astype(int)
tst.birth_month = tst.birth_month.astype(int)
tst.employed_month = tst.employed_month.astype(int)

## Label Encoding

## 라벨 인코딩 조정

In [8]:
## edu_type 내림차순으로 정렬 및 인코딩
lbe = LabelEncoder()
lbe.fit(list(trn.groupby('edu_type')['income_total'].mean().sort_values(ascending = False).index))
trn.edu_type = lbe.transform(trn.edu_type)
tst.edu_type = lbe.transform(tst.edu_type)

## house_type 내림차순으로 정렬 및 인코딩
lbe = LabelEncoder()
lbe.fit(list(trn.groupby('house_type')['income_total'].mean().sort_values(ascending = False).index))
trn.house_type = lbe.transform(trn.house_type)
tst.house_type = lbe.transform(tst.house_type)

## occyp_type의 결측값들을 NONE이라는 범주로 만듦
trn['occyp_type'] = trn['occyp_type'].fillna('NONE').astype('object').values
tst['occyp_type'] = tst['occyp_type'].fillna('NONE').astype('object').values

## occyp_type 내림차순으로 정렬 및 인코딩
lbe = LabelEncoder()
lbe.fit(list(trn.groupby('occyp_type')['income_total'].mean().sort_values(ascending = False).index))
trn.occyp_type = lbe.transform(trn.occyp_type)
tst.occyp_type = lbe.transform(tst.occyp_type)

## income_type 내림차순으로 정렬 및 인코딩
lbe = LabelEncoder()
lbe.fit(list(trn.groupby('income_type')['income_total'].mean().sort_values(ascending = False).index))
trn.income_type = lbe.transform(trn.income_type)
tst.income_type = lbe.transform(tst.income_type)

In [9]:
index_col = 'index'
target_col = 'credit'

cat_cols = [x for x in trn.columns if trn[x].dtype == 'object']
float_cols = [x for x in trn.columns.drop('credit') if trn[x].dtype == 'float64']
num_cols = [x for x in trn.columns if x not in cat_cols + [target_col]]
feature_cols = num_cols + cat_cols
print(len(feature_cols), len(cat_cols), len(num_cols))

from sklearn.preprocessing import LabelEncoder

lbe = LabelEncoder()
for i in cat_cols:
    trn[i] = lbe.fit_transform(trn[i])
    tst[i] = lbe.transform(tst[i])
    
for i in float_cols:
    trn[i] = trn[i].astype('int')
    tst[i] = tst[i].astype('int')

41 4 37


In [10]:
trn.credit = trn.credit.astype('category')

In [11]:
trn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26457 entries, 0 to 26456
Data columns (total 42 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   car               26457 non-null  int32   
 1   reality           26457 non-null  int32   
 2   child_num         26457 non-null  int64   
 3   income_total      26457 non-null  int32   
 4   income_type       26457 non-null  int32   
 5   edu_type          26457 non-null  int32   
 6   family_type       26457 non-null  int32   
 7   house_type        26457 non-null  int32   
 8   DAYS_BIRTH        26457 non-null  int64   
 9   DAYS_EMPLOYED     26457 non-null  int64   
 10  FLAG_MOBIL        26457 non-null  int64   
 11  work_phone        26457 non-null  int64   
 12  phone             26457 non-null  int64   
 13  email             26457 non-null  int64   
 14  occyp_type        26457 non-null  int32   
 15  family_size       26457 non-null  int32   
 16  credit            2645

## 변수 찾기
- plot importance를 하며 확인

In [13]:
from sklearn.ensemble import ExtraTreesClassifier

# train과 valid(test) 셋을 분리 
x_train, x_valid, y_train, y_valid = train_test_split(trn.drop('credit', axis=1), trn.credit, test_size=0.3, stratify=trn.credit, random_state=40)

# ExtraTreesClassifier 적용
extra = ExtraTreesClassifier(n_estimators = 5000,
                            max_features = 5,
                            min_samples_split = 6,
                            random_state = 2021,
                            n_jobs = 6)
extra.fit(x_train, y_train)

ExtraTreesClassifier(max_features=5, min_samples_split=6, n_estimators=5000,
                     n_jobs=6, random_state=2021)

In [14]:
extra_pred = extra.predict_proba(x_valid)

In [15]:
log_loss(y_valid, extra_pred)

0.7894756728247738

## 그리드 서치로 파라미터 찾기

In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [21]:
param_grid = {
    'n_estimators': [1000, 2000, 3000, 4000],
    'max_features': [3, 4, 5, 6, 7, 8, 9],
    'min_samples_split': [4,5,6,7,8,9,10],
    'random_state' : [2021]
}

In [22]:
estimator = ExtraTreesClassifier()

In [23]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=40)

In [24]:
# define grid_search
grid_search = GridSearchCV(estimator=estimator, 
                           param_grid=param_grid, 
                           cv=kf, 
                           n_jobs=2, 
                           verbose=2
                          )

# fit with (x_train, y_train)
grid_search.fit(x_train, y_train)

Fitting 5 folds for each of 196 candidates, totalling 980 fits


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=40, shuffle=True),
             estimator=ExtraTreesClassifier(), n_jobs=2,
             param_grid={'max_features': [3, 4, 5, 6, 7, 8, 9],
                         'min_samples_split': [4, 5, 6, 7, 8, 9, 10],
                         'n_estimators': [1000, 2000, 3000, 4000],
                         'random_state': [2021]},
             verbose=2)

In [25]:
grid_search.best_params_

{'max_features': 4,
 'min_samples_split': 10,
 'n_estimators': 1000,
 'random_state': 2021}

# permutation importance

In [79]:
import eli5 
from eli5.sklearn import PermutationImportance 

In [24]:
perm = PermutationImportance(rf, 
                             scoring = 'neg_log_loss',
                             random_state = 2021).fit(x_valid, y_valid)

In [25]:
eli5.show_weights(perm, top = ftr.shape[1], feature_names = x_valid.columns.tolist())

Weight,Feature
0.0203  ± 0.0017,DAYS_BIRTH
0.0189  ± 0.0008,age_employed
0.0154  ± 0.0007,employed_plus
0.0148  ± 0.0016,DAYS_EMPLOYED
0.0145  ± 0.0018,age_begin
0.0121  ± 0.0018,birth
0.0104  ± 0.0017,income_total
0.0088  ± 0.0017,begin_employed
0.0078  ± 0.0014,begin_month_max
0.0074  ± 0.0012,birth_month


## 퍼뮤테이션 기준 변수 선택

In [38]:
## 1차 퍼뮤테이션 변수 : RF 최고기록 사용
f_name = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year' ,'asset', 'phone','reality', 'work_phone','house_type','gender','child_num','car','email']
f_name2 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year' ,'asset', 'phone','reality', 'work_phone','house_type','gender','child_num','car']
f_name3 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year' ,'asset', 'phone','reality', 'work_phone','house_type','gender','child_num']
f_name4 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year' ,'asset', 'phone','reality', 'work_phone','house_type','gender']
f_name5 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year' ,'asset', 'phone','reality', 'work_phone','house_type']
f_name6 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year' ,'asset', 'phone','reality', 'work_phone']
f_name7 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year' ,'asset', 'phone','reality']
f_name8 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month']

# 9가 제일 좋음
f_name9 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year' ,'asset', 'phone']
f_name10 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year' ,'asset']
f_name11 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size','begin_month_year']
f_name12 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type','family_size']
f_name13 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type','income_type']
f_name14 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type','edu_type']
f_name15 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2','family_type']
f_name16 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type','employed_bin2']
f_name17 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month','occyp_type']
f_name18 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min','employed_month']
f_name19 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus','begin_month_min']
f_name20 = ['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus','DAYS_EMPLOYED', 'age_begin','birth','income_total', 'begin_employed','begin_month_max','birth_month','begin_month','begin_month_plus']

## 변수를 줄여가며 확인

In [30]:
from tqdm import tqdm

for i in tqdm([f_name, f_name2, f_name3, f_name4, f_name5, f_name6, f_name7, f_name8, f_name9, f_name10, f_name11, f_name12, f_name13, f_name14, f_name15, f_name16, f_name17, f_name18, f_name19, f_name20]):
    # train과 valid(test) 셋을 분리 
    x_train, x_valid, y_train, y_valid = train_test_split(trn[i].drop('credit', axis=1), trn.credit, test_size=0.3, stratify=trn.credit, random_state=40)

    # RF 모델링 
    ex = ExtraTreesClassifier(n_estimators = 5000,
                           max_features = 5,
                           min_samples_split = 6,
                            random_state = 2021,
                            n_jobs = 8
                           )
    ex.fit(x_train, y_train)
    ex_pred = ex.predict_proba(x_valid)
    print(i, log_loss(y_valid, ex_pred))

  5%|▌         | 1/20 [01:00<19:11, 60.61s/it]

['credit', 'DAYS_BIRTH', 'age_employed', 'employed_plus', 'DAYS_EMPLOYED', 'age_begin', 'birth', 'income_total', 'begin_employed', 'begin_month_max', 'birth_month', 'begin_month', 'begin_month_plus', 'begin_month_min', 'employed_month', 'occyp_type', 'employed_bin2', 'family_type', 'edu_type', 'income_type', 'family_size', 'begin_month_year', 'asset', 'phone', 'reality', 'work_phone', 'house_type', 'gender', 'child_num', 'car', 'email'] 0.7693498723354302


  5%|▌         | 1/20 [01:19<25:11, 79.53s/it]


KeyboardInterrupt: 

## 교차검증을 사용하여 확인
- max_features = 3, min_samples_split = 6 : 이 2개를 넣으면 변수가 많아서 그런지 cv=5로 해서 돌리면 최고가 0.7386759055883846이다. (email까지 사용)
- 디폴트 값으로 돌린 다음 제일 좋은 것을 파라미터 튜닝하여 사용해보기 (1번)
- 상관관계 확인하여 지우기 (2번)
- 라벨인코딩을 새로이 하여 상관관계 확인하기 (2번에서 하기)

In [37]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=40)

print(trn.shape)
print(tst.shape)
n_class = 3
n_fold = 5

for j in tqdm([f_name, f_name2, f_name3, f_name4, f_name5, f_name6, f_name7, f_name8, f_name9, f_name10, f_name11, f_name12, f_name13, f_name14, f_name15, f_name16, f_name17, f_name18, f_name19, f_name20]):
    j.remove('credit')
    ftr = trn[j].values
    tst_ar = tst[j].values
    ext_p_val = np.zeros((ftr.shape[0], n_class))
    ext_p_tst = np.zeros((tst_ar.shape[0], n_class))
    for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
        print(f'training model for CV #{i}')
        ext_clf = ExtraTreesClassifier(n_estimators = 5000, 
                                random_state=2021,
                                max_features = 3,
                                min_samples_split = 6,
                                n_jobs=10)
        ext_clf.fit(ftr[i_trn], target[i_trn])
        ext_p_val[i_val, :] = ext_clf.predict_proba(ftr[i_val])
        ext_p_tst += ext_clf.predict_proba(tst_ar) / n_fold
    
    print(j)
    print(log_loss(target, ext_p_val))
    print(confusion_matrix(target, np.argmax(ext_p_val, axis=1)))

  0%|          | 0/20 [00:00<?, ?it/s]

(26457, 42)
(10000, 41)
training model for CV #1


  0%|          | 0/20 [00:08<?, ?it/s]


KeyboardInterrupt: 

## 교차 검증 확인

In [None]:
## predict_rf
ftr = trn6.drop('credit', axis=1).values
target = trn['credit'].values
tst_ar = tst.drop(['family_size', 'employed_plus'], axis = 1).values
n_class = 3
n_fold = 10

print(ftr.shape)
print(tst_ar.shape)

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

print(ftr.shape)

ext_p_val = np.zeros((ftr.shape[0], n_class))
ext_p_tst = np.zeros((tst_ar.shape[0], n_class))

for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    ext_clf = ExtraTreesClassifier(n_estimators=5000, 
                                   random_state=2021)
    
    ext_clf.fit(ftr[i_trn], tst_ar[i_trn])
    ext_p_val[i_val, :] = ext_clf.predict_proba(ftr[i_val])
    ext_p_tst += ext_clf.predict_proba(tst_ar) / n_fold
    
    print(log_loss(target[i_val],np.argmax(ext_p_val[i_val],axis=1)))

print(f'{log_loss(target, ext_p_val)}')
print(confusion_matrix(target, np.argmax(ext_p_val, axis=1)))

In [122]:
# train과 valid(test) 셋을 분리 
x_train, x_valid, y_train, y_valid = train_test_split(trn.drop('credit', axis=1), trn.credit, test_size=0.3, stratify=trn.credit, random_state=40)

# RF 모델링 
rf = RandomForestClassifier(n_estimators = 5000,
                           max_features = 5,
                           min_samples_split = 6,
                            random_state = 2021,
                            n_jobs = -1
                           )
rf.fit(x_train, y_train)

RandomForestClassifier(max_features=5, min_samples_split=6, n_estimators=5000,
                       n_jobs=-1, random_state=2021)

In [123]:
rf_pred = rf.predict_proba(x_valid)

In [124]:
log_loss(y_valid, rf_pred)

0.7617244533856751

## 벨리드 검정 기준1 (변수 20개)
- 0.7163870280361224
: rf = RandomForestClassifier(n_estimators = 1000,
                           max_features = 3,
                            min_samples_split = 5,
                            random_state = 2021,
                            n_jobs = -1
                            
                            
- 0.7123578685866444
: rf = RandomForestClassifier(n_estimators = 1000,
                           max_features = 3,
                            min_samples_split = 6,
                            random_state = 2021,
                            n_jobs = -1
                           )
                           
- 0.7120824800381204
: rf = RandomForestClassifier(n_estimators = 5000,
                           max_features = 3,
                            min_samples_split = 6,
                            random_state = 2021,
                            n_jobs = -1
                           )

## 벨리드 검정 기준2
- 종성형 변수와 내 변수 모두 넣고 돌린것 
: 0.7722579318936101

- 위 조건 + 파라미터 2개 추가
: 0.76

- 위 조건에서 max_features = 5로 변경
: 0.7617244533856751 (조금 더 내려감)

## <변수 조합 찾기>

In [125]:
print(trn.shape)
print(tst.shape)

(26457, 42)
(10000, 41)


In [126]:
from itertools import combinations

In [None]:
list(combinations(trn.columns, 2))

In [None]:
for i in list(combinations(20)):

In [None]:
# 테스트 예측
x_train, x_valid, y_train, y_valid = train_test_split(trn.drop('credit', axis=1), trn.credit, test_size=0.3, stratify=trn.credit, random_state=40)

# RF 모델링 
rf = RandomForestClassifier(n_estimators = 1000,
                           max_features = 3,
                            min_samples_split = 5,
                            random_state = 2021,
                            n_jobs = -1
                           )
rf.fit(trn.drop('credit', axis=1), trn.credit)

In [74]:
rf_pred = rf.predict_proba(tst)

In [75]:
rf_pred

array([[0.07131905, 0.21075974, 0.71792121],
       [0.34897536, 0.27639733, 0.37462731],
       [0.04357063, 0.04397911, 0.91245025],
       ...,
       [0.01549643, 0.09656677, 0.8879368 ],
       [0.26784917, 0.4597715 , 0.27237933],
       [0.10945581, 0.25875794, 0.63178626]])

## 제출하기

In [76]:
sub = pd.read_csv('data/sample_submission.csv', index_col = 0)
sub[sub.columns] = rf_pred
sub.head()

In [78]:
sub.to_csv('data/R_베이스라인.csv')