# **** 여기부터 하기 ****

In [14]:
import pandas as pd
import numpy as np
from collections import Counter
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss

import warnings
warnings.filterwarnings(action = 'ignore')

In [40]:
# Load the train and test sets, using the first CSV column as the row index.
trn = pd.read_csv('data/train.csv', index_col=0)
tst = pd.read_csv('data/test.csv', index_col=0)

# Copy of the training frame without the 'credit' column.
trn2 = trn.drop(columns='credit')

# Train features (without 'credit') stacked row-wise on top of the test rows.
df = pd.concat([trn2, tst], axis=0)

In [41]:
## Age conversion
def days_to_age(x):
    """Negate ``x`` and divide it by 365.

    Works on plain numbers and on anything supporting unary minus and
    true division (e.g. a pandas Series).
    """
    return -x / 365

## Flip the sign of the credit-card issue month
def begin_plus_month(x):
    """Return ``x`` with its sign flipped."""
    return -x

## Convert the credit-card issue month count to years
def begin_plus_year(x):
    """Return ``x`` floor-divided by 12."""
    whole_years = x // 12
    return whole_years

## Employment start-day conversion
def plus(x):
    """Return ``-x`` for negative ``x``; return 0 for everything else."""
    return -x if x < 0 else 0

# 변수 생성

## 연속형 변수 처리

In [42]:
## Age: DAYS_BIRTH negated and divided by 365, then rounded
trn['birth'] = days_to_age(trn['DAYS_BIRTH']).round()

## Bucket age into 10-wide bins; each bin is labelled with its upper edge
age_edges = list(range(0, 80, 10))
age_labels = [str(edge) for edge in age_edges[1:]]
trn['age'] = pd.cut(trn['birth'], bins=age_edges, labels=age_labels).astype(int)
trn['age']

## Sign-flipped begin_month
trn['begin_month_plus'] = begin_plus_month(trn['begin_month']).round()

## Sign-flipped begin_month floor-divided by 12
trn['begin_month_year'] = begin_plus_year(trn['begin_month_plus'])

## Bucket the sign-flipped begin_month; the first bin starts at -10,
## every bin is labelled with its upper edge
month_edges = [-10] + list(range(10, 80, 10))
month_labels = [str(edge) for edge in month_edges[1:]]
trn['begin_month_bin'] = pd.cut(trn['begin_month_plus'], bins=month_edges, labels=month_labels).astype(int)
trn['begin_month_bin'].value_counts()

## DAYS_EMPLOYED: negative values are sign-flipped, all others become 0
trn['employed_plus'] = trn['DAYS_EMPLOYED'].map(plus)
trn['employed_plus']

## Binary flag: 0 where employed_plus is 0, otherwise 1
trn['employed_bin'] = trn['employed_plus'].map(lambda days: 1 if days != 0 else 0)

## employed_plus floor-divided by 365
trn['employed_bin2'] = trn['employed_plus'] // 365
trn['employed_bin2']

# trn['credit'] = trn.credit.astype('object')

index
0        12
1         4
2        12
3         5
4         5
         ..
26452     5
26453     6
26454     5
26455     0
26456     2
Name: employed_bin2, Length: 26457, dtype: int64

In [43]:
## Age: DAYS_BIRTH negated and divided by 365, then rounded
tst['birth'] = days_to_age(tst['DAYS_BIRTH']).round()

## Bucket age into 10-wide bins; each bin is labelled with its upper edge
age_edges = list(range(0, 80, 10))
age_labels = [str(edge) for edge in age_edges[1:]]
tst['age'] = pd.cut(tst['birth'], bins=age_edges, labels=age_labels).astype(int)

## Sign-flipped begin_month
tst['begin_month_plus'] = begin_plus_month(tst['begin_month']).round()

## Sign-flipped begin_month floor-divided by 12
tst['begin_month_year'] = begin_plus_year(tst['begin_month_plus'])

## Bucket the sign-flipped begin_month; the first bin starts at -10,
## every bin is labelled with its upper edge
month_edges = [-10] + list(range(10, 80, 10))
month_labels = [str(edge) for edge in month_edges[1:]]
tst['begin_month_bin'] = pd.cut(tst['begin_month_plus'], bins=month_edges, labels=month_labels).astype(int)

## DAYS_EMPLOYED: negative values are sign-flipped, all others become 0
tst['employed_plus'] = tst['DAYS_EMPLOYED'].map(plus)

## Binary flag: 0 where employed_plus is 0, otherwise 1
tst['employed_bin'] = tst['employed_plus'].map(lambda days: 1 if days != 0 else 0)

## employed_plus floor-divided by 365
tst['employed_bin2'] = tst['employed_plus'] // 365


##  범주형 변수

In [38]:
## Recode the two-valued columns as the strings '0' / '1':
##   gender  : F -> '0', M -> '1'
##   car     : N -> '0', Y -> '1'
##   reality : N -> '0', Y -> '1'
binary_codes = {
    'gender': {'F': '0', 'M': '1'},
    'car': {'N': '0', 'Y': '1'},
    'reality': {'N': '0', 'Y': '1'},
}
for column, codes in binary_codes.items():
    trn[column] = trn[column].map(codes)

# trn['edu_type'] = trn.edu_type.map({'Academic degree' : '0', 
#                   "Lower secondary" : '1', 
#                   'Incomplete higher' : '2',
#                  'Secondary / secondary special':'3',
#                  'Higher education':4})

# trn['income_type'] = trn.income_type.map({'Commercial associate' : '0', 
#                   "Working" : '1', 
#                   'State servant' : '2',
#                  'Pensioner':'3',
#                  'Student':'4'})

# trn['family_type'] = trn.family_type.map({'Married' : '0', 
#                   "Civil marriage" : '1', 
#                   'Separated' : '2',
#                  'Single / not married':'3',
#                  'Widow':'4'})

# trn['house_type'] = trn.house_type.map({'Municipal apartment' : '0', 
#                   "House / apartment" : '1', 
#                   'With parents' : '2',
#                  'Co-op apartment':'3',
#                  'Rented apartment':'4',
#                   'Office apartment' : '5'  })

In [39]:
## Recode the two-valued columns as the strings '0' / '1':
##   gender  : F -> '0', M -> '1'
##   car     : N -> '0', Y -> '1'
##   reality : N -> '0', Y -> '1'
binary_codes = {
    'gender': {'F': '0', 'M': '1'},
    'car': {'N': '0', 'Y': '1'},
    'reality': {'N': '0', 'Y': '1'},
}
for column, codes in binary_codes.items():
    tst[column] = tst[column].map(codes)

# tst['edu_type'] = tst.edu_type.map({'Academic degree' : 0, 
#                   "Lower secondary" : 1, 
#                   'Incomplete higher' : 2,
#                  'Secondary / secondary special':3,
#                  'Higher education':4})

# tst['income_type'] = tst.income_type.map({'Commercial associate' : '0', 
#                   "Working" : '1', 
#                   'State servant' : '2',
#                  'Pensioner':'3',
#                  'Student':'4'})

# tst['family_type'] = tst.family_type.map({'Married' : '0', 
#                   "Civil marriage" : '1', 
#                   'Separated' : '2',
#                  'Single / not married':'3',
#                  'Widow':'4'})

# tst['house_type'] = tst.house_type.map({'Municipal apartment' : '0', 
#                   "House / apartment" : '1', 
#                   'With parents' : '2',
#                  'Co-op apartment':'3',
#                  'Rented apartment':'4',
#                   'Office apartment' : '5'  })

## NONE 범주 사용

In [44]:
## Turn missing occyp_type values into an explicit 'NONE' category
for frame in (trn, tst):
    frame['occyp_type'] = frame['occyp_type'].fillna('NONE').astype(str).values

## 라벨 인코딩 (보류)

In [46]:
# Print the training frame's column names, non-null counts and dtypes.
trn.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26457 entries, 0 to 26456
Data columns (total 27 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            26457 non-null  object 
 1   car               26457 non-null  object 
 2   reality           26457 non-null  object 
 3   child_num         26457 non-null  int64  
 4   income_total      26457 non-null  float64
 5   income_type       26457 non-null  object 
 6   edu_type          26457 non-null  object 
 7   family_type       26457 non-null  object 
 8   house_type        26457 non-null  object 
 9   DAYS_BIRTH        26457 non-null  int64  
 10  DAYS_EMPLOYED     26457 non-null  int64  
 11  FLAG_MOBIL        26457 non-null  int64  
 12  work_phone        26457 non-null  int64  
 13  phone             26457 non-null  int64  
 14  email             26457 non-null  int64  
 15  occyp_type        26457 non-null  object 
 16  family_size       26457 non-null  float6

In [45]:
index_col = 'index'
target_col = 'credit'

# Split the training columns in one pass: object-dtype columns are
# categorical, everything else except the target is numeric.
cat_cols = []
num_cols = []
for name in trn.columns:
    if trn[name].dtype == 'object':
        cat_cols.append(name)
    elif name != target_col:
        num_cols.append(name)

# Numeric features first, then categorical ones.
feature_cols = num_cols + cat_cols
print(len(feature_cols), len(cat_cols), len(num_cols))

26 8 18


In [47]:
from sklearn.preprocessing import LabelEncoder

# One encoder object is refit for each categorical column: it learns the
# classes from the training column, then the same mapping is applied to
# both the training and the test column.
lbe = LabelEncoder()
for col in cat_cols:
    lbe.fit(trn[col])
    trn[col] = lbe.transform(trn[col])
    tst[col] = lbe.transform(tst[col])

In [49]:
# Show the first rows of the training frame.
trn.head()

Unnamed: 0_level_0,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,...,begin_month,credit,birth,age,begin_month_plus,begin_month_year,begin_month_bin,employed_plus,employed_bin,employed_bin2
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,202500.0,0,1,1,2,-13899,...,-6.0,1.0,38.0,40,6.0,0.0,10,4709,1,12
1,0,0,1,1,247500.0,0,4,0,1,-11380,...,-5.0,1.0,31.0,40,5.0,0.0,10,1540,1,4
2,1,1,1,0,450000.0,4,1,1,1,-19087,...,-22.0,2.0,52.0,60,22.0,1.0,30,4434,1,12
3,0,0,1,0,202500.0,0,4,1,1,-15088,...,-37.0,0.0,41.0,50,37.0,3.0,40,2092,1,5
4,0,1,1,0,157500.0,2,1,1,1,-15037,...,-26.0,2.0,41.0,50,26.0,2.0,30,2105,1,5


In [50]:
## No missing values
# Per-column null counts for both frames; as bare expressions, only the last
# one (tst) is what a notebook cell would display.
trn.isnull().sum()
tst.isnull().sum()

gender              0
car                 0
reality             0
child_num           0
income_total        0
income_type         0
edu_type            0
family_type         0
house_type          0
DAYS_BIRTH          0
DAYS_EMPLOYED       0
FLAG_MOBIL          0
work_phone          0
phone               0
email               0
occyp_type          0
family_size         0
begin_month         0
birth               0
age                 0
begin_month_plus    0
begin_month_year    0
begin_month_bin     0
employed_plus       0
employed_bin        0
employed_bin2       0
dtype: int64

# 로그로스 함수

In [153]:
# NOTE(review): y_test and RF_pred are not defined in any cell visible here;
# presumably left over from an earlier session — confirm before re-running.
log_loss(y_test, RF_pred)

0.7623861475096417

# 랜덤 포레스트

In [56]:
# Feature matrix: every training column except the label, as a NumPy array.
ftr = trn.drop(columns=['credit']).values
# Label vector.
target = trn['credit'].values
# Test matrix; used positionally, so its columns must line up with ftr's.
tst_ar = tst.values
# Number of target classes and number of cross-validation folds.
n_class, n_fold = 3, 5

In [57]:
# Show the (rows, columns) shape of the test matrix.
tst_ar.shape

(10000, 26)

In [59]:
## predict_rf
# Stratified K-fold training of a random forest.
#   rf_p_val : out-of-fold class probabilities for every training row
#   rf_p_tst : test-set class probabilities averaged over the folds

# The number of splits comes from n_fold so that it always matches the
# divisor used for averaging the test predictions below (it was previously
# hard-coded to 5 independently of n_fold).
cv = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=40)

rf_p_val = np.zeros((ftr.shape[0], n_class))
rf_p_tst = np.zeros((tst_ar.shape[0], n_class))
for i, (i_trn, i_val) in enumerate(cv.split(ftr, target), 1):
    print(f'training model for CV #{i}')
    rf_clf = RandomForestClassifier(n_estimators=2000,
                                    random_state=40,
                                    verbose=True,
                                    oob_score=True,
                                    n_jobs=10)
    rf_clf.fit(ftr[i_trn], target[i_trn])
    # Each training row is predicted exactly once, by the model that did not
    # see it during fitting.
    rf_p_val[i_val, :] = rf_clf.predict_proba(ftr[i_val])
    # Every fold model contributes an equal share to the test prediction.
    rf_p_tst += rf_clf.predict_proba(tst_ar) / n_fold

# Out-of-fold log loss, then the confusion matrix of the arg-max predictions.
print(f'{log_loss(target, rf_p_val)}')
print(f'{confusion_matrix(target, np.argmax(rf_p_val, axis=1))}%')

training model for CV #1


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    1.7s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    3.2s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    5.1s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    7.6s
[Parallel(n_jobs=10)]: Done 2000 out of 2000 | elapsed:    8.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    0.4s
[Parallel(n_job

training model for CV #2


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    1.8s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    3.3s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    5.2s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    7.5s
[Parallel(n_jobs=10)]: Done 2000 out of 2000 | elapsed:    8.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    0.4s
[Parallel(n_job

training model for CV #3


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    1.8s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    3.3s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    5.2s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    7.6s
[Parallel(n_jobs=10)]: Done 2000 out of 2000 | elapsed:    8.6s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    0.4s
[Parallel(n_job

training model for CV #4


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    1.8s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    3.3s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    5.3s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    7.7s
[Parallel(n_jobs=10)]: Done 2000 out of 2000 | elapsed:    8.7s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    0.4s
[Parallel(n_job

training model for CV #5


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.7s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    1.8s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    3.3s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    5.3s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    7.8s
[Parallel(n_jobs=10)]: Done 2000 out of 2000 | elapsed:    8.7s finished
[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:    0.1s
[Parallel(n_jobs=10)]: Done 1230 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    0.4s
[Parallel(n_job

0.7662433912513975
[[  598   510  2114]
 [  205  2508  3554]
 [  486  1090 15392]]%


[Parallel(n_jobs=10)]: Done 1780 tasks      | elapsed:    0.5s
[Parallel(n_jobs=10)]: Done 2000 out of 2000 | elapsed:    0.6s finished


# 제출하기

In [175]:
# Load the sample submission (first CSV column as index) as a template.
sub = pd.read_csv('data/sample_submission.csv', index_col = 0)
sub
# Overwrite every template column with the test-set probabilities.
# assumes the template has one column per class, in class order — TODO confirm
sub[sub.columns] = rf_p_tst

In [177]:
# Write the filled-in submission frame to CSV.
sub.to_csv('data/210428_rf_처음.csv')