# **라이브러리 불러오기**

In [75]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd   
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, log_loss
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

In [78]:
train=pd.read_csv('train.csv')
test=pd.read_csv('test.csv')
submission=pd.read_csv('sample_submission.csv')

# **[데이터 살펴보기]**

In [79]:
# obj, numeric- continuous, discrete
def type_check(train, cate_limit = 125):
    obj_feats = train.dtypes[train.dtypes == "object"].index
    numeric_feats = train.dtypes[train.dtypes != "object"].index
    
    numeric_continuous = []
    numeric_discrete = []

    for col in numeric_feats:
        if train[col].nunique() <=  cate_limit:
            numeric_discrete.append(col)
    numeric_continuous = np.setdiff1d(numeric_feats, numeric_discrete)
    print(f" obj_feats: {obj_feats}\n numeric_continuous : {numeric_continuous}\n numeric_discrete: {numeric_discrete}")

    return obj_feats,numeric_feats, numeric_continuous, numeric_discrete 

In [80]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit
0,0,F,N,N,0,202500.0,Commercial associate,Higher education,Married,Municipal apartment,-13899,-4709,1,0,0,0,,2.0,-6.0,1.0
1,1,F,N,Y,1,247500.0,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,-11380,-1540,1,0,0,1,Laborers,3.0,-5.0,1.0
2,2,M,Y,Y,0,450000.0,Working,Higher education,Married,House / apartment,-19087,-4434,1,0,1,0,Managers,2.0,-22.0,2.0
3,3,F,N,Y,0,202500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-15088,-2092,1,0,1,0,Sales staff,2.0,-37.0,0.0
4,4,F,Y,Y,0,157500.0,State servant,Higher education,Married,House / apartment,-15037,-2105,1,0,0,0,Managers,2.0,-26.0,2.0


In [81]:
test.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month
0,26457,M,Y,N,0,112500.0,Pensioner,Secondary / secondary special,Civil marriage,House / apartment,-21990,365243,1,0,1,0,,2.0,-60.0
1,26458,F,N,Y,0,135000.0,State servant,Higher education,Married,House / apartment,-18964,-8671,1,0,1,0,Core staff,2.0,-36.0
2,26459,F,N,Y,0,69372.0,Working,Secondary / secondary special,Married,House / apartment,-15887,-217,1,1,1,0,Laborers,2.0,-40.0
3,26460,M,Y,N,0,112500.0,Commercial associate,Secondary / secondary special,Married,House / apartment,-19270,-2531,1,1,0,0,Drivers,2.0,-41.0
4,26461,F,Y,Y,0,225000.0,State servant,Higher education,Married,House / apartment,-17822,-9385,1,1,0,0,Managers,2.0,-8.0


In [82]:
print(train.shape,test.shape)

(26457, 20) (10000, 19)


In [83]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26457 entries, 0 to 26456
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26457 non-null  int64  
 1   gender         26457 non-null  object 
 2   car            26457 non-null  object 
 3   reality        26457 non-null  object 
 4   child_num      26457 non-null  int64  
 5   income_total   26457 non-null  float64
 6   income_type    26457 non-null  object 
 7   edu_type       26457 non-null  object 
 8   family_type    26457 non-null  object 
 9   house_type     26457 non-null  object 
 10  DAYS_BIRTH     26457 non-null  int64  
 11  DAYS_EMPLOYED  26457 non-null  int64  
 12  FLAG_MOBIL     26457 non-null  int64  
 13  work_phone     26457 non-null  int64  
 14  phone          26457 non-null  int64  
 15  email          26457 non-null  int64  
 16  occyp_type     18286 non-null  object 
 17  family_size    26457 non-null  float64
 18  begin_

# **[데이터 전처리]**

**(주의) train data에 해준 그대로 test data 전처리**

# Features

### ID생성 / 중복컬럼 제거(train만)


In [84]:
train['ID'] = \
train['child_num'].astype(str) + '_' + train['income_total'].astype(str) + '_' +\
train['work_phone'].astype(str) + '_' + train['phone'].astype(str) + '_' +\
train['email'].astype(str) + '_' + train['family_size'].astype(str) + '_' +\
train['gender'].astype(str) + '_' + train['car'].astype(str) + '_' +\
train['reality'].astype(str) + '_' + train['income_type'].astype(str) + '_' +\
train['edu_type'].astype(str) + '_' + train['family_type'].astype(str) + '_' +\
train['house_type'].astype(str) + '_' + train['occyp_type'].astype(str)

In [85]:
train['begin_month']=-train['begin_month']  
test['begin_month']=-test['begin_month']    
train = train.sort_values(by=['begin_month'], axis=0).drop_duplicates(['ID'], keep='first')

## **1. 결측치 제거** 

In [86]:
train.isnull().sum()

index               0
gender              0
car                 0
reality             0
child_num           0
income_total        0
income_type         0
edu_type            0
family_type         0
house_type          0
DAYS_BIRTH          0
DAYS_EMPLOYED       0
FLAG_MOBIL          0
work_phone          0
phone               0
email               0
occyp_type       2005
family_size         0
begin_month         0
credit              0
ID                  0
dtype: int64

In [87]:
train=train.drop('occyp_type', axis=1)
test=test.drop('occyp_type', axis=1)

### FLAG_MOBIL drop(항상 1)

In [88]:
train=train.drop('FLAG_MOBIL', axis=1)
test=test.drop('FLAG_MOBIL', axis=1)

# EDA

In [89]:
obj_feats,numeric_feats, numeric_continuous, numeric_discrete = type_check(train)

 obj_feats: Index(['gender', 'car', 'reality', 'income_type', 'edu_type', 'family_type',
       'house_type', 'ID'],
      dtype='object')
 numeric_continuous : ['DAYS_BIRTH' 'DAYS_EMPLOYED' 'income_total' 'index']
 numeric_discrete: ['child_num', 'work_phone', 'phone', 'email', 'family_size', 'begin_month', 'credit']


In [90]:
sns.heatmap(train[numeric_feats].corr(), fmt = ".2f", annot= True)
# plt.figure(figsize=(20,20))
plt.show()

KeyboardInterrupt: 

다중공선성 우려
* family size, child_num
* days_birth, days_employed

In [91]:
train["parents_num"] = train['family_size']- train['child_num']
train.drop('family_size', axis=1, inplace = True)

test["parents_num"] = test['family_size']- test['child_num']
test.drop('family_size', axis=1, inplace = True)

## **2. Binary variables**

In [92]:
train['gender'] = train['gender'].replace(['F','M'],[0,1])
test['gender'] = test['gender'].replace(['F','M'],[0,1])
print('gender :')
print(train['gender'].value_counts())
print('--------------')

print('Having a car or not : ')
train['car'] = train['car'].replace(['N','Y'],[0,1])
test['car'] = test['car'].replace(['N','Y'],[0,1])
print(train['car'].value_counts())
print('--------------')

print('Having house reality or not: ')
train['reality'] = train['reality'].replace(['N','Y'],[0,1])
test['reality'] = test['reality'].replace(['N','Y'],[0,1])
print(train['reality'].value_counts())
print('--------------')
      
print('Having a phone or not: ')
print(train['phone'].value_counts())
print('--------------')
      

print('Having a email or not: ')
print(train['email'].value_counts())
print('--------------')
      

print('Having a work phone or not: ')
print(train['work_phone'].value_counts())
print('--------------')

gender :
0    4873
1    2730
Name: gender, dtype: int64
--------------
Having a car or not : 
0    4659
1    2944
Name: car, dtype: int64
--------------
Having house reality or not: 
1    4917
0    2686
Name: reality, dtype: int64
--------------
Having a phone or not: 
0    5261
1    2342
Name: phone, dtype: int64
--------------
Having a email or not: 
0    6832
1     771
Name: email, dtype: int64
--------------
Having a work phone or not: 
0    5751
1    1852
Name: work_phone, dtype: int64
--------------


## 3. Categorical Features

### (1) child_num & parents_num

In [93]:
train['child_num'].value_counts(sort=True)
# 19       1
# 7        1
# 14       1  이상치

0     5109
1     1618
2      739
3      115
4       16
5        3
19       1
7        1
14       1
Name: child_num, dtype: int64

In [94]:
# train.loc[train['child_num'] >= 3,'child_num']=3
# test.loc[test['child_num']>= 3, 'child_num']=3

In [95]:
train.parents_num.value_counts()
#-1.0       1 이상치 제거하기

 2.0    5681
 1.0    1916
 0.0       5
-1.0       1
Name: parents_num, dtype: int64

### **(2) Type**

In [96]:
print(train['income_type'].unique())
print(train['edu_type'].unique())
print(train['family_type'].unique())
print(train['house_type'].unique())

['State servant' 'Pensioner' 'Working' 'Commercial associate' 'Student']
['Secondary / secondary special' 'Higher education' 'Incomplete higher'
 'Lower secondary' 'Academic degree']
['Married' 'Widow' 'Civil marriage' 'Single / not married' 'Separated']
['House / apartment' 'With parents' 'Municipal apartment'
 'Rented apartment' 'Co-op apartment' 'Office apartment']


In [64]:
print(train['income_type'].value_counts(sort=False))
print(train['income_type'].value_counts(normalize=True,sort=False))
#Student                     7 

print(train['edu_type'].value_counts(sort=False))
print(train['edu_type'].value_counts(normalize=True,sort=False))
#Academic degree                     23

print(train['family_type'].value_counts(sort=False))
print(train['family_type'].value_counts(normalize=True,sort=False))

print(train['house_type'].value_counts(sort=False))
print(train['house_type'].value_counts(normalize=True,sort=False))


State servant            648
Working                 4061
Pensioner                935
Student                    3
Commercial associate    1956
Name: income_type, dtype: int64
State servant           0.085230
Working                 0.534131
Pensioner               0.122978
Student                 0.000395
Commercial associate    0.257267
Name: income_type, dtype: float64
Incomplete higher                 333
Lower secondary                    96
Secondary / secondary special    5013
Higher education                 2155
Academic degree                     6
Name: edu_type, dtype: int64
Incomplete higher                0.043799
Lower secondary                  0.012627
Secondary / secondary special    0.659345
Higher education                 0.283441
Academic degree                  0.000789
Name: edu_type, dtype: float64
Civil marriage           707
Married                 4962
Separated                492
Widow                    291
Single / not married    1151
Name: family_type, 

In [66]:
train["credit"][train["income_type"] =="Student"].value_counts()

train["credit"][train["edu_type"] =="Academic degree"].value_counts()

2.0    2
1.0    1
Name: credit, dtype: int64

In [98]:
label_encoder=LabelEncoder()
train['income_type']=label_encoder.fit_transform(train['income_type'])
test['income_type']=label_encoder.transform(test['income_type'])
########################################################################
train['edu_type']=label_encoder.fit_transform(train['edu_type'])
test['edu_type']=label_encoder.transform(test['edu_type'])
########################################################################
train['family_type']=label_encoder.fit_transform(train['family_type'])
test['family_type']=label_encoder.transform(test['family_type'])
########################################################################
train['house_type']=label_encoder.fit_transform(train['house_type'])
test['house_type']=label_encoder.transform(test['house_type'])
########################################################################
train['income_total']=label_encoder.fit_transform(train['income_total'])
test['income_total']=label_encoder.fit_transform(test['income_total'])

## **4. continuous variable** --> scale

In [99]:
numeric_continuous[0:3]

array(['DAYS_BIRTH', 'DAYS_EMPLOYED', 'income_total'], dtype=object)

### Minus continuous variable

In [100]:
train["DAYS_EMPLOYED"][train["DAYS_EMPLOYED"] >0] = train["DAYS_EMPLOYED"][train["DAYS_EMPLOYED"] >0].replace(365243, 0)
train["DAYS_EMPLOYED"][train["DAYS_EMPLOYED"] >0]

test["DAYS_EMPLOYED"][test["DAYS_EMPLOYED"] >0] = test["DAYS_EMPLOYED"][test["DAYS_EMPLOYED"] >0].replace(365243, 0)
test["DAYS_EMPLOYED"][test["DAYS_EMPLOYED"] >0]

Series([], Name: DAYS_EMPLOYED, dtype: int64)

In [101]:
train["DAYS_BIRTH"]=-train["DAYS_BIRTH"]
test["DAYS_BIRTH"]=-test["DAYS_BIRTH"]
train["DAYS_EMPLOYED"]=-train["DAYS_EMPLOYED"]
test["DAYS_EMPLOYED"]=-test["DAYS_EMPLOYED"]

In [102]:
scaler = StandardScaler()
train[numeric_continuous[0:3]] = scaler.fit_transform(train[numeric_continuous[0:3]])
test[numeric_continuous[0:3]] = scaler.transform(test[numeric_continuous[0:3]])

In [103]:
train

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,begin_month,credit,ID,parents_num
20098,20098,0,0,1,1,-0.995996,2,4,1,1,-0.158541,0.173898,0,0,0,-0.0,0.0,1_112500.0_0_0_0_3.0_F_N_Y_State servant_Secon...,2.0
19861,19861,0,0,1,0,-0.476216,1,4,4,1,1.870918,-0.946245,0,1,0,-0.0,1.0,0_135000.0_0_1_0_1.0_F_N_Y_Pensioner_Secondary...,1.0
23760,23760,1,1,0,0,0.523361,4,4,1,1,-1.307328,0.009690,0,0,0,-0.0,0.0,0_202500.0_0_0_0_2.0_M_Y_N_Working_Secondary /...,2.0
9553,9553,1,0,0,0,0.283463,0,1,0,1,0.936917,-0.841514,0,0,1,-0.0,0.0,0_180000.0_0_0_1_2.0_M_N_N_Commercial associat...,2.0
16233,16233,0,0,1,0,-1.295869,1,4,1,1,1.269859,-0.946245,0,0,0,-0.0,1.0,0_90000.0_0_0_0_2.0_F_N_Y_Pensioner_Secondary ...,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18174,18174,0,0,0,1,0.743268,4,4,2,1,-0.984904,0.564805,0,0,0,60.0,2.0,1_225000.0_0_0_0_2.0_F_N_N_Working_Secondary /...,1.0
24612,24612,0,0,1,3,0.283463,2,1,1,1,-0.956038,-0.321741,1,0,0,60.0,2.0,3_180000.0_1_0_0_5.0_F_N_Y_State servant_Highe...,2.0
15463,15463,0,0,0,0,-0.995996,4,1,1,5,-1.195042,0.649279,1,1,0,60.0,2.0,0_112500.0_1_1_0_2.0_F_N_N_Working_Higher educ...,2.0
18573,18573,0,1,1,0,1.043141,0,1,1,1,-1.058049,-0.203218,1,0,0,60.0,2.0,0_256500.0_1_0_0_2.0_F_Y_Y_Commercial associat...,2.0


In [104]:
test

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,work_phone,phone,email,begin_month,parents_num
0,26457,1,1,0,0,-1.155928,1,4,0,1,1.579073,-0.946245,0,1,0,60.0,2.0
1,26458,0,0,1,0,-0.736106,2,1,1,1,0.838820,2.790867,0,1,0,36.0,2.0
2,26459,0,0,1,0,-1.695699,4,4,1,1,0.086090,-0.852720,1,1,0,40.0,2.0
3,26460,1,1,0,0,-1.155928,0,4,1,1,0.913677,0.144590,1,0,0,41.0,2.0
4,26461,0,1,1,0,0.223488,2,1,1,1,0.559451,3.098594,1,0,0,8.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,36452,0,1,1,0,0.043564,4,2,1,1,0.748062,1.395754,1,1,0,19.0,2.0
9996,36453,1,1,1,0,0.043564,4,4,0,1,-1.137309,-0.379493,1,0,0,34.0,2.0
9997,36454,0,0,1,0,0.663302,4,4,1,1,1.340802,5.095370,0,0,0,55.0,2.0
9998,36455,0,1,0,0,-0.136359,0,4,1,1,0.246079,-0.478621,0,1,0,33.0,2.0


In [110]:
train.drop("ID", axis=1, inplace= True)
train.drop("index", axis=1, inplace= True)

In [133]:
test.drop("index", axis =1, inplace= True)

In [134]:
print(train.shape, test.shape)

(7603, 17) (10000, 16)


# **[이상치 처리]**

In [138]:
train["child_num"][train["child_num"]>5] = np.nan
train = train.dropna()

# **[데이터 모델링]**

In [139]:
train_x=train.drop('credit', axis=1)
train_y=train[['credit']]
test_x=test

In [140]:
print(train_x.shape, train_y.shape, test.shape)

(7600, 16) (7600, 1) (10000, 16)


In [148]:
n_est = 1000
seed = 42
n_fold = 5
n_class = 3

lgb_params = {
    'metric': 'multi_logloss',
    'n_estimators': n_est,
    'objective': 'multiclass',
    'random_state': seed,
    'learning_rate': 0.01,
    'min_child_samples': 20,
    'reg_alpha': 3e-5,
    'reg_lambda': 9e-2,
    'num_leaves': 64,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'num_class': n_class
}

In [150]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss
from sklearn.metrics import f1_score
from tensorflow.keras.utils import to_categorical

X_train, X_val, y_train, y_val = train_test_split(train_x, train_y, 
                                                    stratify=train_y, test_size=0.25,
                                                    random_state = 10086)

print("Train set: ")
print(X_train.shape)
print(y_train.shape)
print("===========")
print("Validation set: ")
print(X_val.shape)
print(y_val.shape)


clf=LGBMClassifier(num_leaves=31,
                       max_depth=8, 
                       learning_rate=0.02,
                       n_estimators=250,
                       subsample = 0.8,
                       colsample_bytree =0.8)
clf.fit(X_train, y_train)
y_pred=clf.predict_proba(X_val)

print(f"log_loss: {log_loss(to_categorical(y_val['credit']), y_pred)}")

Train set: 
(5700, 16)
(5700, 1)
Validation set: 
(1900, 16)
(1900, 1)
log_loss: 0.7814281163838245


In [151]:
from sklearn.model_selection import KFold, StratifiedKFold

def run_kfold(clf):
    folds=StratifiedKFold(n_splits=5, shuffle=True, random_state=55)
    outcomes=[]
    sub=np.zeros((test_x.shape[0], 3))  
    for n_fold, (train_index, val_index) in enumerate(folds.split(train_x, train_y)):
        X_train, X_val = train_x.iloc[train_index], train_x.iloc[val_index]
        y_train, y_val = train_y.iloc[train_index], train_y.iloc[val_index]
        clf.fit(X_train, y_train)
        
        predictions=clf.predict_proba(X_val)
        
        logloss=log_loss(to_categorical(y_val['credit']), predictions)
        outcomes.append(logloss)
        print(f"FOLD {n_fold} : logloss:{logloss}")
        
        sub+=clf.predict_proba(test_x)
        
        
    mean_outcome=np.mean(outcomes)
    
    print("Mean:{}".format(mean_outcome))
    return sub/folds.n_splits

my_submission = run_kfold(clf)

# lgbm Mean:0.7748655893306473 --> 리더보드 : 0.82003

FOLD 0 : logloss:0.775327422755567
FOLD 1 : logloss:0.770019961107522
FOLD 2 : logloss:0.7687697820249774
FOLD 3 : logloss:0.7809407500901946
FOLD 4 : logloss:0.7792700306749755
Mean:0.7748655893306473


In [152]:
my_submission

array([[0.13202786, 0.11613985, 0.75183229],
       [0.09273268, 0.14225272, 0.7650146 ],
       [0.07588333, 0.13529749, 0.78881918],
       ...,
       [0.08711766, 0.08963543, 0.82324691],
       [0.10761173, 0.10801938, 0.78436889],
       [0.08343143, 0.1631811 , 0.75338748]])

In [153]:
submission

Unnamed: 0,index,0,1,2
0,26457,0,0,0
1,26458,0,0,0
2,26459,0,0,0
3,26460,0,0,0
4,26461,0,0,0
...,...,...,...,...
9995,36452,0,0,0
9996,36453,0,0,0
9997,36454,0,0,0
9998,36455,0,0,0


In [154]:
submission.loc[:,1:]=my_submission

In [155]:
submission

Unnamed: 0,index,0,1,2
0,26457,0.132028,0.116140,0.751832
1,26458,0.092733,0.142253,0.765015
2,26459,0.075883,0.135297,0.788819
3,26460,0.147014,0.130996,0.721991
4,26461,0.139326,0.166725,0.693949
...,...,...,...,...
9995,36452,0.139070,0.212870,0.648060
9996,36453,0.124683,0.153097,0.722220
9997,36454,0.087118,0.089635,0.823247
9998,36455,0.107612,0.108019,0.784369


In [224]:
os.chdir('../baseline')

In [156]:
submission.to_csv('submission0428_2.csv', index=False)