<a href="https://colab.research.google.com/github/SeryeongLee/-/blob/main/0531_credit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os
import re

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor

from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.utils import shuffle
from scipy.special import erfc
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, mean_squared_error, r2_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, VotingRegressor

from lightgbm import LGBMRegressor

import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor

import warnings

In [None]:
train = pd.read_csv('/content/drive/MyDrive/data science/first/data/train.csv')
test = pd.read_csv('/content/drive/MyDrive/data science/first/data/test.csv')

In [None]:
train_original = train.copy()
test_original = test.copy()

## 기본 변환

#### 1) 0, 1 변환

In [None]:
# binary transform (0, 1)

train.gender = train.gender.replace(['F','M'],[0,1])  
test.gender = test.gender.replace(['F','M'],[0,1])

train.car = train.car.replace(['N','Y'],[0,1])  
test.car = test.car.replace(['N','Y'],[0,1])

train.reality = train.reality.replace(['N','Y'],[0,1])  
test.reality = test.reality.replace(['N','Y'],[0,1])

#### 2) 수 변환

In [None]:
# 음수 -> 양수 변환

train.DAYS_BIRTH = -train.DAYS_BIRTH
test.DAYS_BIRTH = -test.DAYS_BIRTH

train.DAYS_EMPLOYED = -train.DAYS_EMPLOYED
test.DAYS_EMPLOYED = -test.DAYS_EMPLOYED

train.begin_month = -train.begin_month
test.begin_month = -test.begin_month

In [None]:
# 365243 인 값 0으로 바꾸기

train.DAYS_EMPLOYED = train.DAYS_EMPLOYED.replace([-365243],[0])
test.DAYS_EMPLOYED = test.DAYS_EMPLOYED.replace([-365243],[0])

In [None]:
# 360으로 나눠서 나이, 일한 년수 계산

train['EMPLOYED']= train.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
train['age']= train.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
train['begin_month']= train.begin_month.apply(lambda x: abs(x) ).astype(int)

test['EMPLOYED']= test.DAYS_EMPLOYED.apply(lambda x: abs(x/360) )
test['age']= test.DAYS_BIRTH.apply(lambda x: abs(x/360) ).astype(int)
test['begin_month']= test.begin_month.apply(lambda x: abs(x) ).astype(int)

#### 3) 로그변환

In [None]:
tr_it=train['income_total']
tr_it_log = np.log1p(tr_it)
train['income_total']= tr_it_log
te_it=test['income_total']
te_it_log = np.log1p(te_it)
test['income_total']= te_it_log

In [None]:
train['EMPLOYED']= np.log1p(train['EMPLOYED'])
test['EMPLOYED']= np.log1p(test['EMPLOYED'])

#### 4) child_num, family_size 관련 컬럼 추가

In [None]:
# child_num이 14, 19인 행 지우기

idx_child_drop = train[(train['child_num'] == 14)|(train['child_num'] == 19)].index
train = train.drop(idx_child_drop)

In [None]:
# child_num이 family_size보다 큰 행도 지우기

idx_child_drop2 = train[train['family_size'] < train['child_num']].index
train = train.drop(idx_child_drop2)

In [None]:
train['cf_ratio'] = train['child_num'] / train['family_size']
test['cf_ratio'] = test['child_num'] / test['family_size']

# train['if_ratio'] = train['income_total'] / train['family_size']
# test['if_ratio'] = test['income_total'] / test['family_size']

#### 5) 결측치 채우기 - unemployed 만들고 나머지는 일단 Laborers으로 채우고 진행함.

In [None]:
def unemployed(data):
  data.loc[(data['DAYS_EMPLOYED'] == 0), 'occyp_type'] = 'unemployed'

unemployed(train)
unemployed(test)

In [None]:
train['occyp_type'].unique()

array([nan, 'Laborers', 'Managers', 'Sales staff',
       'High skill tech staff', 'Core staff', 'Drivers', 'Medicine staff',
       'Accountants', 'unemployed', 'Realty agents', 'Security staff',
       'Cleaning staff', 'Private service staff', 'Cooking staff',
       'Secretaries', 'HR staff', 'IT staff', 'Low-skill Laborers',
       'Waiters/barmen staff'], dtype=object)

In [None]:
# 나머지는 일단 datawig 쓰면 시간이 너무 오래걸려서 Laborers로 채우겠다
train = train.fillna('Laborers')

In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26452 entries, 0 to 26456
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26452 non-null  int64  
 1   gender         26452 non-null  int64  
 2   car            26452 non-null  int64  
 3   reality        26452 non-null  int64  
 4   child_num      26452 non-null  int64  
 5   income_total   26452 non-null  float64
 6   income_type    26452 non-null  object 
 7   edu_type       26452 non-null  object 
 8   family_type    26452 non-null  object 
 9   house_type     26452 non-null  object 
 10  DAYS_BIRTH     26452 non-null  int64  
 11  DAYS_EMPLOYED  26452 non-null  int64  
 12  FLAG_MOBIL     26452 non-null  int64  
 13  work_phone     26452 non-null  int64  
 14  phone          26452 non-null  int64  
 15  email          26452 non-null  int64  
 16  occyp_type     26452 non-null  object 
 17  family_size    26452 non-null  float64
 18  begin_

In [None]:
# test도
test = test.fillna('Laborers')

In [None]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit,EMPLOYED,age,cf_ratio
0,0,0,0,0,0,12.2185,Commercial associate,Higher education,Married,Municipal apartment,13899,4709,1,0,0,0,Laborers,2.0,6,1.0,2.644795,38,0.0
1,1,0,0,1,1,12.41917,Commercial associate,Secondary / secondary special,Civil marriage,House / apartment,11380,1540,1,0,0,1,Laborers,3.0,5,1.0,1.663505,31,0.333333
2,2,1,1,1,0,13.017005,Working,Higher education,Married,House / apartment,19087,4434,1,0,1,0,Managers,2.0,22,2.0,2.589016,53,0.0
3,3,0,0,1,0,12.2185,Commercial associate,Secondary / secondary special,Married,House / apartment,15088,2092,1,0,1,0,Sales staff,2.0,37,0.0,1.918555,41,0.0
4,4,0,1,1,0,11.967187,State servant,Higher education,Married,House / apartment,15037,2105,1,0,0,0,Managers,2.0,26,2.0,1.923843,41,0.0


## child_num, family_size PCA 해보기

위에서 child_num, family_size의 비율을 고려해서 새로운 변수 만들었었는데(cf_ratio) 이거랑 pca 쓴 변수랑 뭐가 더 성능 좋을지 확인해보려 한다.

In [None]:
train_pca = train[['child_num', 'family_size']]
train_pca_target = train['credit']

In [None]:
train_pca = StandardScaler().fit_transform(train_pca)

In [None]:
from sklearn.decomposition import PCA
pca = PCA(n_components=1) # 어차피 변수 2개라서 하나로만 진행
printcipalComponents = pca.fit_transform(train_pca)
principalDf = pd.DataFrame(data=printcipalComponents, columns = ['principal component'])

In [None]:
principalDf.head()

Unnamed: 0,principal component
0,-0.569295
1,1.192758
2,-0.569295
3,-0.569295
4,-0.569295


In [None]:
principalDf.shape # 여기는 26452개 맞는데,,,

(26452, 1)

In [None]:
pca.explained_variance_ratio_ # 전체 분산의 약 94% 설명

array([0.94343342])

In [None]:
train['cf_pca'] = principalDf # 기존 데이터 셋에 넣어주기

In [None]:
train.info() # 근데 넣으면 26447개 된다...???

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26452 entries, 0 to 26456
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26452 non-null  int64  
 1   gender         26452 non-null  int64  
 2   car            26452 non-null  int64  
 3   reality        26452 non-null  int64  
 4   child_num      26452 non-null  int64  
 5   income_total   26452 non-null  float64
 6   income_type    26452 non-null  object 
 7   edu_type       26452 non-null  object 
 8   family_type    26452 non-null  object 
 9   house_type     26452 non-null  object 
 10  DAYS_BIRTH     26452 non-null  int64  
 11  DAYS_EMPLOYED  26452 non-null  int64  
 12  FLAG_MOBIL     26452 non-null  int64  
 13  work_phone     26452 non-null  int64  
 14  phone          26452 non-null  int64  
 15  email          26452 non-null  int64  
 16  occyp_type     26452 non-null  object 
 17  family_size    26452 non-null  float64
 18  begin_

In [None]:
train_or = train.copy()

## 일단 라벨링..

원래는 임베딩이랑 원핫인코딩 얘기했었는데 뭔가 잘 안돼서 일단 라벨링으로 하고 성능 확인해보겠다

In [None]:
label_encoder = preprocessing.LabelEncoder()

In [None]:
train['income_type'] = label_encoder.fit_transform(train['income_type'])
test['income_type'] = label_encoder.transform(test['income_type'])

train['edu_type'] = label_encoder.fit_transform(train['edu_type'])
test['edu_type'] = label_encoder.transform(test['edu_type'])

train['family_type'] = label_encoder.fit_transform(train['family_type'])
test['family_type'] = label_encoder.transform(test['family_type'])

train['house_type'] = label_encoder.fit_transform(train['house_type'])
test['house_type'] = label_encoder.transform(test['house_type'])

train['occyp_type'] = label_encoder.fit_transform(train['occyp_type'])
test['occyp_type'] = label_encoder.transform(test['occyp_type'])

In [None]:
train.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit,EMPLOYED,age,cf_ratio,cf_pca
0,0,0,0,0,0,12.2185,0,1,1,2,13899,4709,1,0,0,0,8,2.0,6,1.0,2.644795,38,0.0,-0.569295
1,1,0,0,1,1,12.41917,0,4,0,1,11380,1540,1,0,0,1,8,3.0,5,1.0,1.663505,31,0.333333,1.192758
2,2,1,1,1,0,13.017005,4,1,1,1,19087,4434,1,0,1,0,10,2.0,22,2.0,2.589016,53,0.0,-0.569295
3,3,0,0,1,0,12.2185,0,4,1,1,15088,2092,1,0,1,0,14,2.0,37,0.0,1.918555,41,0.0,-0.569295
4,4,0,1,1,0,11.967187,2,1,1,1,15037,2105,1,0,0,0,10,2.0,26,2.0,1.923843,41,0.0,-0.569295


In [None]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26452 entries, 0 to 26456
Data columns (total 24 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   index          26452 non-null  int64  
 1   gender         26452 non-null  int64  
 2   car            26452 non-null  int64  
 3   reality        26452 non-null  int64  
 4   child_num      26452 non-null  int64  
 5   income_total   26452 non-null  float64
 6   income_type    26452 non-null  int64  
 7   edu_type       26452 non-null  int64  
 8   family_type    26452 non-null  int64  
 9   house_type     26452 non-null  int64  
 10  DAYS_BIRTH     26452 non-null  int64  
 11  DAYS_EMPLOYED  26452 non-null  int64  
 12  FLAG_MOBIL     26452 non-null  int64  
 13  work_phone     26452 non-null  int64  
 14  phone          26452 non-null  int64  
 15  email          26452 non-null  int64  
 16  occyp_type     26452 non-null  int64  
 17  family_size    26452 non-null  float64
 18  begin_

In [None]:
# PCA에 결측치가 5개 있다 왜지?
# 왜인지 모르겠어서 일단 뒤에서 성능 확인할 때는 드랍하고 해주겠다ㅜㅜ

In [None]:
train_dup = train.copy()

In [None]:
train = train.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'child_num', 'family_size'], axis = 1)

In [None]:
train.head()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,work_phone,phone,email,occyp_type,begin_month,credit,EMPLOYED,age,cf_ratio,cf_pca
0,0,0,0,12.2185,0,1,1,2,0,0,0,8,6,1.0,2.644795,38,0.0,-0.569295
1,0,0,1,12.41917,0,4,0,1,0,0,1,8,5,1.0,1.663505,31,0.333333,1.192758
2,1,1,1,13.017005,4,1,1,1,0,1,0,10,22,2.0,2.589016,53,0.0,-0.569295
3,0,0,1,12.2185,0,4,1,1,0,1,0,14,37,0.0,1.918555,41,0.0,-0.569295
4,0,1,1,11.967187,2,1,1,1,0,0,0,10,26,2.0,1.923843,41,0.0,-0.569295


In [None]:
train1 = train.drop('cf_pca', axis = 1) # ratio 성능 확인용
train2 = train.drop('cf_ratio', axis = 1)# pca 성능 확인용

## PCA 성능 확인 --> ratio보다 pca가 성능 좋음

### cf_ratio로 확인

In [None]:
train1_x = train1.drop("credit", axis = 1)
train1_y = train1['credit']

In [None]:
clf = RandomForestClassifier()

X_train1, X_val1, y_train1, y_val1 = train_test_split(train1_x, train1_y,
                 stratify = train1_y, 
                 test_size = 0.25,
                 random_state = 10086
                 )

clf.fit(X_train1, y_train1)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
y_proba1 = clf.predict_proba(X_val1)
y_val_onehot1 = pd.get_dummies(y_val1)

In [None]:
y_proba1

array([[0.19, 0.43, 0.38],
       [0.31, 0.06, 0.63],
       [0.54, 0.11, 0.35],
       ...,
       [0.01, 0.03, 0.96],
       [0.16, 0.06, 0.78],
       [0.09, 0.19, 0.72]])

In [None]:
y_val_onehot1

Unnamed: 0,0.0,1.0,2.0
16210,0,1,0
22921,0,0,1
7039,0,0,1
7888,0,0,1
5909,0,0,1
...,...,...,...
3601,0,0,1
23310,0,0,1
17227,0,0,1
25175,0,1,0


In [None]:
from sklearn.metrics import log_loss
log_loss(y_val_onehot1, y_proba1)

0.9159015275014014

### cf_pca로 확인

In [None]:
train2.shape

(26452, 17)

In [None]:
train2 = train2.dropna(axis = 0)

In [None]:
train2.shape # 5개 행 제거 됨

(26447, 17)

In [None]:
train2_x = train2.drop("credit", axis = 1)
train2_y = train2['credit']

X_train2, X_val2, y_train2, y_val2 = train_test_split(train2_x, train2_y,
                 stratify = train2_y, 
                 test_size = 0.25,
                 random_state = 10086
                 )

In [None]:
clf.fit(X_train2, y_train2)

y_proba2 = clf.predict_proba(X_val2)
y_val_onehot2 = pd.get_dummies(y_val2)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
log_loss(y_val_onehot2, y_proba2) # 성능 좋아졌다.

0.8031040350148698

저번에 고려했던 income_total/family_size 변수를 넣고 돌릴 때 보다 빼고 할 때 성능이 더 좋게 나온다... 어떤 팀은 넣고 했던데 조합을 다른거랑 해보거나 해야할..듯?

### child_num과 family_size는 pca로 바꿔서 사용하는게 성능에 더 좋을 것 같다. 

다만 위에서 5개 nan 값이 생겨서 일단 지우고 했는데, 이거는.... 왜인지 모르겠다.

## 직업 변수 묶어보기--> 변화 X

In [None]:
train_or['income_type'] = label_encoder.fit_transform(train_or['income_type'])

train_or['edu_type'] = label_encoder.fit_transform(train_or['edu_type'])

train_or['family_type'] = label_encoder.fit_transform(train_or['family_type'])

train_or['house_type'] = label_encoder.fit_transform(train_or['house_type'])

In [None]:
def occpy(data):
    data.loc[(data['occyp_type'] == 'IT staff')|(data['occyp_type'] == 'Low-skill Laborers')|(data['occyp_type'] == 'Private service staff'), 'occyp_type_c'] = 0
    data.loc[(data['occyp_type'] == 'High skill tech staff')|(data['occyp_type'] == 'Secretaries')|(data['occyp_type'] == 'Waiters/barmen staff'), 'occyp_type_c'] = 1
    data.loc[(data['occyp_type'] == 'HR staff')|(data['occyp_type'] == 'Medicine staff')|(data['occyp_type'] == 'Realty agents')|(data['occyp_type'] == 'Security staff')|(data['occyp_type'] == 'Cleaning staff'), 'occyp_type_c'] = 2
    data.loc[data['occyp_type'] == 'Core staff', 'occyp_type_c'] = 3
    data.loc[data['occyp_type'] == 'Drivers', 'occyp_type_c'] = 4
    data.loc[data['occyp_type'] == 'Laborers', 'occyp_type_c'] = 5
    data.loc[data['occyp_type'] == 'Managers', 'occyp_type_c'] = 6
    data.loc[data['occyp_type'] == 'Sales staff', 'occyp_type_c'] = 7
    data.loc[(data['occyp_type'] == 'Accountants')|(data['occyp_type'] == 'Cooking staff'), 'occyp_type_c'] = 8
    data.loc[data['occyp_type'] == 'unemployed', 'occyp_type_c'] = 9


occpy(train_or)


In [None]:
train_or.head()

Unnamed: 0,index,gender,car,reality,child_num,income_total,income_type,edu_type,family_type,house_type,DAYS_BIRTH,DAYS_EMPLOYED,FLAG_MOBIL,work_phone,phone,email,occyp_type,family_size,begin_month,credit,EMPLOYED,age,cf_ratio,cf_pca,try,occyp_type_c
0,0,0,0,0,0,12.2185,0,1,1,2,13899,4709,1,0,0,0,Laborers,2.0,6,1.0,2.644795,38,0.0,-0.569295,0.0,5.0
1,1,0,0,1,1,12.41917,0,4,0,1,11380,1540,1,0,0,1,Laborers,3.0,5,1.0,1.663505,31,0.333333,1.192758,1.0,5.0
2,2,1,1,1,0,13.017005,4,1,1,1,19087,4434,1,0,1,0,Managers,2.0,22,2.0,2.589016,53,0.0,-0.569295,2.0,6.0
3,3,0,0,1,0,12.2185,0,4,1,1,15088,2092,1,0,1,0,Sales staff,2.0,37,0.0,1.918555,41,0.0,-0.569295,2.0,7.0
4,4,0,1,1,0,11.967187,2,1,1,1,15037,2105,1,0,0,0,Managers,2.0,26,2.0,1.923843,41,0.0,-0.569295,0.0,6.0


In [None]:
train_occ = train_or.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'child_num', 'family_size', 'cf_ratio', 'occyp_type'], axis = 1)

In [None]:
train_occ.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26452 entries, 0 to 26456
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   gender        26452 non-null  int64  
 1   car           26452 non-null  int64  
 2   reality       26452 non-null  int64  
 3   income_total  26452 non-null  float64
 4   income_type   26452 non-null  int64  
 5   edu_type      26452 non-null  int64  
 6   family_type   26452 non-null  int64  
 7   house_type    26452 non-null  int64  
 8   work_phone    26452 non-null  int64  
 9   phone         26452 non-null  int64  
 10  email         26452 non-null  int64  
 11  begin_month   26452 non-null  int64  
 12  credit        26452 non-null  float64
 13  EMPLOYED      26452 non-null  float64
 14  age           26452 non-null  int64  
 15  cf_pca        26447 non-null  float64
 16  try           26452 non-null  float64
 17  occyp_type_c  26452 non-null  float64
dtypes: float64(6), int64(12)
m

In [None]:
train_occ = train_occ.dropna(axis = 0)

In [None]:
train_occ_x = train_occ.drop("credit", axis = 1)
train_occ_y = train_occ['credit']

X_train_occ, X_val_occ, y_train_occ, y_val_occ = train_test_split(train_occ_x, train_occ_y,
                 stratify = train_occ_y, 
                 test_size = 0.25,
                 random_state = 10086
                 )

clf.fit(X_train_occ, y_train_occ)

y_proba_occ = clf.predict_proba(X_val_occ)
y_val_onehot_occ = pd.get_dummies(y_val_occ)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
log_loss(y_val_onehot_occ, y_proba_occ) # 성능이 안좋아졌다,,,, 그냥 범주화 하지 말고 쓰는게 나으려나?

0.839025363097569

### 직업 변수를 묶어봤는데, 묶는 방법이 이상했던 건지 아니면 묶는게 안 좋은 시도인지 성능이 오히려 낮아졌다. 그냥 쓰는게 나을수도..

## work_phone, phone, email 묶기 --> 괜찮


세 변수 다 폰 또는 이메일 소유 여부를 나타내어 그냥 묶어봤다

In [None]:
def try_t(data):
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 0)&(data['email'] == 0), 'try'] = 0
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 0)&(data['email'] == 1), 'try'] = 1
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 1)&(data['email'] == 0), 'try'] = 2
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 0)&(data['email'] == 0), 'try'] = 3
  data.loc[(data['work_phone'] == 0)&(data['phone'] == 1)&(data['email'] == 1), 'try'] = 4
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 0)&(data['email'] == 1), 'try'] = 5
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 1)&(data['email'] == 0), 'try'] = 6
  data.loc[(data['work_phone'] == 1)&(data['phone'] == 1)&(data['email'] == 1), 'try'] = 7

try_t(train)

In [None]:
train.head()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,work_phone,phone,email,occyp_type,begin_month,credit,EMPLOYED,age,cf_ratio,cf_pca,try
0,0,0,0,12.2185,0,1,1,2,0,0,0,8,6,1.0,2.644795,38,0.0,-0.569295,0.0
1,0,0,1,12.41917,0,4,0,1,0,0,1,8,5,1.0,1.663505,31,0.333333,1.192758,1.0
2,1,1,1,13.017005,4,1,1,1,0,1,0,10,22,2.0,2.589016,53,0.0,-0.569295,2.0
3,0,0,1,12.2185,0,4,1,1,0,1,0,14,37,0.0,1.918555,41,0.0,-0.569295,2.0
4,0,1,1,11.967187,2,1,1,1,0,0,0,10,26,2.0,1.923843,41,0.0,-0.569295,0.0


In [None]:
train_try = train.drop(['cf_ratio', 'work_phone', 'phone', 'email'], axis = 1)

In [None]:
train_try.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26452 entries, 0 to 26456
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   gender        26452 non-null  int64  
 1   car           26452 non-null  int64  
 2   reality       26452 non-null  int64  
 3   income_total  26452 non-null  float64
 4   income_type   26452 non-null  int64  
 5   edu_type      26452 non-null  int64  
 6   family_type   26452 non-null  int64  
 7   house_type    26452 non-null  int64  
 8   occyp_type    26452 non-null  int64  
 9   begin_month   26452 non-null  int64  
 10  credit        26452 non-null  float64
 11  EMPLOYED      26452 non-null  float64
 12  age           26452 non-null  int64  
 13  cf_pca        26447 non-null  float64
 14  try           26452 non-null  float64
dtypes: float64(5), int64(10)
memory usage: 3.2 MB


In [None]:
train_try = train_try.dropna(axis = 0)

In [None]:
train_try_x = train_try.drop("credit", axis = 1)
train_try_y = train_try['credit']

X_train_try, X_val_try, y_train_try, y_val_try = train_test_split(train_try_x, train_try_y,
                 stratify = train_try_y, 
                 test_size = 0.25,
                 random_state = 10086
                 )

clf.fit(X_train_try, y_train_try)

y_proba_try = clf.predict_proba(X_val_try)
y_val_onehot_try = pd.get_dummies(y_val_try)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
log_loss(y_val_onehot_try, y_proba_try) # 성능이 좋아졌당

0.7970578665030416

### 묶어서 새로운 변수로 만든게 성능이 더 좋아졌다. 이 세 변수는 묶어봐도 좋을듯..?

그리고 세개로 만든 새로운 변수 추가하고 기존 변수들 누락하지 않고 진행하는게 더 성능이 좋을 수도 있을 것 같다.

## 중복데이터 처리

초반에도 중복 데이터가 있는 것 같았는데 결과를 보니 역시 이 중복 데이터를 어떻게 처리하는지가 중요하게 작용되었던 것 같다. 

일단 확인해보자..

In [None]:
train_dup_check = train_original.drop(['index', 'begin_month', 'credit'], axis = 1)

In [None]:
train_dup_check.duplicated().sum() 

17698

고유값인 인덱스와 같은 사람이 여러 장 발급받을 수 있음을 고려해 제외한 begin_month, 타겟 credit을 제외한 변수들의 중복 행이 총 17698개로 절반 이상이였다.

같은 사람을 하나로 묶기 위해서 변수를 만들어보자..

In [None]:
def duplicated(df):
    df['ID'] = \
    df['child_num'].astype(str) + '_' + df['income_total'].astype(str) + '_' +\
    df['DAYS_BIRTH'].astype(str) + '_' + df['DAYS_EMPLOYED'].astype(str) + '_' +\
    df['work_phone'].astype(str) + '_' + df['phone'].astype(str) + '_' +\
    df['email'].astype(str) + '_' + df['family_size'].astype(str) + '_' +\
    df['gender'].astype(str) + '_' + df['car'].astype(str) + '_' +\
    df['reality'].astype(str) + '_' + df['income_type'].astype(str) + '_' +\
    df['edu_type'].astype(str) + '_' + df['family_type'].astype(str) + '_' +\
    df['house_type'].astype(str) + '_' + df['occyp_type'].astype(str)

# 아까 중간 저장해뒀던 train_dup 쓰자
duplicated(train_dup)

KeyError: ignored

In [None]:
train_dup.head()

Unnamed: 0,gender,car,reality,income_total,income_type,edu_type,family_type,house_type,occyp_type,begin_month,credit,EMPLOYED,age,cf_pca,ID,try
0,0,0,0,12.2185,0,1,1,2,8,6,1.0,2.644795,38,-0.569295,4131,0.0
1,0,0,1,12.41917,0,4,0,1,8,5,1.0,1.663505,31,1.192758,7506,1.0
2,1,1,1,13.017005,4,1,1,1,10,22,2.0,2.589016,53,-0.569295,6041,2.0
3,0,0,1,12.2185,0,4,1,1,14,37,0.0,1.918555,41,-0.569295,4163,2.0
4,0,1,1,11.967187,2,1,1,1,10,26,2.0,1.923843,41,-0.569295,2953,0.0


In [None]:
# 위에서 했던 try_t()도 하자
try_t(train_dup)

In [None]:
# 드랍해보자
train_dup = train_dup.drop(['index', 'FLAG_MOBIL', 'DAYS_BIRTH', 'DAYS_EMPLOYED', 'child_num', 'family_size', 'cf_ratio'], axis = 1)

In [None]:
train_dup = train_dup.drop(['work_phone', 'phone', 'email'], axis = 1)

In [None]:
train_dup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26447 entries, 0 to 26451
Data columns (total 16 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   gender        26447 non-null  int64  
 1   car           26447 non-null  int64  
 2   reality       26447 non-null  int64  
 3   income_total  26447 non-null  float64
 4   income_type   26447 non-null  int64  
 5   edu_type      26447 non-null  int64  
 6   family_type   26447 non-null  int64  
 7   house_type    26447 non-null  int64  
 8   occyp_type    26447 non-null  int64  
 9   begin_month   26447 non-null  int64  
 10  credit        26447 non-null  float64
 11  EMPLOYED      26447 non-null  float64
 12  age           26447 non-null  int64  
 13  cf_pca        26447 non-null  float64
 14  ID            26447 non-null  int64  
 15  try           26447 non-null  float64
dtypes: float64(5), int64(11)
memory usage: 3.4 MB


In [None]:
train_dup = train_dup.dropna(axis = 0)

In [None]:
# ID 변수 라벨링
train_dup['ID'] = label_encoder.fit_transform(train_dup['ID'])

In [None]:
train_dup_x = train_dup.drop("credit", axis = 1)
train_dup_y = train_dup['credit']

X_train_dup, X_val_dup, y_train_dup, y_val_dup = train_test_split(train_dup_x, train_dup_y,
                 stratify = train_dup_y, 
                 test_size = 0.25,
                 random_state = 10086
                 )

clf.fit(X_train_dup, y_train_dup)

y_proba_dup = clf.predict_proba(X_val_dup)
y_val_onehot_dup = pd.get_dummies(y_val_dup)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [None]:
log_loss(y_val_onehot_dup, y_proba_dup)

0.8537431630539011