In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm_notebook
from sklearn.metrics import accuracy_score
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from datetime import datetime as dt
import time
import math
from catboost import CatBoostClassifier

In [None]:
loan = pd.read_csv('loan_result_clean.csv')
user = pd.read_csv('user_spec_clean.csv')

In [None]:
user = user.drop('Unnamed: 0', axis = 1)
user.head(5)

In [None]:
loan = loan.drop('Unnamed: 0', axis = 1)
loan.head(5)

In [None]:
# user_spec 과 train 조인
df = pd.merge(loan, user, left_on='application_id', right_on='application_id', how='inner')

In [None]:
# train, test 분리
test = df.loc[df['is_applied'].isnull()]
test
# 3255482 개 

In [None]:
train = df.loc[df['is_applied'].isnull() == False]
train 
#10264386

In [None]:
# 대출 신청자 / 미신청자 특성파악을 위한 분류
loan_true = train.loc[train['is_applied'] == 1]
loan_false = train.loc[train['is_applied'] == 0]

In [None]:
df['is_applied'].value_counts()

### 예측 모델

In [None]:
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
pd.options.display.float_format = '{:.2f}'.format 
plt.rcParams['font.family'] = 'Apple Gothic'

In [None]:
print(train.shape)
print(test.shape)

In [None]:
for col in train.columns:
  cols = str(col)
  print(f'{col} 고유값 : {train[cols].unique()} \n')

In [None]:
#나이 범주화
train['Age'].value_counts().sort_index()

In [None]:
bins = list(range(10, 91, 10))
bins_label = [str(x)+"이상 "+str(x+10)+"미만" for x in bins]
train["Age_level"] = pd.cut(train["Age"], bins, right=False, labels=bins_label[:-1])

test["Age_level"] = pd.cut(test["Age"], bins, right=False, labels=bins_label[:-1])

In [None]:
train['Age_level'].value_counts().sort_index()

In [None]:
test['Age_level'].value_counts().sort_index()

In [None]:
#Age Level Encoding
encoder = LabelEncoder() 
encoder.fit(train['Age_level'])
test['Age_level'] = encoder.transform(test['Age_level'])

In [None]:
encoder = LabelEncoder()
encoder.fit(train['Age_level'])
train['Age_level'] = encoder.transform(train['Age_level'])

In [None]:
# 레이블이 불균형한 데이터임을 확인할 수 있음
train['is_applied'].value_counts(normalize = True)

In [None]:
df.head()

In [None]:
df=df.drop(['age_year','year'],axis=1)

In [None]:
df.columns

In [None]:
X= np.array(df[['application_id', 'loanapply_insert_time', 'bank_id', 'product_id',
       'loan_limit', 'loan_rate', 'is_applied', 'user_id', 'birth_year',
       'gender', 'insert_time', 'credit_score', 'yearly_income', 'income_type',
       'company_enter_month', 'employment_type', 'houseown_type',
       'desired_amount', 'purpose', 'personal_rehabilitation_yn',
       'personal_rehabilitation_complete_yn', 'existing_loan_cnt',
       'existing_loan_amt', 'enter_month', 'work_day',
       'Age']])

In [None]:
X

In [None]:
y=np.array(df['is_applied'])
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.3,random_state=0)

In [None]:
print("X_train 크기:", X_train.shape)
print("y_train 크기:", y_train.shape)
print("X_test 크기:", X_test.shape)
print("y_test 크기:", y_test.shape)

In [None]:
import tensorflow as tf
from keras import layers, models, optimizers, Sequential
from sklearn.metrics import accuracy_score, log_loss, make_scorer
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import OneHotEncoder

In [None]:
#object 타입 label encoding 진행
le = LabelEncoder()
cols = np.array(train_set.columns)
for c in cols:
      if train_set[c].dtype == 'object':
        train_set[c] = le.fit_transform(train_set[c])
        test_set[c] = le.fit_transform(test_set[c])

In [None]:
# add the feature(total energy of acc)
train['acc_t']  = train.apply(lambda x : (x['acc_x']**2 + x['acc_y'] **2 +  x['acc_z'] ** 2 )**(1/3), axis=1)
test['acc_t']  = test.apply(lambda x : (x['acc_x']**2 + x['acc_y'] **2 +  x['acc_z'] ** 2 )**(1/3), axis=1)

### CatBoost 사용

In [30]:
from sklearn.utils.class_weight import compute_class_weight

In [31]:
classes = np.unique(y_train)

In [None]:
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)

In [None]:
class_weights = dict(zip(classes, weights))

In [57]:
from sklearn.utils.class_weight import compute_class_weight

# 불균형한 클래스인 것으로 확인되어 class_weight = 'balanced'로 설정해준다. 
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = dict(zip(classes, weights))

ValueError: classes should include all valid labels that can be in y

In [None]:
cb = CatBoostClassifier(learning_rate= 0.03, max_depth= 10, n_estimators= 1000, 
                    class_weights= class_weights, subsample = 0.8, colsample_bylevel=1.0, random_state=42, verbose =0)

# 성능 지표는 정확도(accuracy) , 교차 검증 세트는 5개 
scores = cross_val_score(cb , X, y, scoring='accuracy',cv=5)

print('교차 검증별 정확도:',np.round(scores, 4))
print('평균 검증 정확도:', np.round(np.mean(scores),4))

In [None]:
plt.figure(figsize=(14, 9))
heat_table = train_set.corr()
mask = np.zeros_like(heat_table)
mask[np.triu_indices_from(mask)]=True
heatmap_ax = sns.heatmap(heat_table, annot=True, mask=mask, cmap='pink', vmin=-1, vmax=1)
heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), fontsize=10, rotation=90)
heatmap_ax.set_yticklabels(heatmap_ax.get_yticklabels(), fontsize=10)
plt.title('correlation between columns', fontsize=20)

In [None]:
#변수별 PermutationImportance 확인
import eli5
from eli5.sklearn import PermutationImportance

cb = CatBoostClassifier(learning_rate= 0.03, max_depth= 10, n_estimators= 1000, 
                    class_weights= class_weights, subsample = 0.8, colsample_bylevel=1.0, random_state=42, verbose =0).fit(X_train, y_train)
perm = PermutationImportance(cb, random_state= 42).fit(X_test, y_test)
eli5.show_weights(perm, feature_names = X_test.columns.tolist())