In [None]:
import pandas as pd
import numpy as np 

# Load the loan-result, user-spec and log CSV files.
# NOTE(review): the first two are read from the working directory while the
# log is read from data/ — confirm the paths are intentional.
loan = pd.read_csv("loan_result_clean.csv")
user = pd.read_csv("user_spec_clean.csv")
log = pd.read_csv("data/log_data.csv")

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# errors='ignore' keeps the cell idempotent: re-running it, or loading a CSV
# without that column, no longer raises KeyError.
user = user.drop(columns='Unnamed: 0', errors='ignore')
user.head(5)

In [None]:
# Remove the 'Unnamed: 0' column (presumably an index saved into the CSV).
# errors='ignore' keeps the cell idempotent: re-running it, or loading a CSV
# without that column, no longer raises KeyError.
loan = loan.drop(columns='Unnamed: 0', errors='ignore')
loan.head(5)

In [None]:
# Preview the first five rows of the log data.
log.head(5)

In [None]:
# Show the user rows whose yearly_income is missing.
user.loc[user['yearly_income'].isna()]

In [None]:
# Join the loan results with the user specs on application_id
# (inner join: only applications present in both frames are kept).
df = loan.merge(user, on='application_id', how='inner')

In [None]:
# Reference year used to derive each applicant's age.
df['this_year'] = 2022
df.head(n=3)

In [None]:
# Add age: reference year minus birth year.
df['Age'] = df['this_year'].sub(df['birth_year'])

In [None]:
# Preview the merged frame, now including this_year and Age.
df.head(10)

In [None]:
# Train/test split: rows without an is_applied label form the test set.
# .copy() detaches the result from `df`; later cells add and overwrite the
# Age_level column on `test`, and doing that on a slice triggers pandas'
# SettingWithCopyWarning and may not write where expected.
test = df.loc[df['is_applied'].isnull()].copy()
test
# 3255482 rows (count recorded by the author)

In [None]:
# Rows that do have an is_applied label form the training set.
# notnull() replaces the `isnull() == False` comparison, and .copy() detaches
# the result from `df` because later cells add and overwrite the Age_level
# column on `train` (assigning on a slice triggers SettingWithCopyWarning).
train = df.loc[df['is_applied'].notnull()].copy()
train
# 10264386 rows (count recorded by the author)

In [None]:
# Separate applicants (is_applied == 1) from non-applicants (is_applied == 0)
# so their characteristics can be compared.
loan_true = train.loc[train['is_applied'].eq(1)]
loan_false = train.loc[train['is_applied'].eq(0)]

In [None]:
# Display the training rows with is_applied == 1.
loan_true

In [None]:
# Display the training rows with is_applied == 0.
loan_false

In [None]:
# Join user_spec with train
#loan_user = pd.merge(train, user, left_on='application_id', right_on='application_id', how='inner')

In [None]:
#loan_user.head(10)

In [None]:
# Count rows per is_applied value (value_counts skips missing labels by default).
df['is_applied'].value_counts()

### 예측 모델

In [None]:
!pip install catboost

In [None]:
!pip install eli5

In [None]:
import warnings
# Suppresses every warning from here on, including pandas' SettingWithCopyWarning.
# NOTE(review): consider filtering only the specific categories that are noisy.
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Render floats with two decimals in pandas output.
pd.options.display.float_format = '{:.2f}'.format
# Korean-capable font for plot text. The macOS font family is registered as
# 'AppleGothic' (no space); 'Apple Gothic' is not found by matplotlib, which
# then falls back to its default font.
plt.rcParams['font.family'] = 'AppleGothic'

In [None]:
# Summary statistics of the labelled (training) rows.
train.describe()

In [None]:
# Summary statistics of the unlabelled (test) rows.
test.describe()

In [None]:
# Print the (rows, columns) shape of train and test, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Print the distinct values of every training column.
for col in train.columns:
    print(f'{col} 고유값 : {train[str(col)].unique()} \n')

In [None]:
import matplotlib as mpl
import seaborn as sns
# Opens a new 15x8-inch figure; nothing is drawn on it in this cell.
plt.figure(figsize=(15,8))

In [None]:
# Age distribution, sorted by age (the original comment said "income range",
# but this cell inspects Age ahead of the age-level binning).
train['Age'].value_counts().sort_index()

In [None]:
# Bin Age into ten-year, left-closed intervals: [10, 20), [20, 30), ..., [80, 90).
# Ages outside [10, 90) fall in no bin and become NaN.
bins = list(range(10, 91, 10))
bins_label = [f"{x}이상 {x + 10}미만" for x in bins]
# Nine edges define eight intervals, so the last label is unused.
age_labels = bins_label[:-1]
for frame in (train, test):
    frame["Age_level"] = pd.cut(frame["Age"], bins, right=False, labels=age_labels)

In [None]:
# Row counts per age level in the training set, in bin order.
train['Age_level'].value_counts().sort_index()

In [None]:
# Row counts per age level in the test set, in bin order.
test['Age_level'].value_counts().sort_index()

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Encode Age_level on the test set, using an encoder fitted on the training
# labels so both sets share one label-to-integer mapping.
encoder = LabelEncoder().fit(train['Age_level'])
test['Age_level'] = encoder.transform(test['Age_level'])

In [None]:
# Re-fit the encoder on the training labels and replace them with integer codes.
encoder = LabelEncoder().fit(train['Age_level'])
train['Age_level'] = encoder.transform(train['Age_level'])

In [None]:
# Class proportions of the label; the author notes the labels are imbalanced.
train['is_applied'].value_counts(normalize = True)

In [None]:
# Unlabelled loan rows (is_applied missing), taken from the loan table alone.
test2 = loan[loan['is_applied'].isna()]
test2

In [None]:
# Labelled loan rows (is_applied present), taken from the loan table alone.
train2 = loan[loan['is_applied'].notna()]
train2
# 10264386 rows (count recorded by the author)

In [None]:
# Join the labelled loan rows with the user specs on application_id.
loan_user = train2.merge(user, on='application_id', how='inner')
loan_user.head(n=10)

In [None]:
# Reference year used to derive each applicant's age.
loan_user['this_year'] = 2022
loan_user.head(n=3)

In [None]:
# Age = reference year minus birth year.
loan_user['Age'] = loan_user['this_year'].sub(loan_user['birth_year'])
loan_user.head(n=3)

In [None]:
# DataFrame.drop returns a new frame; the original call discarded the result,
# so this_year was never removed from loan_user. Assign the result back.
# errors='ignore' keeps the cell re-runnable once the column is gone.
loan_user = loan_user.drop(columns=['this_year'], errors='ignore')
loan_user

In [None]:
# List the column names of loan_user.
loan_user.columns

In [None]:
# Columns used as model inputs.
# NOTE(review): this includes identifier and timestamp columns
# (application_id, user_id, *_insert_time) — confirm they are intended features.
feature_columns = [
    'application_id', 'loanapply_insert_time', 'bank_id', 'product_id',
    'loan_limit', 'loan_rate', 'user_id', 'birth_year',
    'gender', 'insert_time', 'credit_score', 'yearly_income', 'income_type',
    'company_enter_month', 'employment_type', 'houseown_type',
    'desired_amount', 'purpose', 'personal_rehabilitation_yn',
    'personal_rehabilitation_complete_yn', 'existing_loan_cnt',
    'existing_loan_amt', 'enter_month', 'year', '재직일수', 'Age',
]
# Feature matrix as a NumPy array, one row per application.
X = np.array(loan_user[feature_columns])

In [None]:
# Display the feature matrix.
X

In [None]:
# Label vector: the is_applied column as a NumPy array.
y = np.array(loan_user.loc[:, 'is_applied'])
y

In [None]:
# Split the data into training and test sets (70:30).
from sklearn.model_selection import train_test_split

# random_state is fixed so the split is reproducible.
# stratify=y keeps the class ratio of is_applied the same in both parts; the
# labels are imbalanced, so an unstratified split can skew the minority share.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

print("X_train 크기:", X_train.shape)
print("y_train 크기:", y_train.shape)
print("X_test 크기:", X_test.shape)
print("y_test 크기:", y_test.shape)

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# The classes are imbalanced, so compute 'balanced' weights from the training
# labels and keep them as a {class label: weight} mapping.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weights = {label: weight for label, weight in zip(classes, weights)}

In [None]:
# Find the best model: set up the PyCaret classification experiment.
# Fixes: the label column of train2 is 'is_applied' (train2 has no '신청여부'
# column), and fold_strategy takes a cross-validation strategy name, not a
# column name; 'stratifiedkfold' preserves the class ratio in every fold.
# NOTE(review): `classification` is not imported in any visible cell —
# presumably `from pycaret import classification`; confirm and add the import.
# NOTE(review): train2 holds loan columns only; confirm whether the merged
# loan_user frame was intended as the training data.
classification_setup = classification.setup(
    data=train2,
    target='is_applied',
    fold_strategy='stratifiedkfold',
)

In [None]:
# Compare candidate models with 5-fold CV and keep the five best by Accuracy,
# with metrics rounded to three decimals.
# NOTE(review): Accuracy can be misleading on imbalanced labels — confirm the metric.
top5_model = classification.compare_models(
    n_select=5,
    sort='Accuracy',
    round=3,
    fold=5,
)