# 대출 위험도 예측 모델
- 대출 대상자 데이터를 기반으로 2년 내에 대출금을 연체할 가능성이 있는지 여부를 예측하는 알고리즘 개발
- 평가지표: roc_auc 점수

## 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np

# Load the credit-scoring CSV and discard its first column ('Unnamed: 0'),
# then display the resulting frame as the cell output.
data = pd.read_csv('cs_data.csv').drop(columns=['Unnamed: 0'])
data

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
0,1,0.766127,45,2,0.802982,9120.0,13,0,6,0,2.0
1,0,0.957151,40,0,0.121876,2600.0,4,0,0,0,1.0
2,0,0.658180,38,1,0.085113,3042.0,2,1,0,0,0.0
3,0,0.233810,30,0,0.036050,3300.0,5,0,0,0,0.0
4,0,0.907239,49,1,0.024926,63588.0,7,0,1,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
149995,0,0.040674,74,0,0.225131,2100.0,4,0,1,0,0.0
149996,0,0.299745,44,0,0.716562,5584.0,4,0,1,0,2.0
149997,0,0.246044,58,0,3870.000000,,18,0,1,0,0.0
149998,0,0.000000,30,0,0.000000,5716.0,4,0,0,0,0.0


In [2]:
data.info()
# No string-typed columns; MonthlyIncome and NumberOfDependents contain null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         120269 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    146076 non-null  float64
dtypes: float64(4), int64(7)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
data.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,120269.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,14384.67,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,3400.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,5400.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,8249.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


## 데이터 전처리
1. MonthlyIncome, NumberOfDependents의 NaN 값을 모두 0으로 대체

In [4]:
# Rows with a missing MonthlyIncome tend to have a very high DebtRatio,
# so the missing income is replaced with 0.
# (NumberOfDependents is filled with its most frequent value, 0, in a later cell.)
#
# Assign the filled column back instead of calling fillna(..., inplace=True)
# on the column selection: the chained in-place form emits a warning on
# pandas 2.x and does not modify `data` at all under Copy-on-Write.
data['MonthlyIncome'] = data['MonthlyIncome'].fillna(0)
data.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,146076.0
mean,0.06684,6.048438,52.295207,0.421033,353.005076,5348.139,8.45276,0.265973,1.01824,0.240387,0.757222
std,0.249746,249.755371,14.771866,4.192781,2037.818523,13152.06,5.145951,4.169304,1.129771,4.155179,1.115086
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,0.0,0.175074,1550.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,0.0,0.366508,4357.5,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,0.0,0.868254,7400.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [5]:
# Frequency of each NumberOfDependents value, used to identify the most common one
data['NumberOfDependents'].value_counts()

0.0     86902
1.0     26316
2.0     19522
3.0      9483
4.0      2862
5.0       746
6.0       158
7.0        51
8.0        24
10.0        5
9.0         5
20.0        1
13.0        1
Name: NumberOfDependents, dtype: int64

In [6]:
# Replace missing NumberOfDependents with 0, the most frequent value.
# Assign the result back rather than using fillna(..., inplace=True) on the
# column selection: the chained in-place form emits a warning on pandas 2.x
# and does not modify `data` at all under Copy-on-Write.
data['NumberOfDependents'] = data['NumberOfDependents'].fillna(0)
# Confirm that no column has null values left.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150000 entries, 0 to 149999
Data columns (total 11 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   SeriousDlqin2yrs                      150000 non-null  int64  
 1   RevolvingUtilizationOfUnsecuredLines  150000 non-null  float64
 2   age                                   150000 non-null  int64  
 3   NumberOfTime30-59DaysPastDueNotWorse  150000 non-null  int64  
 4   DebtRatio                             150000 non-null  float64
 5   MonthlyIncome                         150000 non-null  float64
 6   NumberOfOpenCreditLinesAndLoans       150000 non-null  int64  
 7   NumberOfTimes90DaysLate               150000 non-null  int64  
 8   NumberRealEstateLoansOrLines          150000 non-null  int64  
 9   NumberOfTime60-89DaysPastDueNotWorse  150000 non-null  int64  
 10  NumberOfDependents                    150000 non-null  float64
dtypes: float64(4), int64(7)

## 데이터 분리

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [8]:
# Separate the feature matrix from the target column (SeriousDlqin2yrs).
X = data.drop(columns=['SeriousDlqin2yrs'])
y = data.loc[:, 'SeriousDlqin2yrs']
y.shape, X.shape

((150000,), (150000, 10))

In [9]:
# Two stratified splits, each holding out 25% (the train_test_split default):
#   1) full data        -> train+val (75%) / test (25%)
#   2) train+val subset -> train (75%)     / val  (25%)
# Stratifying on the target keeps the class ratio the same in every partition.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    stratify=y,
    random_state=0,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train,
    test_size=0.25,
    stratify=y_train,
    random_state=0,
)
y_train.shape, y_val.shape, y_test.shape

((84375,), (28125,), (37500,))

## 데이터 스케일링
- StandardScaler, MinMaxScaler 모두 사용해볼 예정

In [10]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [11]:
# Each scaler is fitted on the training partition only and then applied to
# train / validation / test, so no statistics from held-out data are used.

# Standardisation (zero mean, unit variance per feature).
ssc = StandardScaler().fit(X_train)
X_train_ssc, X_val_ssc, X_test_ssc = (
    ssc.transform(part) for part in (X_train, X_val, X_test)
)

# Min-max scaling (each feature mapped onto the training range).
mms = MinMaxScaler().fit(X_train)
X_train_mms, X_val_mms, X_test_mms = (
    mms.transform(part) for part in (X_train, X_val, X_test)
)

## 모델 선택 및 생성
- 0이 많은 데이터이므로 tree 모델은 사용하지 않음
- SVC 사용

In [12]:
from sklearn.svm import SVC

In [13]:
# Base estimator shared by both searches; GridSearchCV works on clones of it.
# NOTE(review): probability=True is kept so predict_proba remains available
# after the search, but it adds an internal cross-validated calibration to
# every fit. The 'roc_auc' scorer can rank with decision_function, so it can
# be dropped if predict_proba is never needed — confirm against later cells.
svc = SVC(random_state=0, probability=True)

# One sub-grid per kernel. The linear kernel ignores gamma, so crossing gamma
# with 'linear' would refit the same model five times for every C value.
# Splitting the grid cuts the search from 50 to 30 candidates without
# removing any distinct configuration.
params = [
    {
        "kernel": ['linear'],
        "C": [0.001, 0.01, 0.1, 1, 10],
    },
    {
        "kernel": ['rbf'],
        "C": [0.001, 0.01, 0.1, 1, 10],
        "gamma": [0.001, 0.01, 0.1, 1, 10],
    },
]

# Identical searches (3-fold CV, ROC AUC, all cores): one for the
# standard-scaled features, one for the min-max-scaled features.
gs_ssc = GridSearchCV(svc, params, scoring='roc_auc', cv=3, n_jobs=-1)
gs_mms = GridSearchCV(svc, params, scoring='roc_auc', cv=3, n_jobs=-1)

## 학습

In [None]:
# Run the grid search on the standard-scaled training set
gs_ssc.fit(X_train_ssc, y_train)