In [204]:
# 회귀모형 
import numpy as np
import matplotlib.pyplot as plt 
import pandas as pd 
import seaborn as sns 

# 쓸데없는 알림 방지
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import itertools

#통계적 모형
import statsmodels.api as sm # 선형회귀 
from statsmodels.stats.outliers_influence import variance_inflation_factor # 다중공산성 

#머신러닝
from sklearn import datasets
from sklearn import metrics
from sklearn import svm, neighbors, tree  # 트리 KNN 

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier # 앙상블  
from sklearn.linear_model import Ridge, Lasso, LinearRegression, LogisticRegression #릿지라쏘 
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold # 그리드 서치, 트레인 테스트 셋 분리 
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer,MinMaxScaler # scale

from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error #연속형일때 사용하는 경우 
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, confusion_matrix #범주형(분류모델)

# boost 계열 
import xgboost as xgb 
import lightgbm as lgb
import catboost as cb

personal loan.csv를 가지고 지금까지 배웠던 
- SVM, KNN, Decision Tree, ensemble 등을 적용해서 
    - 가장 정확도가 높은 모델을 찾아보라
    - 정확도가 무엇인지 확인하라. 

반응변수 = Personal Loan(대출 유무) 
- 입력변수 
    - Experience : 경력
    - income : 수입
    - zip : 우편번호 
    - family : 가족 
    - CCabg : 월단위 카드 사용량 
    - education : 교육 수준
    - mortgage : 대출 
    - security account : 유가증권  계좌 유무 
    - cd account : 양도 예금증서 계좌 유무 
    - online : 온라인 계좌유무 
    - credit card : 신용카드 유무 

In [205]:
Loan_data = pd.read_csv('C:/Users/scien/Videos/Personal_Loan.csv')

### 데이터 전처리 작업 

- Y 설정
- X 설정
- 다중 공산성 확인 전에 Y와 무관한 특정 X 컬럼들 전부 지워버리기 

In [206]:
Loan_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2500 entries, 0 to 2499
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID                  2500 non-null   int64  
 1   Age                 2500 non-null   int64  
 2   Experience          2500 non-null   int64  
 3   Income              2500 non-null   int64  
 4   ZIP Code            2500 non-null   int64  
 5   Family              2500 non-null   int64  
 6   CCAvg               2500 non-null   float64
 7   Education           2500 non-null   int64  
 8   Mortgage            2500 non-null   int64  
 9   Personal Loan       2500 non-null   int64  
 10  Securities Account  2500 non-null   int64  
 11  CD Account          2500 non-null   int64  
 12  Online              2500 non-null   int64  
 13  CreditCard          2500 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 273.6 KB


In [207]:
Loan_target = Loan_data[['Personal Loan']]

In [208]:
Loan_features = Loan_data.drop(columns = ['Personal Loan','ID', 'ZIP Code'])
Loan_features

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,1,0,0,0
1,45,19,34,3,1.5,1,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
2495,46,22,70,4,1.9,1,212,0,0,0,1
2496,63,37,32,3,0.7,2,141,0,0,0,0
2497,33,9,14,3,0.9,3,114,0,0,0,0
2498,38,14,111,2,6.1,1,326,0,0,0,0


In [209]:
print(Loan_features.isna().sum())
print(Loan_target.isna().sum())

Age                   0
Experience            0
Income                0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64
Personal Loan    0
dtype: int64


- Loan_target = 원 데이터의 Y (Personal Loan) 
- Loan_features = 원 데이터의 X (Personal 외) 
- train_x 원 데이터의 학습셋 
- test_y 원 데이터의 테스트 셋 

#### 셋 스플릿 

In [210]:
train_x, test_x, train_y, test_y = train_test_split(Loan_features, Loan_target, train_size=0.8, random_state=123)
print(train_x.shape, train_y.shape, test_x.shape, test_y.shape)

(2000, 11) (2000, 1) (500, 11) (500, 1)


#### MIN MAX SCALER

In [211]:
std_scaler = MinMaxScaler()
std_scaler_fit = std_scaler.fit(train_x)
train_x = pd.DataFrame(std_scaler_fit.transform(train_x), index=train_x.index, columns=train_x.columns)
test_x = pd.DataFrame(std_scaler_fit.transform(test_x), index = test_x.index, columns=test_x.columns)

### 일반 로지스틱 회귀

In [212]:
model_logit = sm.Logit(train_y, train_x).fit()
model_logit.summary()

Optimization terminated successfully.
         Current function value: 0.277162
         Iterations 8


0,1,2,3
Dep. Variable:,Personal Loan,No. Observations:,2000.0
Model:,Logit,Df Residuals:,1989.0
Method:,MLE,Df Model:,10.0
Date:,"Thu, 14 Jan 2021",Pseudo R-squ.:,0.1827
Time:,16:41:44,Log-Likelihood:,-554.32
converged:,True,LL-Null:,-678.27
Covariance Type:,nonrobust,LLR p-value:,1.5079999999999998e-47

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Age,-2.3985,2.888,-0.830,0.406,-8.060,3.263
Experience,-1.4685,2.881,-0.510,0.610,-7.115,4.178
Income,2.6529,0.367,7.219,0.000,1.933,3.373
Family,-1.0251,0.191,-5.374,0.000,-1.399,-0.651
CCAvg,0.2596,0.466,0.557,0.577,-0.653,1.173
Education,0.4238,0.185,2.296,0.022,0.062,0.786
Mortgage,-0.2520,0.414,-0.609,0.543,-1.063,0.559
Securities Account,-2.0360,0.360,-5.663,0.000,-2.741,-1.331
CD Account,5.0777,0.371,13.699,0.000,4.351,5.804


In [213]:
model_logRg = LogisticRegression().fit(train_x, train_y)
pred_logRg_y = model_logRg.predict(test_x)

In [214]:
cmat = confusion_matrix(test_y, pred_logRg_y)
logRg_accuracy = (cmat[0,0] + cmat[1,1]) / np.sum(cmat)
logRg_accuracy

0.956

In [215]:
type(test_y)

pandas.core.frame.DataFrame

In [216]:
type(pred_logRg_y)

numpy.ndarray

### SVM

In [180]:
best_score = 0

for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for c in [0.001, 0.01, 0.1, 1, 10, 100]:
        model = svm.SVC(kernel = 'rbf', C=c, gamma=gamma)
                                                    #CV cross value 교차검증 
        score = cross_val_score(model, train_x, train_y, cv=10)
        score = np.mean(score)
    
        if score > best_score:
            best_score  = score 
            cvs_best_parameter_ = {'C': c, 'gamma' : gamma}

In [181]:
cvs_best_parameter_

{'C': 100, 'gamma': 0.1}

In [182]:
model_rbf = svm.SVC(kernel='rbf', C=100, gamma=0.001).fit(train_x, train_y)
pred_rbf_y = model_rbf.predict(test_x)

In [183]:
cmat = confusion_matrix(test_y, pred_rbf_y)
rbf_accuracy = (cmat[0,0] + cmat[1,1]) / np.sum(cmat)
rbf_accuracy

0.95

### KNN

In [184]:
k_range = range(1,100,2)
params = {
    'n_neighbors' : k_range,
    'weights' : ['uniform', 'distance']
}

grid_search_N = GridSearchCV(neighbors.KNeighborsClassifier(), params, cv=5, return_train_score = True)
grid_search_N.fit(train_x, train_y)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(1, 100, 2),
                         'weights': ['uniform', 'distance']},
             return_train_score=True)

In [185]:
grid_search_N.best_estimator_

KNeighborsClassifier(n_neighbors=1)

In [186]:
model_knn = neighbors.KNeighborsClassifier(11, weights='distance').fit(train_x, train_y)
predict_knn_y = model_knn.predict(test_x)

In [187]:
cmat = confusion_matrix(test_y, predict_knn_y)
knn_accuracy = (cmat[0,0] + cmat[1,1]) / np.sum(cmat)
knn_accuracy

0.936

### 디시전 트리 

In [188]:
params = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [2,3,4,5,6,7,8,9,10],
    'max_leaf_nodes' : [1,2,3,4,5,6,7,8,9,10],
    'max_features' : ['auto', 'sqrt', 'log2']
}

grid_search = GridSearchCV(tree.DecisionTreeClassifier(), params, cv=5, return_train_score = True)
grid_search.fit(train_x, train_y)

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [2, 3, 4, 5, 6, 7, 8, 9, 10],
                         'max_features': ['auto', 'sqrt', 'log2'],
                         'max_leaf_nodes': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
             return_train_score=True)

In [189]:
grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'sqrt',
 'max_leaf_nodes': 10}

In [190]:
grid_search.best_score_

0.9615

In [191]:
model_tree = tree.DecisionTreeClassifier().fit(train_x, train_y)
pred_tree_y = model_tree.predict(test_x)

In [192]:
cmat=confusion_matrix(test_y, pred_tree_y)
tree_accuracy=(cmat[0,0]+cmat[1,1]) / np.sum(cmat)
tree_accuracy

0.974

### 앙상블
> n_estimators = 100, max_depth = 5 고정 


#### 랜덤포레스트

In [193]:
model_rf = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=123).fit(train_x, train_y)
pred_rf_y = model_rf.predict(test_x)

In [194]:
cmat=confusion_matrix(test_y, pred_rf_y)
rf_accuracy = (cmat[0,0]+cmat[1,1]) / np.sum(cmat)
rf_accuracy

0.974

#### ADABoost 

In [195]:
dt = tree.DecisionTreeClassifier(max_depth=5)
model_ada = AdaBoostClassifier(base_estimator=dt, n_estimators=100, random_state= 123).fit(train_x, train_y)


In [196]:
pred_ada_y = model_ada.predict(test_x)
cmat=confusion_matrix(test_y, pred_ada_y)
ada_accuracy = (cmat[0,0]+cmat[1,1]) / np.sum(cmat)
ada_accuracy

0.982

#### XGboost 

In [197]:
model_xgb = xgb.XGBClassifier(
    n_estimators=100, 
    max_depth = 5, 
    random_state=123, 
    learning_rate =0.01,
    objective='multi:softmax',
    num_class = 2
)
model_xgb.fit(train_x, train_y)
pred_xgb_y = model_xgb.predict(test_x)



In [198]:
cmat=confusion_matrix(test_y, pred_xgb_y)
xg_accuracy = (cmat[0,0]+cmat[1,1]) / np.sum(cmat)
xg_accuracy

0.978

#### LGBoost

In [199]:
model_lgb = lgb.LGBMClassifier(
    n_estimators=100, 
    max_depth = 5, 
    random_state=123, 
    learning_rate =0.01,
    objective = 'binary', #'cross_entorpy'
)
model_lgb.fit(train_x, train_y)
pred_lgb_y = model_lgb.predict(test_x)



In [200]:
cmat=confusion_matrix(test_y, pred_lgb_y)
lgb_accuracy = (cmat[0,0]+cmat[1,1]) / np.sum(cmat)
lgb_accuracy

0.972

#### Cat Boost 

In [201]:
model_cat = cb.CatBoostClassifier(
    n_estimators=100, 
    max_depth = 5, 
    random_state=123, 
    learning_rate =0.01,
    eval_metric = 'Accuracy',
    loss_function = 'MultiClass',
    verbose=False
)
model_cat.fit(train_x, train_y)
pred_cat_y = model_cat.predict(test_x)

In [202]:
cmat=confusion_matrix(test_y, pred_cat_y)
cat_accuracy = (cmat[0,0]+cmat[1,1]) / np.sum(cmat)
cat_accuracy

0.974

## 데이터 스케일링 이전의 데이터 표 

In [173]:
accm = pd.DataFrame(columns = ['디시전트리','Grid','Rand Forest', 'LGB','CAT','XG','ADA', 'KNN', 'Logistic','SVM'],
                   index = ['정확도'])
accm['디시전트리']['정확도'] = tree_accuracy
accm['Grid']['정확도'] = grid_search.best_score_
accm['Rand Forest']['정확도'] = rf_accuracy
accm['LGB']['정확도'] = lgb_accuracy
accm['CAT']['정확도'] = cat_accuracy
accm['XG']['정확도'] = xg_accuracy
accm['ADA']['정확도'] = ada_accuracy
accm['KNN']['정확도'] = knn_accuracy
accm['SVM']['정확도'] = rbf_accuracy
accm['Logistic']['정확도'] = logRg_accuracy
accm

Unnamed: 0,디시전트리,Grid,Rand Forest,LGB,CAT,XG,ADA,KNN,Logistic,SVM
정확도,0.976,0.9045,0.972,0.972,0.974,0.978,0.982,0.914,0.952,0.944


## MIN MAX 스케일링 이후의 데이터 표 

In [203]:
accm = pd.DataFrame(columns = ['디시전트리','Grid','Rand Forest', 'LGB','CAT','XG','ADA', 'KNN', 'Logistic','SVM'],
                   index = ['정확도'])
accm['디시전트리']['정확도'] = tree_accuracy
accm['Grid']['정확도'] = grid_search.best_score_
accm['Rand Forest']['정확도'] = rf_accuracy
accm['LGB']['정확도'] = lgb_accuracy
accm['CAT']['정확도'] = cat_accuracy
accm['XG']['정확도'] = xg_accuracy
accm['ADA']['정확도'] = ada_accuracy
accm['KNN']['정확도'] = knn_accuracy
accm['SVM']['정확도'] = rbf_accuracy
accm['Logistic']['정확도'] = logRg_accuracy
accm

Unnamed: 0,디시전트리,Grid,Rand Forest,LGB,CAT,XG,ADA,KNN,Logistic,SVM
정확도,0.974,0.9615,0.974,0.972,0.974,0.978,0.982,0.936,0.956,0.95
