### 데이터 불러 오기

In [None]:
# dataset 불러오기 (pandas, numpy, matplotlib)

In [None]:
import pandas as pd
import numpy as np
import matplotlib as plt

In [None]:
# jupyter에서 matplotlib의 결과를 아래 바로 보이도록 설정

In [None]:
# IPython magic: show matplotlib output directly below the cell that produced it.
%matplotlib inline

In [None]:
# 다운로드 받은 데이터를 pandas의 dataframe형태로 읽어 오기

In [None]:
# Read the downloaded loan CSV into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer="examples/loan.csv")

### 데이터 훑어보기

In [None]:
# Preview the first five rows of the dataset.
df.head(n=5)

In [None]:
# info 함수를 통해 데이터의 정보를 확인

In [None]:
# Print the DataFrame summary produced by DataFrame.info().
df.info()

In [None]:
# describe() 함수를 통해 numerical 데이터의 요약을 확인

In [None]:
# Summary statistics of the numerical columns.
df.describe()

In [None]:
# 데이터 요약으로부터 알 수 있는 점
# 1. LoanAmount의 count를 보면 다른 column의 count보다 개수가 부족, (614-592) 22 missing value
# 2. Loan_Amount_Term, Credit_History의 값도 LoanAmount와 동일하게 missing values 발생
# 3. Credit_History는 0 또는 1의 값을 가지므로, 평균(0.84)은 약 84%가 credit history를 갖고 있음을 의미한다.

In [None]:
# non-numerical 데이터의 분포 확인 (Gender, Married, Dependents, Education, 
#                                  Self_Employed, Property_Area, Loan_Status)

In [None]:
# Frequency of each distinct value in the Gender column.
df['Gender'].value_counts()

In [None]:
# Frequency of each distinct value in the Married column.
df['Married'].value_counts()

In [None]:
# Frequency of each distinct value in the Dependents column.
df['Dependents'].value_counts()

In [None]:
# Frequency of each distinct value in the Education column.
df['Education'].value_counts()

In [None]:
# Frequency of each distinct value in the Self_Employed column.
df['Self_Employed'].value_counts()

In [None]:
# Frequency of each distinct value in the Property_Area column.
df['Property_Area'].value_counts()

In [None]:
# Frequency of each distinct value in the Loan_Status column (used later as the target).
df['Loan_Status'].value_counts()

In [None]:
# Numerical 데이터의 분포 분석 (ApplicantIncome, CoapplicantIncome, 
#                              LoanAmount, Loan_Amount_Term, Credit_History )

In [None]:
# Histogram of ApplicantIncome using 50 bins.
df['ApplicantIncome'].hist(bins=50)

In [None]:
# Histogram of CoapplicantIncome using 50 bins.
df['CoapplicantIncome'].hist(bins=50)

In [None]:
# Histogram of LoanAmount using 50 bins.
df['LoanAmount'].hist(bins=50)

In [None]:
# Histogram of Loan_Amount_Term using 50 bins.
df['Loan_Amount_Term'].hist(bins=50)

In [None]:
# Histogram of Credit_History using 50 bins.
df['Credit_History'].hist(bins=50)

### 누락치 처리

In [None]:
# 특성별 누락치 수 확인하기

In [None]:
# Count the missing (null) entries in each column.
df.isnull().sum(axis=0)

In [None]:
# df['Gender'].isnull().sum()

In [None]:
# Numeric 데이터의 누락치 : 평균값으로 채우기 (Credit_History는 최빈값인 1.0으로 채우기)

In [None]:
# Impute missing numeric values.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on df[col]: the in-place form mutates a column selection,
# which pandas deprecates and which does not update df under Copy-on-Write.
for col in ('LoanAmount', 'Loan_Amount_Term'):
    # Continuous columns: fill with the column mean.
    df[col] = df[col].fillna(df[col].mean())

# Credit_History takes the values 0/1, so it is filled with the constant 1.0
# instead of a fractional mean.
df['Credit_History'] = df['Credit_History'].fillna(1.0)

In [None]:
# Non-Numeric 데이터의 누락치 : 최빈값으로 채우기

In [None]:
# Impute missing categorical values with each column's most frequent value.
# value_counts() sorts by descending frequency, so index[0] is the most common label.
# The filled Series is assigned back to the column rather than calling
# fillna(inplace=True) on df[col]: the in-place form mutates a column selection,
# which pandas deprecates and which does not update df under Copy-on-Write.
for col in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    df[col] = df[col].fillna(df[col].value_counts().index[0])

In [None]:
# df['Gender'].fillna('Male', inplace=True)

In [None]:
# 누락치가 있는지 다시 확인하기

In [None]:
# Re-count the missing entries per column after imputation.
df.isnull().sum(axis=0)

In [None]:
# df.isnull().any()
# df.isnull().sum()

### Outlier 영향 없애기 : Numeric 데이터 로그 변환

In [None]:
# ApplicantIncome, CoapplicantIncome, LoanAmount

In [None]:
# log1p = log(1 + x); used here to damp the influence of outliers.
df['LoanAmount_log'] = df['LoanAmount'].pipe(np.log1p)
# Histogram of the transformed values, 20 bins.
df['LoanAmount_log'].hist(bins=20)

In [None]:
# log1p = log(1 + x); used here to damp the influence of outliers.
df['ApplicantIncome_log'] = df['ApplicantIncome'].pipe(np.log1p)
# Histogram of the transformed values, 20 bins.
df['ApplicantIncome_log'].hist(bins=20)

In [None]:
# log1p = log(1 + x); it is defined at zero, so a zero income maps to 0.
df['CoapplicantIncome_log'] = df['CoapplicantIncome'].pipe(np.log1p)
# Histogram of the transformed values with the default bin count.
df['CoapplicantIncome_log'].hist()

### Non-Numeric 데이터 처리

In [None]:
# 원핫 인코딩 하기

In [None]:
# from sklearn.preprocessing import LabelEncoder
# cat_data = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area']
# encoder = LabelEncoder()
# for i in cat_data:
#    df[i] = encoder.fit_transform(df[i])
# df.dtypes 

In [None]:
# Categorical columns to one-hot encode.
cat_data = [
    'Gender',
    'Married',
    'Dependents',
    'Education',
    'Self_Employed',
    'Property_Area',
]
# get_dummies expands each categorical column into indicator columns.
cat = pd.get_dummies(df.loc[:, cat_data])

In [None]:
# Preview the first rows of the one-hot encoded frame.
cat.head()

In [None]:
# df와 cat을 concat 해서 data 데이터프레임 생성

In [None]:
# Put the one-hot columns next to the original columns (column-wise concat),
# then print the combined frame's summary.
data = pd.concat(objs=[df, cat], axis='columns')
data.info()

### 데이터 타깃 분리

In [None]:
# 불필요한 컬럼 삭제

In [None]:
# The raw categorical columns are superseded by their one-hot encodings.
data.drop(columns=cat_data, inplace=True)

In [None]:
# Drop the identifier and the raw amount/income columns; their *_log
# counterparts stay in the frame.
data.drop(
    columns=['Loan_ID', 'LoanAmount', 'ApplicantIncome', 'CoapplicantIncome'],
    inplace=True,
)

In [None]:
# Print the summary of the frame after the column drops.
data.info()

In [None]:
# 타겟 Series 만들기

In [None]:
# Independent copy of the label column, kept as the target Series.
target = data.loc[:, 'Loan_Status'].copy()

In [None]:
# 데이터에서 타깃 제거

In [None]:
# Remove the label column so only feature columns remain in data.
data.drop(columns='Loan_Status', inplace=True)

### 트레이닝, 테스트 세트 나누기

In [None]:
# Use the full feature frame and target for training; the models below are
# evaluated with cross-validation.
X_train, y_train = data, target

In [None]:
#from sklearn.model_selection import train_test_split  # 데이터 세트 분리
#X_train, X_test, y_train, y_test = train_test_split(data, 
#                                                    target, 
#                                                    test_size=0.2, random_state=11)

In [None]:
# Only the training objects exist: the train_test_split call in this notebook is
# commented out, so X_test / y_test are never defined and printing their shapes
# raised NameError.
print(X_train.shape)
print(y_train.shape)

### 분류 모델 훈련

In [None]:
# 모델 import 하기

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [None]:
# 교차검증을 위한 cross_val_predict import 하기
# 분류 성능 평가를 위한 지표 함수 import 하기

In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
#from sklearn.metrics import precision_score, recall_score, f1_score, roc_curve, roc_auc_score 

In [None]:
# LogisticRegression 에 의한 훈련 및 예측

In [None]:
# Baseline logistic regression, evaluated on 3-fold cross-validated predictions.
lr_model = LogisticRegression()
lr_pred = cross_val_predict(estimator=lr_model, X=X_train, y=y_train, cv=3)
print(classification_report(y_true=y_train, y_pred=lr_pred))

# ROC AUC is computed from the continuous decision_function scores, not the hard labels.
lr_scores = cross_val_predict(
    estimator=lr_model, X=X_train, y=y_train, cv=3, method='decision_function'
)
print("ROC Score : ", roc_auc_score(y_true=y_train, y_score=lr_scores))

In [None]:
# SGDClassifier에 의한 훈련 및 예측

In [None]:
# Baseline SGD classifier, evaluated on 3-fold cross-validated predictions.
sgd_model = SGDClassifier()
sgd_pred = cross_val_predict(sgd_model, X_train, y_train, cv=3)
# Report this model's own predictions (the logistic-regression predictions were
# being reported here by mistake).
print(classification_report(y_train, sgd_pred))
# ROC AUC is computed from the continuous decision_function scores.
sgd_scores = cross_val_predict(sgd_model, X_train, y_train, cv=3, method='decision_function')
print("ROC Score : ", roc_auc_score(y_train, sgd_scores))

In [None]:
# KNeighborsClassifier에 의한 훈련 및 예측

In [None]:
# Baseline k-nearest-neighbours classifier, evaluated on 3-fold cross-validated predictions.
knn_model = KNeighborsClassifier()
knn_pred = cross_val_predict(knn_model, X_train, y_train, cv=3)
# Report this model's own predictions (the logistic-regression predictions were
# being reported here by mistake).
print(classification_report(y_train, knn_pred))
# No decision_function is used for this model: score with predict_proba and take
# column 1 (the second class) for ROC AUC.
knn_scores = cross_val_predict(knn_model, X_train, y_train, cv=3, method='predict_proba')
print("ROC Score : ", roc_auc_score(y_train, knn_scores[:,1]))

In [None]:
# SVC에 의한 훈련 및 예측

In [None]:
# Baseline support vector classifier, evaluated on 3-fold cross-validated predictions.
svc_model = SVC()
svc_pred = cross_val_predict(svc_model, X_train, y_train, cv=3)
# Report this model's own predictions (the logistic-regression predictions were
# being reported here by mistake).
print(classification_report(y_train, svc_pred))
# ROC AUC is computed from the continuous decision_function scores.
svc_scores = cross_val_predict(svc_model, X_train, y_train, cv=3, method='decision_function')
print("ROC Score : ", roc_auc_score(y_train, svc_scores))

In [None]:
# DecisionTreeClassifier에 의한 훈련 및 예측

In [None]:
# Baseline decision tree, evaluated on 3-fold cross-validated predictions.
dt_model = DecisionTreeClassifier()
dt_pred = cross_val_predict(dt_model, X_train, y_train, cv=3)
# Report this model's own predictions (the logistic-regression predictions were
# being reported here by mistake).
print(classification_report(y_train, dt_pred))
# Score with predict_proba and take column 1 (the second class) for ROC AUC.
dt_scores = cross_val_predict(dt_model, X_train, y_train, cv=3, method='predict_proba')
print("ROC Score : ", roc_auc_score(y_train, dt_scores[:,1]))

In [None]:
# RandomForestClassifier에 의한 훈련 및 예측

In [None]:
# Baseline random forest, evaluated on 3-fold cross-validated predictions.
rf_model = RandomForestClassifier()
rf_pred = cross_val_predict(rf_model, X_train, y_train, cv=3)
# Report this model's own predictions (the logistic-regression predictions were
# being reported here by mistake).
print(classification_report(y_train, rf_pred))
# Score with predict_proba and take column 1 (the second class) for ROC AUC.
rf_scores = cross_val_predict(rf_model, X_train, y_train, cv=3, method='predict_proba')
print("ROC Score : ", roc_auc_score(y_train, rf_scores[:,1]))

### 모델 성능 향상

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# LogisticRegression

In [None]:
# Grid over the regularisation parameter C and the iteration cap, scored with 3-fold CV.
parameters = {
    'C': [0.01, 0.1, 1, 10, 100, 1000],
    'max_iter': [1000, 10000],
}
grid_lr = GridSearchCV(estimator=lr_model, param_grid=parameters, cv=3)
grid_lr.fit(X_train, y_train)

In [None]:
# Show the best logistic-regression estimator found by the grid search.
grid_lr.best_estimator_

In [None]:
# Logistic regression with fixed C and max_iter values
# (presumably taken from the grid search above - confirm).
lr_model = LogisticRegression(C=1, max_iter=1000)
lr_pred = cross_val_predict(estimator=lr_model, X=X_train, y=y_train, cv=3)
print(classification_report(y_true=y_train, y_pred=lr_pred))

# ROC AUC is computed from the continuous decision_function scores.
lr_scores = cross_val_predict(
    estimator=lr_model, X=X_train, y=y_train, cv=3, method='decision_function'
)
print("ROC Score : ", roc_auc_score(y_true=y_train, y_score=lr_scores))

In [None]:
# SGDClassifier: alpha, l1_ratio

In [None]:
# Grid over the regularisation strength alpha and the elastic-net mixing ratio,
# scored with 3-fold CV.
# NOTE(review): per the scikit-learn docs, l1_ratio is only used when
# penalty='elasticnet'; sgd_model is built with the default penalty, so this
# axis likely has no effect - confirm.
parameters = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'l1_ratio': [0.15, 0.4, 0.8],
}
grid_sgd = GridSearchCV(estimator=sgd_model, param_grid=parameters, cv=3)
grid_sgd.fit(X_train, y_train)

In [None]:
# Show the best SGD estimator found by the grid search.
grid_sgd.best_estimator_

In [None]:
# SGD classifier with fixed alpha and l1_ratio values
# (presumably taken from the grid search above - confirm).
sgd_model = SGDClassifier(alpha=10, l1_ratio=0.8)
sgd_pred = cross_val_predict(estimator=sgd_model, X=X_train, y=y_train, cv=3)
print(classification_report(y_true=y_train, y_pred=sgd_pred))

# ROC AUC is computed from the continuous decision_function scores.
sgd_scores = cross_val_predict(
    estimator=sgd_model, X=X_train, y=y_train, cv=3, method='decision_function'
)
print("ROC Score : ", roc_auc_score(y_true=y_train, y_score=sgd_scores))

In [None]:
# KNeighborsClassifier

In [None]:
# Grid over the neighbour count (1-5), scored with 3-fold CV.
parameters = {'n_neighbors': [1, 2, 3, 4, 5]}
grid_knn = GridSearchCV(estimator=knn_model, param_grid=parameters, cv=3)
grid_knn.fit(X_train, y_train)

In [None]:
# Show the best k-nearest-neighbours estimator found by the grid search.
grid_knn.best_estimator_

In [None]:
# k-nearest-neighbours with a fixed neighbour count
# (presumably taken from the grid search above - confirm).
knn_model = KNeighborsClassifier(n_neighbors=1)
knn_pred = cross_val_predict(knn_model, X_train, y_train, cv=3)
# Report this model's own predictions (the logistic-regression predictions were
# being reported here by mistake).
print(classification_report(y_train, knn_pred))
# Score with predict_proba and take column 1 (the second class) for ROC AUC.
knn_scores = cross_val_predict(knn_model, X_train, y_train, cv=3, method='predict_proba')
print("ROC Score : ", roc_auc_score(y_train, knn_scores[:,1]))

In [None]:
# SVC 

In [None]:
# Grid over the regularisation parameter C and the polynomial degree, scored with 3-fold CV.
# NOTE(review): per the scikit-learn docs, degree is only used by kernel='poly';
# svc_model is built with the default kernel, so this axis likely has no
# effect - confirm.
parameters = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'degree': [2, 3, 4],
}
grid_svc = GridSearchCV(estimator=svc_model, param_grid=parameters, cv=3)
grid_svc.fit(X_train, y_train)

In [None]:
# Show the best SVC estimator found by the grid search.
grid_svc.best_estimator_

In [None]:
# SVC with fixed C and degree values
# (presumably taken from the grid search above - confirm).
svc_model = SVC(C=0.001, degree=2)
svc_pred = cross_val_predict(svc_model, X_train, y_train, cv=3)
# Report this model's own predictions (the logistic-regression predictions were
# being reported here by mistake).
print(classification_report(y_train, svc_pred))
# ROC AUC is computed from the continuous decision_function scores.
svc_scores = cross_val_predict(svc_model, X_train, y_train, cv=3, method='decision_function')
print("ROC Score : ", roc_auc_score(y_train, svc_scores))

In [None]:
# StandardScaler 적용

In [None]:
from sklearn.preprocessing import StandardScaler
# Standardise the features.
# NOTE(review): the scaler is fitted on all rows before cross-validation, so
# each validation fold influences the scaling statistics - confirm this is acceptable.
scaler = StandardScaler()
X_train_scaled = scaler.fit(X_train).transform(X_train)

In [None]:
# Re-evaluate the SVC on the standardised features with 3-fold cross-validation.
svc_pred = cross_val_predict(svc_model, X_train_scaled, y_train, cv=3)
# Report this model's own predictions (the logistic-regression predictions were
# being reported here by mistake).
print(classification_report(y_train, svc_pred))
# ROC AUC is computed from the continuous decision_function scores.
svc_scores = cross_val_predict(svc_model, X_train_scaled, y_train, cv=3, method='decision_function')
print("ROC Score : ", roc_auc_score(y_train, svc_scores))

In [None]:
# DecisionTreeClassifier

In [None]:
# Grid over the split criterion and the minimum samples needed to split a node,
# scored with 3-fold CV.
parameters = {
    'criterion': ["gini", "entropy"],
    'min_samples_split': [2, 3, 4],
}
grid_dt = GridSearchCV(estimator=dt_model, param_grid=parameters, cv=3)
grid_dt.fit(X_train, y_train)

In [None]:
# Show the best decision-tree estimator found by the grid search.
grid_dt.best_estimator_

In [None]:
# Decision tree with a fixed min_samples_split
# (presumably taken from the grid search above - confirm).
dt_model = DecisionTreeClassifier(min_samples_split=3)
dt_pred = cross_val_predict(dt_model, X_train, y_train, cv=3)
# Report this model's own predictions (the logistic-regression predictions were
# being reported here by mistake).
print(classification_report(y_train, dt_pred))
# Score with predict_proba and take column 1 (the second class) for ROC AUC.
dt_scores = cross_val_predict(dt_model, X_train, y_train, cv=3, method='predict_proba')
print("ROC Score : ", roc_auc_score(y_train, dt_scores[:,1]))

In [None]:
#RandomForestClassifier?  n_estimators=100 min_samples_split=2,

In [None]:
# Grid over the number of trees and the minimum samples needed to split a node,
# scored with 3-fold CV.
parameters = {
    'n_estimators': [100, 200],
    'min_samples_split': [2, 4, 6],
}
grid_rf = GridSearchCV(estimator=rf_model, param_grid=parameters, cv=3)
grid_rf.fit(X_train, y_train)

In [None]:
# Show the best random-forest estimator found by the grid search.
grid_rf.best_estimator_

In [None]:
# Random forest with a fixed min_samples_split
# (presumably taken from the grid search above - confirm).
rf_model = RandomForestClassifier(min_samples_split=6)
rf_pred = cross_val_predict(estimator=rf_model, X=X_train, y=y_train, cv=3)
print(classification_report(y_true=y_train, y_pred=rf_pred))
# Score with predict_proba and take column 1 (the second class) for ROC AUC.
rf_scores = cross_val_predict(
    estimator=rf_model, X=X_train, y=y_train, cv=3, method='predict_proba'
)
print("ROC Score : ", roc_auc_score(y_true=y_train, y_score=rf_scores[:, 1]))

In [None]:
# 중요한 특성 찾기

In [None]:
# Label each importance with its feature name and sort descending, so the most
# influential features can be read off directly (the bare array is positional only).
# grid_rf was fitted on X_train, so the importances follow X_train's column order.
pd.Series(
    grid_rf.best_estimator_.feature_importances_,
    index=X_train.columns,
).sort_values(ascending=False)

In [None]:
# Keep only a hand-picked subset of the feature columns.
X_train_features = X_train.loc[:, [
    'Credit_History',
    'LoanAmount_log',
    'ApplicantIncome_log',
    'CoapplicantIncome_log',
]]

In [None]:
# Same random-forest configuration, evaluated on the reduced feature set
# with 3-fold cross-validation.
rf_model = RandomForestClassifier(min_samples_split=6)
rf_pred = cross_val_predict(estimator=rf_model, X=X_train_features, y=y_train, cv=3)
print(classification_report(y_true=y_train, y_pred=rf_pred))
# Score with predict_proba and take column 1 (the second class) for ROC AUC.
rf_scores = cross_val_predict(
    estimator=rf_model, X=X_train_features, y=y_train, cv=3, method='predict_proba'
)
print("ROC Score : ", roc_auc_score(y_true=y_train, y_score=rf_scores[:, 1]))