In [107]:
import koreanize_matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn.tree as tree

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve)
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

## 1. 데이터

In [108]:
# 데이터 불러오기

df = pd.read_csv('../data/BankChurners.csv')
df.drop('CLIENTNUM', axis=1, inplace=True)
df.drop('Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1', axis=1, inplace=True)
df.drop('Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2', axis=1, inplace=True)

In [109]:
# 컬럼 이름 변경
df.rename(columns={
    'Attrition_Flag' : 'Exited',
    'Customer_Age' : 'Age',
    'Dependent_count' : 'Dependents',
    'Education_Level' : 'Education', 
    'Marital_Status' : 'Marital', 
    'Income_Category' : 'Income', 
    'Card_Category' : 'Card_Type',
    'Months_on_book' : 'Tenure', 
    'Total_Relationship_Count' : 'Product_Cnt', 
    'Months_Inactive_12_mon' : 'Inactive_Months',
    'Contacts_Count_12_mon' : 'Contacts_Cnt', 
    'Total_Revolving_Bal' : 'Revolv_Bal',   
    'Avg_Open_To_Buy' : 'Avg_OTB', 
    'Total_Amt_Chng_Q4_Q1' : 'Amt_Chng_Q4_Q1', 
    'Total_Trans_Amt' : 'Trans_Amt',
    'Total_Trans_Ct' : 'Trans_Cnt', 
    'Total_Ct_Chng_Q4_Q1' : 'Cnt_Chng_Q4_Q1', 
    'Avg_Utilization_Ratio' : 'Avg_Util_Ratio'
}, inplace=True)

In [110]:
# 컬럼 순서변경
df = df[['Exited', 'Age', 'Gender', 'Dependents', 'Education', 'Marital', 'Income', 'Card_Type', 'Tenure', 'Product_Cnt', 'Inactive_Months',
         'Contacts_Cnt', 'Credit_Limit', 'Revolv_Bal', 'Avg_OTB', 'Avg_Util_Ratio', 'Trans_Amt', 'Trans_Cnt', 'Amt_Chng_Q4_Q1',  'Cnt_Chng_Q4_Q1']]

## 2. Label Encoder

In [111]:
# 변수별 encoding
mapping = {
    'Exited': {'Existing Customer': 0, 'Attrited Customer': 1},
    'Gender': {'M': 0, 'F': 1},
    'Education': {'Uneducated': 0, 'High School': 1, 'College': 2, 'Graduate' : 3, 'Post-Graduate': 4, 'Doctorate': 5, 'Unknown': 6},
    'Marital': {'Single': 0, 'Married': 1, 'Divorced': 2, 'Unknown': 3},
    'Income': {'Less than $40K': 0, '$40K - $60K': 1, '$60K - $80K': 2, '$80K - $120K': 3, '$120K +': 4, 'Unknown': 5},
    'Card_Type': {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3}
}

df = df.replace(mapping)

## 3. 교차검증 및 스케일링

> ### 영주님 코드 참고

In [112]:
# 교차검증(stratified k-fold cross validation (n_split:5))

from sklearn.model_selection import StratifiedKFold

X = df.drop('Exited', axis=1).values
y = df['Exited'].values

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)

fold_idx = 1

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    print(f"{fold_idx}번째 폴드: 훈련 샘플 수 - {len(X_train)}, 테스트 샘플 수 - {len(X_test)}")
    
    fold_idx += 1

1번째 폴드: 훈련 샘플 수 - 8101, 테스트 샘플 수 - 2026
2번째 폴드: 훈련 샘플 수 - 8101, 테스트 샘플 수 - 2026
3번째 폴드: 훈련 샘플 수 - 8102, 테스트 샘플 수 - 2025
4번째 폴드: 훈련 샘플 수 - 8102, 테스트 샘플 수 - 2025
5번째 폴드: 훈련 샘플 수 - 8102, 테스트 샘플 수 - 2025


In [113]:
from sklearn.metrics import accuracy_score

fold_idx = 1
accuracy_scores = []

for train_index, test_index in skf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model = LogisticRegression(solver='liblinear', random_state=13, C=3, penalty='l1' )

    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    accuracy_scores.append(accuracy)
    
    fold_idx += 1

for idx, acc in enumerate(accuracy_scores, start=1):
    print(f"Fold {idx} 정확도: {acc}")

mean_accuracy = np.mean(accuracy_scores)
print(f"평균 정확도: {mean_accuracy}")


Fold 1 정확도: 0.9042448173741362
Fold 2 정확도: 0.9012833168805529
Fold 3 정확도: 0.9022222222222223
Fold 4 정확도: 0.9066666666666666
Fold 5 정확도: 0.9071604938271605
평균 정확도: 0.9043155033941476


In [114]:
# 데이터 스케일링(standard scaler)
from sklearn.preprocessing import StandardScaler

features = df.drop('Exited', axis=1).values
label = df['Exited'].values

std = StandardScaler()
features_scaled = std.fit_transform(features)

In [115]:
# 데이터를 훈련/테스트로 분리
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features_scaled, label,
                                                    test_size=0.2,
                                                    stratify=label,
                                                    random_state=13)

In [116]:
model = LogisticRegression(solver='liblinear', random_state=13, C=3, penalty='l1' )
model.fit(X_train, y_train)

In [117]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

# 테스트 데이터로 예측 수행
y_pred = model.predict(X_test)
y_pred_probs = model.predict_proba(X_test)[:, 1]

# 정확도 평가
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# 정밀도 평가
precision = precision_score(y_test, y_pred)
print("Precision:", precision)

# 재현율 평가
recall = recall_score(y_test, y_pred)
print("Recall:", recall)

# F1-score 평가
f1 = f1_score(y_test, y_pred)
print("F1-score:", f1)

# AUC-ROC 평가
roc_auc = roc_auc_score(y_test, y_pred_probs)
print("AUC-ROC:", roc_auc)

Accuracy: 0.9017769002961501
Precision: 0.752
Recall: 0.5784615384615385
F1-score: 0.6539130434782608
AUC-ROC: 0.9303016325238547


In [118]:
# 피쳐의 중요도 확인
feature_importance = np.abs(model.coef_[0])

feature_names = df.drop('Exited', axis=1).columns

sorted_indices = sorted(range(len(feature_importance)), key=lambda k: feature_importance[k], reverse=True)
sorted_feature_importance = [feature_importance[i] for i in sorted_indices]
sorted_feature_names = [feature_names[i] for i in sorted_indices]

for name, importance in zip(sorted_feature_names, sorted_feature_importance):
    print(f'{name}: {importance}')

Trans_Cnt: 2.6589656280321647
Trans_Amt: 1.555848514382384
Revolv_Bal: 0.7764501175871036
Product_Cnt: 0.715255812196321
Cnt_Chng_Q4_Q1: 0.6765008265333101
Contacts_Cnt: 0.5312052132138345
Inactive_Months: 0.484383842145075
Gender: 0.3140623827789579
Dependents: 0.17838899763732638
Card_Type: 0.13674997471718453
Amt_Chng_Q4_Q1: 0.10843233541063191
Tenure: 0.07755930501442877
Marital: 0.06939151994694039
Avg_OTB: 0.044101804141627794
Income: 0.04211010097920088
Education: 0.03378580848342969
Age: 0.02316922607501537
Credit_Limit: 0.014848506281727087
Avg_Util_Ratio: 0.006853706976101743


> ### 내가 했던 코드

In [119]:
X = df.drop(['Exited'], axis=1)
y = df['Exited']

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)
lr = LogisticRegression(solver='liblinear', random_state=13)

cv_accuracy = []
coefficients = []

for train_idx, test_idx in skfold.split(X, y):
    X_train = X.iloc[train_idx]
    X_test = X.iloc[test_idx]
    y_train = y.iloc[train_idx]
    y_test = y.iloc[test_idx]

    sc = StandardScaler()

    X_train_scaled = sc.fit_transform(X_train)
    X_test_scaled = sc.transform(X_test)

    lr.fit(X_train_scaled, y_train)
    pred = lr.predict(X_test_scaled)
    cv_accuracy.append(accuracy_score(y_test, pred))

    coefficients.append(lr.coef_[0])

print('StratifiedKFold accuracy : ', cv_accuracy)
print('Average of accuracy : ', np.mean(cv_accuracy))

mean_coefficients = np.mean(coefficients, axis=0)

# 각 특성의 회귀 계수를 출력합니다.
print("Logistic Regression Coefficients:")

feature_list = []
coef_list = []

for feature, importance in zip(X.columns, np.abs(mean_coefficients)):
    feature_list.append(feature)
    coef_list.append(importance)

coef_df = pd.DataFrame({
    'feature' : feature_list,
    'coef' : coef_list
})
coef_df.sort_values('coef', ascending=False)

StratifiedKFold accuracy :  [0.9052319842053307, 0.9032576505429417, 0.9012345679012346, 0.9076543209876543, 0.9066666666666666]
Average of accuracy :  0.9048090380607656
Logistic Regression Coefficients:


Unnamed: 0,feature,coef
16,Trans_Cnt,2.654058
15,Trans_Amt,1.558898
12,Revolv_Bal,0.768497
8,Product_Cnt,0.697632
18,Cnt_Chng_Q4_Q1,0.662789
10,Contacts_Cnt,0.546194
9,Inactive_Months,0.504609
1,Gender,0.32583
2,Dependents,0.178515
6,Card_Type,0.120812


- Trans_Cnt: 2.6589656280321647
- Trans_Amt: 1.555848514382384
- Revolv_Bal: 0.7764501175871036
- Product_Cnt: 0.715255812196321
- Cnt_Chng_Q4_Q1: 0.6765008265333101
- Contacts_Cnt: 0.5312052132138345
- Inactive_Months: 0.484383842145075
- Gender: 0.3140623827789579
- Dependents: 0.17838899763732638
- Card_Type: 0.13674997471718453
- Amt_Chng_Q4_Q1: 0.10843233541063191
- Tenure: 0.07755930501442877
- Marital: 0.06939151994694039
- Avg_OTB: 0.044101804141627794
- Income: 0.04211010097920088
- Education: 0.03378580848342969
- Age: 0.02316922607501537
- Credit_Limit: 0.014848506281727087
- Avg_Util_Ratio: 0.006853706976101743