In [1]:
import koreanize_matplotlib
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import sklearn.tree as tree

from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve)
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the data, then discard the CLIENTNUM column and the two
# Naive_Bayes_Classifier_* columns in a single drop call.
df = pd.read_csv('../data/BankChurners.csv').drop(
    columns=[
        'CLIENTNUM',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
    ]
)

In [3]:
# Rename columns to shorter names (old name -> new name).
# Columns not listed here (e.g. Gender, Credit_Limit) keep their original names.
df = df.rename(
    columns={
        'Attrition_Flag': 'Exited',
        'Customer_Age': 'Age',
        'Dependent_count': 'Dependents',
        'Education_Level': 'Education',
        'Marital_Status': 'Marital',
        'Income_Category': 'Income',
        'Card_Category': 'Card_Type',
        'Months_on_book': 'Tenure',
        'Total_Relationship_Count': 'Product_Cnt',
        'Months_Inactive_12_mon': 'Inactive_Months',
        'Contacts_Count_12_mon': 'Contacts_Cnt',
        'Total_Revolving_Bal': 'Revolv_Bal',
        'Avg_Open_To_Buy': 'Avg_OTB',
        'Total_Amt_Chng_Q4_Q1': 'Amt_Chng_Q4_Q1',
        'Total_Trans_Amt': 'Trans_Amt',
        'Total_Trans_Ct': 'Trans_Cnt',
        'Total_Ct_Chng_Q4_Q1': 'Cnt_Chng_Q4_Q1',
        'Avg_Utilization_Ratio': 'Avg_Util_Ratio',
    }
)

In [4]:
# Reorder the columns so that the target ('Exited') comes first.
# Selecting with a list raises KeyError if any listed column is missing.
df = df[[
    'Exited',
    'Age', 'Gender', 'Dependents', 'Education', 'Marital', 'Income',
    'Card_Type', 'Tenure', 'Product_Cnt', 'Inactive_Months', 'Contacts_Cnt',
    'Credit_Limit', 'Revolv_Bal', 'Avg_OTB', 'Avg_Util_Ratio',
    'Trans_Amt', 'Trans_Cnt', 'Amt_Chng_Q4_Q1', 'Cnt_Chng_Q4_Q1',
]]

In [5]:
# Per-column integer encoding of the target and the categorical columns.
# NOTE(review): 'Unknown' receives the largest code in Education, Marital and Income,
# which places it at the end of the numeric ordering; confirm this is intended
# before treating these columns as ordinal features.
mapping = {
    'Exited': {'Existing Customer': 0, 'Attrited Customer': 1},
    'Gender': {'M': 0, 'F': 1},
    'Education': {'Uneducated': 0, 'High School': 1, 'College': 2, 'Graduate' : 3, 'Post-Graduate': 4, 'Doctorate': 5, 'Unknown': 6},
    'Marital': {'Single': 0, 'Married': 1, 'Divorced': 2, 'Unknown': 3},
    'Income': {'Less than $40K': 0, '$40K - $60K': 1, '$60K - $80K': 2, '$80K - $120K': 3, '$120K +': 4, 'Unknown': 5},
    'Card_Type': {'Blue': 0, 'Silver': 1, 'Gold': 2, 'Platinum': 3}
}

# Encode each column with Series.map instead of DataFrame.replace: replacing strings
# with ints inside an object column relies on pandas silently downcasting the result
# to an integer dtype, which is deprecated since pandas 2.2. Mapping through a function
# lets pandas infer the integer dtype directly.
# Values that have no code are passed through unchanged (as replace() did), so
# re-running this cell on already-encoded data is a no-op.
encoded_columns = {}
for column, codes in mapping.items():
    encoded_columns[column] = df[column].map(
        lambda value, codes=codes: codes.get(value, value)
    )

# assign() overwrites the existing columns in place, so column order is preserved.
df = df.assign(**encoded_columns)

In [6]:
# Feature matrix (nine selected columns) and target vector.
X = df[[
    'Product_Cnt', 'Inactive_Months', 'Contacts_Cnt',
    'Revolv_Bal', 'Avg_Util_Ratio',
    'Trans_Amt', 'Trans_Cnt',
    'Amt_Chng_Q4_Q1', 'Cnt_Chng_Q4_Q1',
]]
y = df['Exited']

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
# NOTE(review): the split is not stratified on y -- confirm the class balance of
# 'Exited' makes that acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=13,
)

# liblinear is used so that both the 'l1' and 'l2' penalties in the grid are available.
# NOTE(review): the features are fed in unscaled; regularised logistic regression is
# scale-sensitive -- verify whether scaling was intentionally skipped.
lr = LogisticRegression(random_state=13, solver='liblinear')

# Candidate hyperparameter values to search over.
params = dict(
    C=[0.1, 1, 3, 5, 7, 9, 10],   # candidate values for C
    penalty=['l1', 'l2'],          # L1 or L2 regularisation
)

# Tune with a 5-fold cross-validated grid search on the training split
# (fit() returns the fitted search object itself).
gridsearch = GridSearchCV(lr, params, cv=5).fit(X_train, y_train)

# Report the best hyperparameters and their mean cross-validated score
# (the estimator's default scorer, since no `scoring` is passed).
print("Best Hyperparameters: ", gridsearch.best_params_)
print("Best Accuracy: ", gridsearch.best_score_)

Best Hyperparameters:  {'C': 3, 'penalty': 'l1'}
Best Accuracy:  0.8996418153707892
