# Credit Card Customer 데이터 분류분석
(https://www.kaggle.com/sakshigoyal7/credit-card-customers)

# 1. 데이터 불러오기 및 전처리

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the credit-card customer table from the local CSV file.
data = pd.read_csv(filepath_or_buffer="bankchurners.csv")
data  # last expression of the cell: the notebook renders the loaded frame

In [None]:
# Steps 1-2: keep only the columns at positions 1..20 and work on a copy.
# Per the original notes this leaves out 'CLIENTNUM' (position 0) and the two
# trailing "Naive_Bayes..." columns.
d = data.loc[:, data.columns[1:21]].copy()
d

In [None]:
# Step 3: recode "Attrition_Flag" and "Gender" as 0/1 indicators.
# Attrition_Flag: 1 for "Attrited Customer", 0 for anything else.
d['Attrition_Flag'] = d['Attrition_Flag'].eq("Attrited Customer").astype(int)
print(d['Attrition_Flag'].value_counts())

# Gender: 1 for "M", 0 for anything else.
d['Gender'] = d['Gender'].eq("M").astype(int)
print(d['Gender'].value_counts())

In [None]:
# Step 4: encode "Income_Category" as an ordinal integer.
# The codes follow the income order, so the '$60K - $80K' band gets a lower
# code than '$80K - $120K'; 'Unknown' is kept as 0.
# Series.map returns the integer codes directly; a label that is not listed
# in the mapping would become NaN.
d['Income_Category'] = d['Income_Category'].map({
    'Unknown': 0,
    'Less than $40K': 1,
    '$40K - $60K': 2,
    '$60K - $80K': 3,
    '$80K - $120K': 4,
    '$120K +': 5,
})
print(d['Income_Category'].value_counts())

In [None]:
# Step 5: encode "Education_Level" as an ordinal integer.
# The codes follow the attainment order from 'Uneducated' up to 'Doctorate';
# 'Unknown' is kept as 0.
# Series.map returns the integer codes directly; a label that is not listed
# in the mapping would become NaN.
d['Education_Level'] = d['Education_Level'].map({
    'Unknown': 0,
    'Uneducated': 1,
    'High School': 2,
    'College': 3,
    'Graduate': 4,
    'Post-Graduate': 5,
    'Doctorate': 6,
})
print(d['Education_Level'].value_counts())

In [None]:
# Step 6: "Marital_Status" becomes 1 for "Married" and 0 for every other value.
d['Marital_Status'] = d['Marital_Status'].eq("Married").astype(int)
print(d['Marital_Status'].value_counts())

In [None]:
# Step 7: "Card_Category" becomes 0 for "Blue" and 1 for every other value.
d['Card_Category'] = d['Card_Category'].ne("Blue").astype(int)
print(d['Card_Category'].value_counts())

In [None]:
# Step 8: annotated correlation heatmap over every column of d.
plt.figure(figsize=(15, 10))
sns.heatmap(d.corr(), annot=True, cmap='coolwarm', ax=plt.gca())
plt.show()

In [None]:
# Step 9: derive two new variables.
# Rel_Length: Months_on_book as a percentage of Customer_Age * 12.
d['Rel_Length'] = d['Months_on_book'].div(d['Customer_Age'].mul(12)).mul(100)
# Avg_Trans_Amt: Total_Trans_Amt divided by Total_Trans_Ct.
d['Avg_Trans_Amt'] = d['Total_Trans_Amt'].div(d['Total_Trans_Ct'])

In [None]:
# Steps 10-12: remove 'Months_on_book', 'Total_Trans_Amt' and
# 'Avg_Open_To_Buy' (the latter noted as highly correlated with Credit_Limit).
d = d.drop(columns=["Months_on_book", "Total_Trans_Amt", "Avg_Open_To_Buy"])

In [None]:
# Step 13: annotated correlation heatmap of d after the column changes above.
plt.figure(figsize=(15, 10))
sns.heatmap(d.corr(), annot=True, cmap='coolwarm', ax=plt.gca())
plt.show()

## 모델링
	Attrition_Flag 를 종속변수로 하여 분류분석 실시
1.	로지스틱회귀분석 (statsmodels Logit)
2.	DecisionTreeClassifier
3.	MLPClassifier
4.	SVC (linear)
5.	SVC (poly)
6.	BaggingClassifier
7.	RandomForestClassifier
8.	AdaBoostClassifier
9.	GradientBoostingClassifier


# 2. 변수선택 및 데이터 분할
### Attrition_Flag(이탈여부) 를 종속변수로 하여 분류분석을 수행한다.

In [None]:
# Target is Attrition_Flag; every remaining column of d is a predictor.
y = d["Attrition_Flag"]
X = d.drop(columns=["Attrition_Flag"])

### 2.1 Variable selection을 수행한다. 변수선택은 랜덤포레스트의 변수중요도를 이용한다. 변수중요도가 거의 없는 변수들을 제거하고 나서 진행한다.

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [None]:
# Seeded 100-tree random forest fitted on all predictors; its
# feature_importances_ are read by the following cells.
model = RandomForestClassifier(random_state=0, n_estimators=100).fit(X, y)
model

In [None]:
# Horizontal bars of the forest's feature importances, one per predictor,
# with a red vertical line at 0.02.
sns.barplot(y=X.columns, x=model.feature_importances_)
plt.axvline(x=0.02, color='red')
plt.show()

In [None]:
# Keep only the variables whose random-forest importance reaches the 0.02
# threshold.  The forest is seeded with random_state=0 so that the selected
# variable set is the same on every run.
# NOTE(review): the selector is fitted on the full X, y (before the
# train/test split) - confirm that this is intended.
select = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=0),
    threshold=0.02,
)
select.fit(X, y)

In [None]:
# Names of the columns flagged by the selector's boolean support mask.
support_mask = select.get_support()
xname = X.columns[support_mask]
xname

In [None]:
# Apply the variable selection: keep only the selected columns (as a copy).
X = X.loc[:, xname].copy()

### 2.2 데이터 분할 (층화추출)

In [None]:
# Class counts of the target, then the same counts divided by the sample size.
print("Number of the event: ", y.value_counts(), sep="\n")
print("Ratio of the event: ", y.value_counts().div(len(y)), sep="\n")

In [None]:
from sklearn.model_selection import train_test_split

# 50/50 split stratified on y with a fixed seed, followed by the class ratio
# of the training half.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.5,
    random_state=0,
)
print("Ratio of the event: ", y_train.value_counts().div(len(y_train)), sep="\n")

# 3. 분류분석 9개 모형 수행

### 3-1. 로지스틱 회귀분석 

#### Full model

In [None]:
import statsmodels.api as sm

# Full logit model: every selected predictor plus an added constant column.
logit_full = sm.Logit(endog=y_train, exog=sm.add_constant(X_train))
model1 = logit_full.fit()
model1.summary()

#### Reduced model 1
* 가장 유효하지 않았던 Rel_Length 변수 제거 

In [None]:
# Reduced model 1: the full design without Rel_Length, plus a constant.
logit_r1 = sm.Logit(
    endog=y_train,
    exog=sm.add_constant(X_train.drop(columns=["Rel_Length"])),
)
model_r1 = logit_r1.fit()
model_r1.summary()

#### Reduced model 2
* 가장 유효하지 않았던 Customer_Age 변수 제거

In [None]:
# Reduced model 2: additionally leaves out Customer_Age.
logit_r2 = sm.Logit(
    endog=y_train,
    exog=sm.add_constant(X_train.drop(columns=["Customer_Age", "Rel_Length"])),
)
model_r2 = logit_r2.fit()
model_r2.summary()

#### Reduced model 3
* 가장 유효하지 않았던 Avg_Utilization_Ratio 변수 제거

In [None]:
# Reduced model 3: additionally leaves out Avg_Utilization_Ratio.
logit_r3 = sm.Logit(
    endog=y_train,
    exog=sm.add_constant(
        X_train.drop(columns=["Avg_Utilization_Ratio", "Customer_Age", "Rel_Length"])
    ),
)
model_r3 = logit_r3.fit()
model_r3.summary()

#### Reduced model 3 해석
* Total_Relationship_Count(보유상품수), Credit_Limit(사용한도), Total_Revolving_Bal(리볼빙 잔액), Total_Amt_Chng_Q4_Q1(거래금액변화), Total_Trans_Ct(총거래횟수), Total_Ct_Chng_Q4_Q1(거래횟수변화) 가 커질수록 계좌해지 가능성 작아짐
* Months_Inactive_12_mon(지난 12 개월 동안 비활성 상태 개월 수), Contacts_Count_12_mon(12개월간 고객센터연락횟수), Avg_Trans_Amt(평균거래금액) 커질수록 계좌해지 가능성 커짐

In [None]:
# Train/test predictor frames without the three variables that reduced
# model 3 leaves out.
X_train_reduced = X_train.drop(columns=["Avg_Utilization_Ratio", "Customer_Age", "Rel_Length"])
X_test_reduced = X_test.drop(columns=["Avg_Utilization_Ratio", "Customer_Age", "Rel_Length"])

#### 예측용 모델링

In [None]:
from sklearn.linear_model import LogisticRegression

# Prediction model 1: scikit-learn logistic regression on the reduced
# predictors.  Note that this rebinds the name model1.
model1 = LogisticRegression(max_iter=1000, random_state=0).fit(X_train_reduced, y_train)
model1

### 3-2. DecisionTreeClassifier

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

#### 해석용 모델

In [None]:
# Interpretation tree: splits are only made when they reduce impurity by at
# least 0.01.
tree1 = DecisionTreeClassifier(min_impurity_decrease=0.01, random_state=0)
tree1.fit(X_train, y_train)

# Class labels in the order of the encoded target (0, then 1).
yname = ['Existing Customer', 'Attrited Customer']

plt.figure(figsize=(15, 10))
# Feature names are taken from the frame the tree was fitted on and passed
# as a plain list, which plot_tree accepts in every scikit-learn release.
plot_tree(
    tree1,
    feature_names=list(X_train.columns),
    class_names=yname,
    filled=True,
    fontsize=12,
)
plt.show()

* Total_Trans_Ct <= 54.5 이면, 왼쪽 노드로 이동,
   * 이 경우 Total_Revolving_Bal <= 573 이면, Attrited Customer 이고, 
   * Total_Revolving_Bal > 573 이면, Total_Relationship_Count <=2 이면 Attrited Customer로 분류
*  Total_Trans_Ct >  54.5 이면, 오른쪽 노드로 이동(gini =0.087), Existing Customer 로 분류 


#### Hyperparameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid search over the pruning strength ccp_alpha of a seeded decision tree,
# scored by ROC AUC with cv=5.
np.random.seed(0)
grid = dict(ccp_alpha=np.arange(0.000, 0.005, 0.001))
g_cv = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=0),
    param_grid=grid,
    scoring='roc_auc',
    cv=5,
).fit(X_train, y_train)
g_cv

In [None]:
# Best parameter setting and its cross-validated score from the last search.
print(f"selected: {g_cv.best_params_}")
print(f"score   : {g_cv.best_score_}")

#### 예측용 트리

In [None]:
# Prediction model 2: seeded decision tree pruned with ccp_alpha=0.002.
model2 = DecisionTreeClassifier(random_state=0, ccp_alpha=0.002).fit(X_train, y_train)
model2

### 3-3. 신경망 모델 

#### 정규화

In [None]:
from sklearn.preprocessing import MinMaxScaler

# The scaler is fitted on the training predictors only; the same fitted
# transform is then applied to both the training and the test predictors.
scaler = MinMaxScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

In [None]:
from sklearn.neural_network import MLPClassifier

####  Hyperparameter tuning

In [None]:
# Grid search over the hidden-layer size (8 to 12) of a seeded MLP on the
# scaled training data, scored by ROC AUC with cv=3.
np.random.seed(0)
grid = dict(hidden_layer_sizes=np.arange(8, 13))
g_cv = GridSearchCV(
    estimator=MLPClassifier(max_iter=1000, random_state=0),
    param_grid=grid,
    scoring='roc_auc',
    cv=3,
).fit(X_train_scaled, y_train)
g_cv

In [None]:
# Best parameter setting and its cross-validated score from the last search.
print(f"selected: {g_cv.best_params_}")
print(f"score   : {g_cv.best_score_}")

#### 예측용 모델링

In [None]:
# Prediction model 3: MLP with one hidden layer of 10 units, trained on the
# scaled training data.  hidden_layer_sizes is written as a real one-element
# tuple; "(10)" is just the integer 10.
model3 = MLPClassifier(hidden_layer_sizes=(10,), random_state=0, max_iter=1000)
model3.fit(X_train_scaled, y_train)

### 3-4. SVC (linear)

In [None]:
from sklearn.svm import SVC

#### Hyperparameter tuning

In [None]:
# Grid search over C (11 up to, but not including, 13 in steps of 0.1) for a
# linear-kernel SVC on the scaled training data, scored by ROC AUC with cv=3.
np.random.seed(0)
grid = dict(C=np.arange(11, 13, 0.1))
g_cv = GridSearchCV(
    estimator=SVC(kernel='linear', max_iter=100000, random_state=0),
    param_grid=grid,
    scoring='roc_auc',
    cv=3,
).fit(X_train_scaled, y_train)
g_cv

In [None]:
# Best parameter setting and its cross-validated score from the last search.
print(f"selected: {g_cv.best_params_}")
print(f"score   : {g_cv.best_score_}")

#### 예측용 모델링

In [None]:
# Prediction model 4: linear-kernel SVC with C=11.9 on the scaled training data.
model4 = SVC(C=11.9, kernel='linear', random_state=0).fit(X_train_scaled, y_train)
model4

### 3-5. SVC (polynomial)

#### Hyperparameter tuning

In [None]:
# Grid search over C (3 up to, but not including, 5 in steps of 0.1) for a
# polynomial-kernel SVC on the scaled training data, scored by ROC AUC with cv=3.
grid = dict(C=np.arange(3, 5, 0.1))
g_cv = GridSearchCV(
    estimator=SVC(kernel='poly', max_iter=500000, random_state=0),
    param_grid=grid,
    scoring='roc_auc',
    cv=3,
).fit(X_train_scaled, y_train)
g_cv

In [None]:
# Best parameter setting and its cross-validated score from the last search.
print(f"selected: {g_cv.best_params_}")
print(f"score   : {g_cv.best_score_}")

#### 예측용 모델링

In [None]:
# Prediction model 5: polynomial-kernel SVC with C=3.7 on the scaled training data.
model5 = SVC(C=3.7, kernel='poly', random_state=0).fit(X_train_scaled, y_train)
model5

### 3-6. BaggingClassifier
*  base_estimator : DecisionTreeClassifier

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Prediction model 6: seeded bagging ensemble of 100 estimators.
model6 = BaggingClassifier(random_state=0, n_estimators=100).fit(X_train, y_train)
model6

### 3-7.	RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Grid search over max_features for a seeded 100-tree random forest, scored
# by ROC AUC with cv=3.
np.random.seed(0)
grid = dict(max_features=["sqrt", 4, 5, 6])
g_cv = GridSearchCV(
    estimator=RandomForestClassifier(random_state=0, n_estimators=100),
    param_grid=grid,
    scoring='roc_auc',
    cv=3,
).fit(X_train, y_train)
g_cv

In [None]:
# Best parameter setting and its cross-validated score from the last search.
print(f"selected: {g_cv.best_params_}")
print(f"score   : {g_cv.best_score_}")

In [None]:
# Prediction model 7: seeded 100-tree random forest with max_features="sqrt".
model7 = RandomForestClassifier(
    n_estimators=100,
    max_features="sqrt",
    random_state=0,
).fit(X_train, y_train)
model7

### 3-8. AdaBoostClassifier
* default base estimator 는 max_depth=1 로 초기화된 DecisionTreeClassifier 이며, 여기서는 max_depth=3 인 DecisionTreeClassifier 를 base estimator 로 사용한다.

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# Prediction model 8: AdaBoost with 100 rounds over depth-3 decision trees.
# The weak learner is passed positionally because the keyword for it was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.2; the
# positional form works both before and after that rename.
model8 = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=3),
    n_estimators=100,
    random_state=0,
)
model8.fit(X_train, y_train)

### 3-9. GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Grid search over max_depth (3 to 7) for a seeded 100-stage gradient
# boosting classifier, scored by ROC AUC with cv=3.
np.random.seed(0)
grid = dict(max_depth=[3, 4, 5, 6, 7])
g_cv = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=0, n_estimators=100),
    param_grid=grid,
    scoring='roc_auc',
    cv=3,
).fit(X_train, y_train)
g_cv

In [None]:
# Best parameter setting and its cross-validated score from the last search.
print(f"selected: {g_cv.best_params_}")
print(f"score   : {g_cv.best_score_}")

In [None]:
# Prediction model 9: seeded 100-stage gradient boosting with max_depth=5.
model9 = GradientBoostingClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=0,
).fit(X_train, y_train)
model9

### 평가

In [None]:
from sklearn.metrics import accuracy_score, f1_score

#### Accuracy

In [None]:
# Test-set accuracy of the nine fitted models.  Each entry pairs a display
# label with the model and the test matrix that matches its training input
# (reduced, raw or scaled predictors).
accuracy_table = pd.DataFrame(
    [
        [label, accuracy_score(y_test, clf.predict(features))]
        for label, clf, features in (
            ('LogisticRegression', model1, X_test_reduced),
            ('DecisionTreeClassifier', model2, X_test),
            ('MLPClassifier', model3, X_test_scaled),
            ('SVC (linear)', model4, X_test_scaled),
            ('SVC (poly)', model5, X_test_scaled),
            ('BaggingClassifier', model6, X_test),
            ('RandomForestClassifier', model7, X_test),
            ('AdaBoostClassifier', model8, X_test),
            ('GradientBoostingClassifier', model9, X_test),
        )
    ],
    columns=['Model', 'Accuracy'],
)
accuracy_table

In [None]:
# Line plot of test accuracy per model.
plt.subplots(figsize=(8, 6))
g = sns.lineplot(data=accuracy_table, x="Model", y="Accuracy")
# Rotate the tick labels that the plot already carries instead of
# overwriting them with set_xticklabels: replacing labels by position can
# attach a model name to the wrong point if the tick order differs from the
# table order, and matplotlib warns when labels are set without fixed ticks.
plt.xticks(rotation=30, horizontalalignment='right')
plt.show()

#### F1 Score

In [None]:
# Test-set F1 score of the nine fitted models.  Each entry pairs a display
# label with the model and the test matrix that matches its training input
# (reduced, raw or scaled predictors).
f1_table = pd.DataFrame(
    [
        [label, f1_score(y_test, clf.predict(features))]
        for label, clf, features in (
            ('LogisticRegression', model1, X_test_reduced),
            ('DecisionTreeClassifier', model2, X_test),
            ('MLPClassifier', model3, X_test_scaled),
            ('SVC (linear)', model4, X_test_scaled),
            ('SVC (poly)', model5, X_test_scaled),
            ('BaggingClassifier', model6, X_test),
            ('RandomForestClassifier', model7, X_test),
            ('AdaBoostClassifier', model8, X_test),
            ('GradientBoostingClassifier', model9, X_test),
        )
    ],
    columns=['Model', 'F1'],
)
f1_table

#### ROC Curve

In [None]:
# Test-set ROC curves of all nine fitted models on one shared axes.
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement.
from sklearn.metrics import RocCurveDisplay

# The first curve creates the figure and axes; the others are drawn onto it.
m1_roc = RocCurveDisplay.from_estimator(
    model1, X_test_reduced, y_test, name='LogisticRegression'
)
# Explicit names keep the two SVC curves distinguishable in the legend.
for roc_label, roc_model, roc_features in (
    ('DecisionTreeClassifier', model2, X_test),
    ('MLPClassifier', model3, X_test_scaled),
    ('SVC (linear)', model4, X_test_scaled),
    ('SVC (poly)', model5, X_test_scaled),
    ('BaggingClassifier', model6, X_test),
    ('RandomForestClassifier', model7, X_test),
    ('AdaBoostClassifier', model8, X_test),
    ('GradientBoostingClassifier', model9, X_test),
):
    RocCurveDisplay.from_estimator(
        roc_model, roc_features, y_test, name=roc_label, ax=m1_roc.ax_
    )
m1_roc.ax_.set_title("ROC curve comparison")
plt.show()