In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import f1_score, matthews_corrcoef, recall_score, make_scorer

#pip install scikit-learn==1.2.2 에서 작동
from imblearn.pipeline import Pipeline as IMBPipeline
from imblearn.over_sampling import SMOTE, ADASYN
from xgboost import XGBClassifier

## Data Load

In [2]:
df = pd.read_csv('./data/BankChurners.csv')

## 편의를 위해 target 열 이탈고객의 경우 1, 유지 고객은 0으로 변경

In [3]:
df['Attrition_Flag'] = df['Attrition_Flag'].map({'Existing Customer': 0, 'Attrited Customer': 1})

## 필요없는 열 제거

In [4]:
df.drop(['CLIENTNUM','Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2'], axis=1, inplace = True)

# Data Augmentation

## 1. SMOTE, ADASYN 이용

In [5]:
# 데이터 분리
X = df.drop('Attrition_Flag', axis=1)
y = df['Attrition_Flag']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# 수치형 및 범주형 컬럼 분리
numeric_features = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns

# 데이터 전처리 파이프라인
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# 매튜스 상관계수와 G-평균 계산을 위한 사용자 정의 함수
def gmean(y_true, y_pred):
    sensitivity = recall_score(y_true, y_pred, pos_label=1)
    specificity = recall_score(y_true, y_pred, pos_label=0)
    return np.sqrt(sensitivity * specificity)

# Scorer 객체 생성
scorers = {
    'accuracy': 'accuracy',
    'f1': 'f1',
    'f1_macro': make_scorer(f1_score, average='macro'),
    'f1_weighted': make_scorer(f1_score, average='weighted'),
    'mcc': make_scorer(matthews_corrcoef),
    'gmean': make_scorer(gmean)
}

# 모델 정의
models = {
    'Logistic Regression': LogisticRegression(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Gradient Boosting Machine': GradientBoostingClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
}

# 증강기법 및 모델 학습 파이프라인
results = []
for augmenter in [SMOTE(random_state=42), ADASYN(random_state=42), None]:
    for name, model in models.items():
        if augmenter:
            pipeline = IMBPipeline(steps=[('preprocessor', preprocessor),
                                          ('augmenter', augmenter),
                                          ('classifier', model)])
        else:
            pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                       ('classifier', model)])
        
        # 교차 검증
        cv_results = cross_validate(pipeline, X_train, y_train, cv=5, scoring=scorers)
        results.append({
            'Method': augmenter.__class__.__name__ if augmenter else 'Original',
            'Model': name,
            'Accuracy': np.mean(cv_results['test_accuracy']),
            'F1 Score': np.mean(cv_results['test_f1']),
            'F1 Macro': np.mean(cv_results['test_f1_macro']),
            'F1 Weighted': np.mean(cv_results['test_f1_weighted']),
            'MCC': np.mean(cv_results['test_mcc']),
            'G-Mean': np.mean(cv_results['test_gmean'])
        })

# 결과 데이터프레임 생성
results_df = pd.DataFrame(results)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [6]:
results_df

Unnamed: 0,Method,Model,Accuracy,F1 Score,F1 Macro,F1 Weighted,MCC,G-Mean
0,SMOTE,Logistic Regression,0.85186,0.644489,0.775458,0.864633,0.582532,0.847272
1,SMOTE,Random Forest,0.954007,0.853259,0.912994,0.953665,0.826335,0.904412
2,SMOTE,Gradient Boosting Machine,0.955559,0.865417,0.9194,0.956157,0.83968,0.930546
3,SMOTE,XGBoost,0.971642,0.910154,0.946658,0.971514,0.893535,0.941651
4,ADASYN,Logistic Regression,0.827028,0.617308,0.752778,0.845018,0.556988,0.845059
5,ADASYN,Random Forest,0.955982,0.859854,0.916872,0.955695,0.833997,0.909105
6,ADASYN,Gradient Boosting Machine,0.954007,0.863032,0.917698,0.954917,0.837012,0.934971
7,ADASYN,XGBoost,0.97136,0.908647,0.945832,0.97115,0.891938,0.938425
8,Original,Logistic Regression,0.905333,0.66537,0.805117,0.900268,0.618566,0.754361
9,Original,Random Forest,0.954148,0.843166,0.908157,0.95241,0.821296,0.873985


> 굳이 데이터 증강할 필요가 없다

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def preprocess_and_split(df, target, test_size=0.3, random_state=42):
    # 데이터를 훈련 데이터와 테스트 데이터로 분할
    train_df, test_df = train_test_split(df, test_size=test_size, random_state=random_state)
    
    # 타겟 변수와 특성 변수 분리
    X_train = train_df.drop(columns=[target])
    y_train = train_df[target]
    X_test = test_df.drop(columns=[target])
    y_test = test_df[target]

    # 범주형 변수 식별 (훈련 데이터 기준)
    categorical_columns = X_train.select_dtypes(include=['object', 'category']).columns
    
    # 수치형 변수 식별 (훈련 데이터 기준)
    numeric_columns = X_train.select_dtypes(include=[np.number]).columns
    
    # 훈련 데이터에서 수치형 변수를 표준화
    scaler = StandardScaler()
    X_train[numeric_columns] = scaler.fit_transform(X_train[numeric_columns])
    X_test[numeric_columns] = scaler.transform(X_test[numeric_columns])  # 테스트 데이터는 오직 transform만 적용

    # 범주형 변수를 더미 변수로 변환
    X_train = pd.get_dummies(X_train, columns=categorical_columns, drop_first=True)
    X_test = pd.get_dummies(X_test, columns=categorical_columns, drop_first=True)

    # 더미화 후 생기는 열의 불일치 문제 해결
    X_train, X_test = X_train.align(X_test, join='inner', axis=1)
    
    return X_train, X_test, y_train, y_test

In [8]:
X_train, X_test, y_train, y_test = preprocess_and_split(df, 'Attrition_Flag')

In [9]:
sm = SMOTE(random_state=42)
x_sm, y_sm = sm.fit_resample(X_train, y_train)

## GBM

In [10]:
model_sm = GradientBoostingClassifier(random_state=42)
model_sm.fit(x_sm, y_sm)
y_pred = model_sm.predict(X_test)

print(classification_report(y_test, y_pred))

NameError: name 'classification_report' is not defined

In [None]:
model_raw = GradientBoostingClassifier(random_state=42)
model_raw.fit(X_train, y_train)

y_pred = model_raw.predict(X_test)

print(classification_report(y_test, y_pred))

## XGB

In [None]:
from xgboost import XGBClassifier

In [None]:
model_sm = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')#colsample_bytree=0.9, learning_rate=0.2, max_depth=7)
model_sm.fit(x_sm, y_sm)

y_pred = model_sm.predict(X_test)

print(classification_report(y_test, y_pred))

In [None]:
model_raw = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')#colsample_bytree=0.9, learning_rate=0.2, max_depth=7)
model_raw.fit(X_train, y_train)

y_pred = model_raw.predict(X_test)

print(classification_report(y_test, y_pred))