In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import classification_report, roc_auc_score

# Load the preprocessed customer-churn table; the first CSV column is used as the index.
# DataFrame display is capped at 10 rows to keep cell output short.
pd.set_option("display.max_rows", 10)
customer_churn_reviews = pd.read_csv(
    "../../data/customer_churn_preprocessed.csv",
    index_col=0,
)

# Separate the target column ('Churn') from the remaining feature columns.
y = customer_churn_reviews['Churn']
X = customer_churn_reviews.drop(columns='Churn')

# Hold out 20% of the rows for testing. The split is stratified on y and uses a
# fixed seed so that it is repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)



In [2]:
# Logistic regression: baseline for the binary classification task.

# 1. Fit on the training split, then predict labels for the held-out test split.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

# Compute the metrics first, then print them: overall accuracy followed by the
# per-class precision / recall / F1 report.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_report = classification_report(y_test, y_pred_lr)

print("Logistic Regression")
print("Accuracy:", lr_accuracy)
print(lr_report)

Logistic Regression
Accuracy: 0.8055358410220014
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [3]:
# Random forest: chosen because it captures non-linear relationships well.
# 100 trees with a fixed seed so the fitted model is repeatable.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Compute the metrics first, then print them in the same layout as the
# logistic-regression cell.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)

print("\nRandom Forest")
print("Accuracy:", rf_accuracy)
print(rf_report)


Random Forest
Accuracy: 0.7849538679914834
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.62      0.51      0.56       374

    accuracy                           0.78      1409
   macro avg       0.72      0.70      0.71      1409
weighted avg       0.77      0.78      0.78      1409



In [4]:
# Looking at recall for class 1 in both models, only about half of the churned
# customers are being caught.
# The class imbalance will be handled to improve this.
# Confusion matrix and ROC-AUC are added here for a closer look.
#
# Candidate remedies:
#   - Oversampling (SMOTE): synthetically increase the churned-customer samples
#   - Undersampling: drop part of the non-churned customers
#   - Class weights: make the model weight churned customers more heavily


def _print_evaluation(header, model, predicted):
    """Print the detailed test-set evaluation for one fitted classifier.

    Prints `header`, the 3-digit classification report, the confusion matrix,
    and the ROC-AUC, all computed against the notebook-level `X_test` / `y_test`.

    Args:
        header: Title line printed before the metrics.
        model: Fitted classifier exposing `predict_proba`.
        predicted: Labels the model already predicted for `X_test`.

    Returns:
        The second column of `model.predict_proba(X_test)`, i.e. the scores
        passed to `roc_auc_score`.
    """
    print(header)
    print(classification_report(y_test, predicted, digits=3))
    print("Confusion matrix:\n", confusion_matrix(y_test, predicted))
    positive_scores = model.predict_proba(X_test)[:, 1]
    print("ROC-AUC:", roc_auc_score(y_test, positive_scores))
    return positive_scores


# ── Logistic Regression ─────────────────────────────────────────
lr_proba = _print_evaluation("\n=== Logistic Regression ===", log_reg, y_pred_lr)

# ── Random Forest ──────────────────────────────────────────────
rf_proba = _print_evaluation("\n=== Random Forest ===", rf, y_pred_rf)


=== Logistic Regression ===
              precision    recall  f1-score   support

           0      0.849     0.895     0.871      1035
           1      0.657     0.559     0.604       374

    accuracy                          0.806      1409
   macro avg      0.753     0.727     0.738      1409
weighted avg      0.798     0.806     0.800      1409

Confusion matrix:
 [[926 109]
 [165 209]]
ROC-AUC: 0.8421116536206051

=== Random Forest ===
              precision    recall  f1-score   support

           0      0.832     0.886     0.858      1035
           1      0.616     0.505     0.555       374

    accuracy                          0.785      1409
   macro avg      0.724     0.696     0.707      1409
weighted avg      0.775     0.785     0.778      1409

Confusion matrix:
 [[917 118]
 [185 189]]
ROC-AUC: 0.8186016171949676


In [5]:
# Design goal: reduce the false-negative share of the confusion matrix
# by adjusting the class weights.
#
# NOTE(review): this cell rebinds `log_reg` and `y_pred_lr`, which the baseline
# logistic-regression cell already assigned. Re-running an earlier cell that
# reads those names after this one would silently use the balanced model.

# Logistic regression with class_weight="balanced".
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000).fit(
    X_train, y_train
)
y_pred_lr = log_reg.predict(X_test)
# Second predict_proba column: the scores fed to ROC-AUC below.
lg_proba = log_reg.predict_proba(X_test)[:, 1]

balanced_report = classification_report(y_test, y_pred_lr, digits=3)
balanced_cm = confusion_matrix(y_test, y_pred_lr)
balanced_auc = roc_auc_score(y_test, lg_proba)

print("\n=== Logistic Regression (class_weight=balanced) ===")
print(balanced_report)
print("Confusion matrix:\n", balanced_cm)
print("ROC-AUC:", balanced_auc)


=== Logistic Regression (class_weight=balanced) ===
              precision    recall  f1-score   support

           0      0.902     0.721     0.801      1035
           1      0.503     0.783     0.613       374

    accuracy                          0.737      1409
   macro avg      0.703     0.752     0.707      1409
weighted avg      0.796     0.737     0.751      1409

Confusion matrix:
 [[746 289]
 [ 81 293]]
ROC-AUC: 0.8416543956185901
