In [21]:
import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import train_test_split

# Load the preprocessed churn dataset; the first CSV column becomes the index.
pd.set_option("display.max_rows", 10)  # keep DataFrame previews short
customer_churn_reviews = pd.read_csv(
    "../../data/customer_churn_preprocessed.csv",
    index_col=0,
)

# Separate the target column ('Churn') from the feature columns.
y = customer_churn_reviews['Churn']
X = customer_churn_reviews.drop(columns='Churn')

# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both splits, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)



In [22]:
# Baseline for the binary churn task: plain logistic regression.
# max_iter=1000 caps the number of solver iterations.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

# Hold-out accuracy followed by per-class precision / recall / F1.
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression")
print("Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))

Logistic Regression
Accuracy: 0.8055358410220014
              precision    recall  f1-score   support

           0       0.85      0.89      0.87      1035
           1       0.66      0.56      0.60       374

    accuracy                           0.81      1409
   macro avg       0.75      0.73      0.74      1409
weighted avg       0.80      0.81      0.80      1409



In [23]:
# Random forest: chosen by the author because it can capture non-linear relationships.
# 100 trees, seeded so the fitted forest is reproducible.
rf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Same hold-out metrics as the logistic regression cell, for comparison.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print("\nRandom Forest")
print("Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred_rf))


Random Forest
Accuracy: 0.7849538679914834
              precision    recall  f1-score   support

           0       0.83      0.89      0.86      1035
           1       0.62      0.51      0.56       374

    accuracy                           0.78      1409
   macro avg       0.72      0.70      0.71      1409
weighted avg       0.77      0.78      0.78      1409



In [24]:
# Recall for class 1 shows that both baseline models catch only about half of
# the churned customers. The plan is to address the class imbalance; to see
# the problem in more detail, the confusion matrix and ROC-AUC are added here.
#
# Candidate remedies:
#   - Oversampling (SMOTE): synthetically increase the churned-customer samples
#   - Undersampling: drop part of the non-churned customers
#   - Class weights: make the model weigh churned customers more heavily

# ── Logistic Regression ─────────────────────────────────────────
print("\n=== Logistic Regression ===")
print(classification_report(y_test, y_pred_lr, digits=3))
lr_confusion = confusion_matrix(y_test, y_pred_lr)
print("Confusion matrix:\n", lr_confusion)
# Column 1 of predict_proba is the score used as the positive-class input to ROC-AUC.
lr_proba = log_reg.predict_proba(X_test)[:, 1]
lr_auc = roc_auc_score(y_test, lr_proba)
print("ROC-AUC:", lr_auc)

# ── Random Forest ──────────────────────────────────────────────
print("\n=== Random Forest ===")
print(classification_report(y_test, y_pred_rf, digits=3))
rf_confusion = confusion_matrix(y_test, y_pred_rf)
print("Confusion matrix:\n", rf_confusion)
rf_proba = rf.predict_proba(X_test)[:, 1]
rf_auc = roc_auc_score(y_test, rf_proba)
print("ROC-AUC:", rf_auc)


=== Logistic Regression ===
              precision    recall  f1-score   support

           0      0.849     0.895     0.871      1035
           1      0.657     0.559     0.604       374

    accuracy                          0.806      1409
   macro avg      0.753     0.727     0.738      1409
weighted avg      0.798     0.806     0.800      1409

Confusion matrix:
 [[926 109]
 [165 209]]
ROC-AUC: 0.8421116536206051

=== Random Forest ===
              precision    recall  f1-score   support

           0      0.832     0.886     0.858      1035
           1      0.616     0.505     0.555       374

    accuracy                          0.785      1409
   macro avg      0.724     0.696     0.707      1409
weighted avg      0.775     0.785     0.778      1409

Confusion matrix:
 [[917 118]
 [185 189]]
ROC-AUC: 0.8186016171949676


In [25]:
# Design goal: reduce the false negatives (missed churners) in the confusion matrix
# by adjusting class weights.
#
# Logistic regression refit with class_weight="balanced".
# NOTE: this rebinds log_reg and y_pred_lr, replacing the unweighted baseline
# objects created in the earlier cell.
log_reg = LogisticRegression(class_weight="balanced", max_iter=1000).fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

print("\n=== Logistic Regression (class_weight=balanced) ===")
print(classification_report(y_test, y_pred_lr, digits=3))
lg_confusion = confusion_matrix(y_test, y_pred_lr)
print("Confusion matrix:\n", lg_confusion)
lg_proba = log_reg.predict_proba(X_test)[:, 1]
lg_auc = roc_auc_score(y_test, lg_proba)
print("ROC-AUC:", lg_auc)

# Result: false negatives fall from 165 (unweighted logistic regression) to 81,
# at the cost of more false positives (109 -> 289).


=== Logistic Regression (class_weight=balanced) ===
              precision    recall  f1-score   support

           0      0.902     0.721     0.801      1035
           1      0.503     0.783     0.613       374

    accuracy                          0.737      1409
   macro avg      0.703     0.752     0.707      1409
weighted avg      0.796     0.737     0.751      1409

Confusion matrix:
 [[746 289]
 [ 81 293]]
ROC-AUC: 0.8416543956185901


In [26]:
# Random forest refit with class_weight="balanced".
# NOTE: this rebinds rf, y_pred_rf and rf_proba from the unweighted baseline.
rf = RandomForestClassifier(
    class_weight="balanced",
    random_state=42,
    n_estimators=100,
).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("\n=== Random Forest (class_weight=balanced) ===")
print(classification_report(y_test, y_pred_rf, digits=3))
rf_balanced_confusion = confusion_matrix(y_test, y_pred_rf)
print("Confusion matrix:\n", rf_balanced_confusion)
rf_proba = rf.predict_proba(X_test)[:, 1]
rf_balanced_auc = roc_auc_score(y_test, rf_proba)
print("ROC-AUC:", rf_balanced_auc)

# Result: false negatives did not decrease; they rose slightly (185 -> 191),
# so further adjustment is needed.
# Author's reasoning: a random forest predicts by aggregating many trees, and
# weighting class 1 leaves the 0.5 decision threshold unchanged, so the number
# of false negatives can still go up.


=== Random Forest (class_weight=balanced) ===
              precision    recall  f1-score   support

           0      0.827     0.885     0.855      1035
           1      0.606     0.489     0.541       374

    accuracy                          0.780      1409
   macro avg      0.717     0.687     0.698      1409
weighted avg      0.769     0.780     0.772      1409

Confusion matrix:
 [[916 119]
 [191 183]]
ROC-AUC: 0.8186481180087317


In [27]:
# Random forest improvement: lower the decision threshold.

# Positive-class scores from the class-weighted forest; reused by the next cell.
y_proba = rf.predict_proba(X_test)[:, 1]
# Predict churn (1) whenever the score reaches 0.4 instead of the default 0.5.
above_cutoff = y_proba >= 0.4
y_pred_custom = above_cutoff.astype(int)

print("\n=== Random Forest (adjust Threshold ) ===")
print(classification_report(y_test, y_pred_custom, digits=3))
custom_confusion = confusion_matrix(y_test, y_pred_custom)
print("Confusion matrix:\n", custom_confusion)
custom_auc = roc_auc_score(y_test, y_proba)
print("ROC-AUC:", custom_auc)

# False negatives drop meaningfully (191 -> 151).
# However, 0.4 is an arbitrary value; the next step derives the threshold
# from the precision-recall curve instead.


=== Random Forest (adjust Threshold ) ===
              precision    recall  f1-score   support

           0      0.851     0.834     0.842      1035
           1      0.565     0.596     0.580       374

    accuracy                          0.771      1409
   macro avg      0.708     0.715     0.711      1409
weighted avg      0.775     0.771     0.773      1409

Confusion matrix:
 [[863 172]
 [151 223]]
ROC-AUC: 0.8186481180087317


In [28]:
# Find the F1-optimal decision threshold from the precision-recall curve.
# NOTE(review): the threshold is selected on the same test split it is then
# evaluated on, so the metrics printed below are optimistic. Consider choosing
# it on a separate validation split.

prec, rec, thresholds = precision_recall_curve(y_test, y_proba)

# precision_recall_curve returns one more precision/recall value than
# thresholds: the final (precision=1, recall=0) point has no threshold.
# Drop it so every F1 score lines up with an entry in `thresholds`.
prec_at_thr = prec[:-1]
rec_at_thr = rec[:-1]

# F1 per threshold. Where precision + recall == 0 the plain formula gives
# 0/0 = NaN, and argmax would then return the NaN position; define F1 as 0
# there instead.
f1_denominator = prec_at_thr + rec_at_thr
f1_scores = np.divide(
    2 * prec_at_thr * rec_at_thr,
    f1_denominator,
    out=np.zeros_like(f1_denominator),
    where=f1_denominator > 0,
)
best_idx = f1_scores.argmax()
best_threshold_f1 = thresholds[best_idx]

# Apply the selected threshold and report the same metrics as the earlier cells.
print("\n=== Random Forest (Best Threshold ) ===")
print("Best Threshold : ", best_threshold_f1 , "\n" )
y_pred_f1 = (y_proba >= best_threshold_f1).astype(int)
print(classification_report(y_test, y_pred_f1, digits=3))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred_f1))
print("ROC-AUC:", roc_auc_score(y_test, y_proba))



=== Random Forest (Best Threshold ) ===
Best Threshold :  0.25 

              precision    recall  f1-score   support

           0      0.895     0.726     0.801      1035
           1      0.502     0.765     0.606       374

    accuracy                          0.736      1409
   macro avg      0.698     0.745     0.704      1409
weighted avg      0.791     0.736     0.750      1409

Confusion matrix:
 [[751 284]
 [ 88 286]]
ROC-AUC: 0.8186481180087317


In [29]:
# Logistic Regression (class_weight="balanced"): false negatives 165 -> 81
# Random Forest (balanced + F1-optimal threshold): false negatives 191 -> 88

# Overall, the two models perform almost the same.
# The priority is to miss as few churned customers as possible, so the model
# with the lower false-negative count, Logistic Regression, is selected.

# Persist the selected model. log_reg here is the class_weight="balanced"
# estimator, because the class-weight cell rebound the name.
joblib.dump(log_reg, "../../results/final_lg_model.pkl")

['../../results/final_lg_model.pkl']