# 어뷰징 탐지 모델 개선

기존 모델의 과적합 문제를 해결하고, 더 견고한(robust) 모델을 개발합니다.

## 개선 항목
1. K-Fold 교차검증으로 모델 안정성 검증
2. 하이퍼파라미터 튜닝 (GridSearchCV)
3. 앙상블 기법 (Voting, Stacking)
4. 클래스 불균형 처리 (하이퍼파라미터 탐색에 `class_weight='balanced'` 포함)
5. SHAP 분석으로 모델 해석성 개선

In [None]:
import sys
import os
sys.path.append(os.path.abspath('..'))

import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

from sklearn.model_selection import (
    train_test_split, cross_val_score, StratifiedKFold,
    GridSearchCV
)
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (
    RandomForestClassifier, GradientBoostingClassifier,
    VotingClassifier, StackingClassifier, AdaBoostClassifier
)
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve
)
import joblib
import warnings
warnings.filterwarnings('ignore')

print("라이브러리 로드 완료")

## 1. 데이터 로드 및 전처리

In [None]:
from src.database.supabase_client import get_supabase_client

# Shared Supabase client; load_all_data() below reads it as a module-level global.
client = get_supabase_client()

def load_all_data(table_name, page_size=1000):
    """Load every row of a Supabase table into a DataFrame.

    A single ``select("*")`` request is subject to the API's maximum row
    count (1,000 rows by default on Supabase), so larger tables would be
    silently truncated. Rows are therefore fetched page by page with
    ``range()`` until the server returns an empty page.

    Args:
        table_name: Name of the table to read.
        page_size: Number of rows requested per round trip.

    Returns:
        A pandas DataFrame with one row per record; empty when the table
        has no rows.

    Raises:
        ValueError: If ``page_size`` is not a positive integer.
    """
    if page_size < 1:
        raise ValueError("page_size must be a positive integer")

    rows = []
    offset = 0
    while True:
        # range() bounds are inclusive on both ends.
        # NOTE(review): no explicit ordering is applied because the key column
        # is not known here; add .order(<primary key>) if pages must be stable
        # while the table is being written to.
        response = (
            client.table(table_name)
            .select("*")
            .range(offset, offset + page_size - 1)
            .execute()
        )
        page = response.data or []
        if not page:
            break
        rows.extend(page)
        # Advance by what was actually returned, so a server-side cap smaller
        # than page_size cannot end the loop early.
        offset += len(page)
    return pd.DataFrame(rows)

# Load the four source tables (same order as before: sellers, products,
# reviews, questions) and report how many rows each one returned.
sellers_df, products_df, reviews_df, questions_df = [
    load_all_data(table)
    for table in ('sellers', 'products', 'reviews', 'questions')
]

print(f"판매자: {len(sellers_df)}개")
print(f"상품: {len(products_df)}개")
print(f"리뷰: {len(reviews_df)}개")
print(f"질문: {len(questions_df)}개")

In [None]:
# Feature engineering (identical to the previous notebook).
# Each per-vendor table built below is keyed on `company_name` so that it can
# be left-joined onto the seller rows.
product_vendor_map = products_df[['product_id', 'vendor_name']].drop_duplicates()


def _text_length(text):
    """Return the character length of a review text, or 0 when it is missing."""
    return 0 if pd.isna(text) else len(str(text))


def _has_answer(answer):
    """Return 1 when the answer is present and not blank, else 0."""
    return int(pd.notna(answer) and str(answer).strip() != '')


# Per-vendor product statistics.
product_stats = (
    products_df.groupby('vendor_name')
    .agg(
        product_count_actual=('product_id', 'count'),
        price_mean=('price', 'mean'),
        price_std=('price', 'std'),
        price_min=('price', 'min'),
        price_max=('price', 'max'),
        rating_mean=('product_rating', 'mean'),
        rating_std=('product_rating', 'std'),
        review_sum=('review_count', 'sum'),
        review_mean=('review_count', 'mean'),
        discount_mean=('discount_rate', 'mean'),
        discount_max=('discount_rate', 'max'),
        shipping_fee_mean=('shipping_fee', 'mean'),
        shipping_days_mean=('shipping_days', 'mean'),
    )
    .reset_index()
    .rename(columns={'vendor_name': 'company_name'})
)

# Per-vendor review statistics (reviews are mapped to a vendor via product_id).
reviews_with_vendor = reviews_df.merge(product_vendor_map, on='product_id', how='left')
reviews_with_vendor['text_length'] = reviews_with_vendor['review_text'].apply(_text_length)

review_stats = (
    reviews_with_vendor.groupby('vendor_name')
    .agg(
        review_count_actual=('id', 'count'),
        review_rating_mean=('review_rating', 'mean'),
        review_rating_std=('review_rating', 'std'),
        review_length_mean=('text_length', 'mean'),
        review_length_std=('text_length', 'std'),
        review_length_max=('text_length', 'max'),
    )
    .reset_index()
    .rename(columns={'vendor_name': 'company_name'})
)

# Per-vendor question statistics; answer_rate is the share of answered questions.
questions_with_vendor = questions_df.merge(product_vendor_map, on='product_id', how='left')
questions_with_vendor['has_answer'] = questions_with_vendor['answer'].apply(_has_answer)

question_stats = (
    questions_with_vendor.groupby('vendor_name')
    .agg(
        question_count=('id', 'count'),
        answer_rate=('has_answer', 'mean'),
    )
    .reset_index()
    .rename(columns={'vendor_name': 'company_name'})
)

# Assemble the feature table: seller columns first, then each stats table.
seller_columns = [
    'company_name', 'satisfaction_score', 'review_count',
    'total_product_count', 'is_abusing_seller',
]
features_df = sellers_df[seller_columns].copy()
for stats_table in (product_stats, review_stats, question_stats):
    features_df = features_df.merge(stats_table, on='company_name', how='left')

# Sellers without products/reviews/questions get 0 for the missing statistics.
features_df = features_df.fillna(0)

print(f"피처 데이터: {features_df.shape}")

In [None]:
# Separate features from the target. The column list is assembled from the
# seller, product, review and question groups, in that order.
seller_feature_names = ['satisfaction_score', 'review_count', 'total_product_count']
product_feature_names = [
    'product_count_actual', 'price_mean', 'price_std', 'price_min', 'price_max',
    'rating_mean', 'rating_std', 'review_sum', 'review_mean',
    'discount_mean', 'discount_max', 'shipping_fee_mean', 'shipping_days_mean',
]
review_feature_names = [
    'review_count_actual', 'review_rating_mean', 'review_rating_std',
    'review_length_mean', 'review_length_std', 'review_length_max',
]
question_feature_names = ['question_count', 'answer_rate']

feature_columns = (
    seller_feature_names
    + product_feature_names
    + review_feature_names
    + question_feature_names
)

X = features_df.loc[:, feature_columns]
y = features_df['is_abusing_seller'].astype(int)

# Stratified 80/20 train/test split so both parts keep the class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Scaling: statistics are learned from the training split only and then
# applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"훈련 세트: {X_train.shape[0]}개 (어뷰징: {y_train.sum()}개, {y_train.mean()*100:.1f}%)")
print(f"테스트 세트: {X_test.shape[0]}개 (어뷰징: {y_test.sum()}개, {y_test.mean()*100:.1f}%)")

## 2. K-Fold 교차검증으로 과적합 검증

기존 Random Forest의 100% 정확도가 과적합 때문인지 확인합니다.

In [None]:
# Stratified K-Fold cross-validation of the baseline models.
#
# Two changes compared with scoring on the pre-scaled training matrix:
#   * Scale-sensitive models are wrapped in a Pipeline so the StandardScaler is
#     re-fit on the training part of every fold. Scoring on X_train_scaled
#     would let each validation fold influence the scaling statistics.
#   * cross_validate() computes all three metrics from a single fit per fold
#     instead of refitting every model once per metric.
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

models_cv = {
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42)
}

# Models that need standardised inputs; the tree ensembles use raw features.
scale_sensitive_models = {'Logistic Regression', 'SVM', 'KNN'}

# Short keys match the '<key>_mean' / '<key>_std' fields stored in cv_results.
cv_scoring = {'acc': 'accuracy', 'f1': 'f1', 'roc': 'roc_auc'}

cv_results = []

print("5-Fold 교차검증 수행 중...\n")
for name, model in models_cv.items():
    if name in scale_sensitive_models:
        estimator = make_pipeline(StandardScaler(), model)
    else:
        estimator = model

    fold_scores = cross_validate(estimator, X_train, y_train, cv=cv, scoring=cv_scoring)
    acc_scores = fold_scores['test_acc']
    f1_scores = fold_scores['test_f1']
    roc_scores = fold_scores['test_roc']

    cv_results.append({
        'model': name,
        'acc_mean': acc_scores.mean(),
        'acc_std': acc_scores.std(),
        'f1_mean': f1_scores.mean(),
        'f1_std': f1_scores.std(),
        'roc_mean': roc_scores.mean(),
        'roc_std': roc_scores.std()
    })

    print(f"{name}:")
    print(f"  Accuracy: {acc_scores.mean():.4f} (+/- {acc_scores.std():.4f})")
    print(f"  F1-Score: {f1_scores.mean():.4f} (+/- {f1_scores.std():.4f})")
    print(f"  ROC-AUC:  {roc_scores.mean():.4f} (+/- {roc_scores.std():.4f})")
    print()

In [None]:
# Visualise the cross-validation results: one bar panel per metric, with the
# fold standard deviation drawn as error bars.
cv_df = pd.DataFrame(cv_results)

colors = px.colors.qualitative.Set2
bar_colors = colors[:len(cv_df)]

# (column prefix in cv_df, panel title)
metric_panels = [('acc', 'Accuracy'), ('f1', 'F1-Score'), ('roc', 'ROC-AUC')]

fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=tuple(panel_title for _, panel_title in metric_panels),
)

for column, (prefix, _) in enumerate(metric_panels, start=1):
    metric_bar = go.Bar(
        x=cv_df['model'],
        y=cv_df[f'{prefix}_mean'],
        error_y=dict(type='data', array=cv_df[f'{prefix}_std']),
        marker_color=bar_colors,
        showlegend=False,
    )
    fig.add_trace(metric_bar, row=1, col=column)

fig.update_layout(
    title='5-Fold 교차검증 결과 (평균 ± 표준편차)',
    template='plotly_white',
    height=400,
)
fig.update_xaxes(tickangle=45)
fig.show()

## 3. 하이퍼파라미터 튜닝

In [None]:
# Random Forest hyperparameter tuning: exhaustive grid search scored by F1 on
# the shared stratified folds, using the unscaled training features.
print("Random Forest 하이퍼파라미터 튜닝 중...")

rf_param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    class_weight=['balanced', None],
)

rf_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=rf_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
rf_grid = rf_grid.fit(X_train, y_train)  # fit() returns the same search object

print(f"\n최적 파라미터: {rf_grid.best_params_}")
print(f"최고 CV F1-Score: {rf_grid.best_score_:.4f}")

In [None]:
# Gradient Boosting hyperparameter tuning: exhaustive grid search scored by F1
# on the shared stratified folds, using the unscaled training features.
print("Gradient Boosting 하이퍼파라미터 튜닝 중...")

gb_param_grid = dict(
    n_estimators=[50, 100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 5, 7],
    min_samples_split=[2, 5, 10],
    subsample=[0.8, 1.0],
)

gb_grid = GridSearchCV(
    estimator=GradientBoostingClassifier(random_state=42),
    param_grid=gb_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
gb_grid = gb_grid.fit(X_train, y_train)  # fit() returns the same search object

print(f"\n최적 파라미터: {gb_grid.best_params_}")
print(f"최고 CV F1-Score: {gb_grid.best_score_:.4f}")

In [None]:
# Logistic Regression hyperparameter tuning: exhaustive grid search scored by
# F1 on the shared stratified folds, using the standardised training features.
print("Logistic Regression 하이퍼파라미터 튜닝 중...")

lr_param_grid = dict(
    C=[0.001, 0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    class_weight=['balanced', None],
)

lr_grid = GridSearchCV(
    estimator=LogisticRegression(random_state=42, max_iter=1000),
    param_grid=lr_param_grid,
    scoring='f1',
    cv=cv,
    n_jobs=-1,
    verbose=1,
)
lr_grid = lr_grid.fit(X_train_scaled, y_train)  # fit() returns the same search object

print(f"\n최적 파라미터: {lr_grid.best_params_}")
print(f"최고 CV F1-Score: {lr_grid.best_score_:.4f}")

## 4. 앙상블 모델 구축

In [None]:
# Build the ensemble from the tuned models.
best_rf, best_gb, best_lr = [
    search.best_estimator_ for search in (rf_grid, gb_grid, lr_grid)
]

# Soft-voting classifier. The LR member is a fresh estimator built from the
# tuned parameters rather than the already-fitted best_lr.
voting_members = [
    ('rf', best_rf),
    ('gb', best_gb),
    ('lr', LogisticRegression(**lr_grid.best_params_, random_state=42, max_iter=1000)),
]
voting_clf = VotingClassifier(estimators=voting_members, voting='soft')

# Trained on the scaled data because of the LR member.
# Caveat: a Pipeline would be the proper approach; this is simplified because
# RF/GB are not sensitive to feature scaling.
voting_clf.fit(X_train_scaled, y_train)

# Cross-validated F1 of the ensemble on the training split.
voting_cv_scores = cross_val_score(
    voting_clf, X_train_scaled, y_train, scoring='f1', cv=cv
)
print(f"Voting Classifier CV F1: {voting_cv_scores.mean():.4f} (+/- {voting_cv_scores.std():.4f})")

In [None]:
# Stacking classifier: fresh base learners built from the tuned parameters,
# combined by a logistic-regression meta-learner (5-fold internal CV).
stacking_base_learners = [
    ('rf', RandomForestClassifier(**rf_grid.best_params_, random_state=42)),
    ('gb', GradientBoostingClassifier(**gb_grid.best_params_, random_state=42)),
    ('lr', LogisticRegression(**lr_grid.best_params_, random_state=42, max_iter=1000)),
]
stacking_meta_learner = LogisticRegression(random_state=42, max_iter=1000)

stacking_clf = StackingClassifier(
    estimators=stacking_base_learners,
    final_estimator=stacking_meta_learner,
    cv=5,
)
stacking_clf.fit(X_train_scaled, y_train)

# Cross-validated F1 of the stacked model on the training split.
stacking_cv_scores = cross_val_score(
    stacking_clf, X_train_scaled, y_train, scoring='f1', cv=cv
)
print(f"Stacking Classifier CV F1: {stacking_cv_scores.mean():.4f} (+/- {stacking_cv_scores.std():.4f})")

## 5. 최종 모델 평가

In [None]:
# Evaluate every model on the held-out test set. Each entry pairs a fitted
# model with the test matrix it was trained for (raw for the tuned tree
# models, scaled for LR and the ensembles).
final_models = {
    'Tuned RF': (best_rf, X_test),
    'Tuned GB': (best_gb, X_test),
    'Tuned LR': (best_lr, X_test_scaled),
    'Voting': (voting_clf, X_test_scaled),
    'Stacking': (stacking_clf, X_test_scaled)
}


def _test_metrics(model_name, estimator, test_features):
    """Return the test-set metric row for one fitted model."""
    predicted = estimator.predict(test_features)
    positive_proba = estimator.predict_proba(test_features)[:, 1]
    return {
        'model': model_name,
        'accuracy': accuracy_score(y_test, predicted),
        'precision': precision_score(y_test, predicted),
        'recall': recall_score(y_test, predicted),
        'f1': f1_score(y_test, predicted),
        'roc_auc': roc_auc_score(y_test, positive_proba)
    }


final_results = [
    _test_metrics(model_name, estimator, test_features)
    for model_name, (estimator, test_features) in final_models.items()
]

final_df = pd.DataFrame(final_results)
print("=== 최종 모델 성능 비교 (테스트 세트) ===")
print(final_df.to_string(index=False))

In [None]:
# Grouped bar chart comparing the test-set metrics of all final models
# (one bar series per model, one group per metric).
metrics = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
metric_names = ['정확도', '정밀도', '재현율', 'F1-Score', 'ROC-AUC']

model_bars = [
    go.Bar(
        name=model_row['model'],
        x=metric_names,
        y=[model_row[metric] for metric in metrics],
    )
    for _, model_row in final_df.iterrows()
]

fig = go.Figure(data=model_bars)
fig.update_layout(
    title='튜닝된 모델 성능 비교 (테스트 세트)',
    yaxis_title='Score',
    barmode='group',
    height=500,
    template='plotly_white',
)
fig.show()

In [None]:
# ROC curve comparison: one line per final model plus the chance diagonal.
fig = go.Figure()

for name, (model, X_eval) in final_models.items():
    positive_proba = model.predict_proba(X_eval)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, positive_proba)
    auc_value = roc_auc_score(y_test, positive_proba)
    roc_line = go.Scatter(
        x=fpr,
        y=tpr,
        mode='lines',
        name=f'{name} (AUC={auc_value:.3f})',
    )
    fig.add_trace(roc_line)

# Reference line for a random classifier.
chance_line = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random',
    line=dict(dash='dash', color='gray'),
)
fig.add_trace(chance_line)

fig.update_layout(
    title='튜닝된 모델 ROC Curve 비교',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    template='plotly_white',
)
fig.show()

## 6. SHAP 분석 (모델 해석성)

In [None]:
# Check whether SHAP is installed; install it on the fly when the import fails.
# NOTE: the `!` line below is an IPython shell escape, so this cell only runs
# inside a notebook/IPython session, not as a plain Python script.
try:
    import shap
    print(f"SHAP 버전: {shap.__version__}")
except ImportError:
    print("SHAP 설치 중...")
    # Runs `uv add shap` in a shell, then retries the import.
    !uv add shap
    import shap
    print("SHAP 설치 완료")

In [None]:
import shap

# Pick the model with the highest test-set F1 as the SHAP target.
best_model_name = final_df.loc[final_df['f1'].idxmax(), 'model']
print(f"SHAP 분석 대상 모델: {best_model_name}")

is_rf = 'RF' in best_model_name
is_gb = 'GB' in best_model_name

if is_rf or is_gb:
    # Tree models (RF takes precedence, then GB) use TreeExplainer on the raw
    # test features.
    explainer = shap.TreeExplainer(best_rf if is_rf else best_gb)
    shap_values = explainer.shap_values(X_test)
    if is_rf and isinstance(shap_values, list):
        shap_values = shap_values[1]  # abusing class
else:
    # Any other model falls back to KernelExplainer with a 100-row background
    # sample; only the first 50 evaluation rows are explained.
    model, X_eval = final_models[best_model_name]
    background = shap.sample(X_train_scaled, 100)
    explainer = shap.KernelExplainer(model.predict_proba, background)
    shap_values = explainer.shap_values(X_eval[:50])

In [None]:
# SHAP beeswarm plot (direction of each feature's influence).
import matplotlib.pyplot as plt
import numpy as np

# Reduce shap_values to a 2-D (samples, features) matrix for the positive
# (abusing) class. Depending on the explainer and the SHAP version it can be a
# list with one array per class, a 3-D (samples, features, classes) array, or
# already 2-D.
if isinstance(shap_values, list):
    shap_plot_values = np.asarray(shap_values[-1])
else:
    shap_plot_values = np.asarray(shap_values)
if shap_plot_values.ndim == 3:
    shap_plot_values = shap_plot_values[:, :, -1]

# The KernelExplainer path explains only the first rows of the test set, so
# the feature matrix is trimmed to the rows that actually have SHAP values.
X_plot = X_test.iloc[:shap_plot_values.shape[0]]

plt.figure(figsize=(12, 8))
shap.summary_plot(shap_plot_values, X_plot, feature_names=feature_columns, show=False)
plt.title('SHAP 피처 영향 분석')
plt.tight_layout()
plt.show()

In [None]:
# SHAP summary plot (feature importance as mean |SHAP| bars).
import numpy as np

# Reduce shap_values to a 2-D (samples, features) matrix for the positive
# (abusing) class. Depending on the explainer and the SHAP version it can be a
# list with one array per class, a 3-D (samples, features, classes) array, or
# already 2-D.
if isinstance(shap_values, list):
    shap_bar_values = np.asarray(shap_values[-1])
else:
    shap_bar_values = np.asarray(shap_values)
if shap_bar_values.ndim == 3:
    shap_bar_values = shap_bar_values[:, :, -1]

# The KernelExplainer path explains only the first rows of the test set, so
# the feature matrix is trimmed to the rows that actually have SHAP values.
X_bar = X_test.iloc[:shap_bar_values.shape[0]]

shap.summary_plot(shap_bar_values, X_bar, feature_names=feature_columns, plot_type='bar', show=False)
plt.title('SHAP 피처 중요도')
plt.tight_layout()
plt.show()

In [None]:
# Explain a single prediction: the first abusing sample in the test set that
# has SHAP values.
import numpy as np

# ---------------------------------------------------------
# 1. Prepare the data (positive-class values only)
# ---------------------------------------------------------
# shap_values may be a list with one array per class, a 3-D
# (samples, features, classes) array, or a plain 2-D (samples, features)
# array, depending on the explainer and the SHAP version. Normalise it to 2-D
# instead of assuming one layout.
if isinstance(shap_values, list):
    sv_matrix = np.asarray(shap_values[-1])
else:
    sv_matrix = np.asarray(shap_values)
if sv_matrix.ndim == 3:
    sv_matrix = sv_matrix[:, :, -1]

# Only rows covered by shap_values can be explained (the KernelExplainer path
# covers just the first rows of the test set), so the search for an abusing
# sample is limited to those rows.
explained_labels = y_test.to_numpy()[:sv_matrix.shape[0]]
abusing_positions = np.flatnonzero(explained_labels == 1)
if abusing_positions.size == 0:
    raise ValueError("No abusing sample among the rows covered by shap_values")
sample_idx = int(abusing_positions[0])       # positional index into X_test
abusing_idx = y_test.index[sample_idx]       # matching index label

sv_class1 = sv_matrix[sample_idx]            # SHAP values for class 1 (abusing)
# expected_value is a scalar or a per-class array; the last entry is the
# positive class in both cases.
base_value = float(np.atleast_1d(explainer.expected_value)[-1])
features = X_test.iloc[sample_idx]           # raw feature values of the sample

# ---------------------------------------------------------
# 2. Order features by importance (absolute SHAP value)
# ---------------------------------------------------------
df_shap = pd.DataFrame({
    'feature': feature_columns,
    'shap_value': sv_class1,
    'feature_value': features.values
})

# Ascending sort + tail(20) keeps the 20 largest |SHAP| contributions, with
# the most important feature last (drawn at the top of the horizontal chart).
df_shap['abs_shap'] = df_shap['shap_value'].abs()
df_shap = df_shap.sort_values('abs_shap', ascending=True).tail(20)

# ---------------------------------------------------------
# 3. Draw the Plotly waterfall
# ---------------------------------------------------------
fig = go.Figure(go.Waterfall(
    name="SHAP",
    orientation="h",  # horizontal bars
    measure=["relative"] * len(df_shap),
    y=df_shap['feature'],      # Y axis: feature name
    x=df_shap['shap_value'],   # X axis: SHAP contribution
    # Show the actual feature value next to each bar.
    text=df_shap['feature_value'].apply(
        lambda value: f"{value:.2f}" if isinstance(value, float) else str(value)
    ),
    textposition="outside",
    connector={"mode": "between", "line": {"width": 1, "color": "rgb(150,150,150)", "dash": "dot"}}
))

# Approximate end point of the drawn bars (only the top 20 features are
# summed); the title below uses the sum over all features instead.
final_prob = base_value + df_shap['shap_value'].sum()

# NOTE(review): for the Gradient Boosting TreeExplainer the values are
# presumably in log-odds rather than probability units; confirm before
# reading "Prediction" as a probability.
fig.update_layout(
    title={
        'text': f"<b>Sample #{sample_idx} 예측 설명 (Waterfall)</b><br>Base: {base_value:.3f} → Prediction: {base_value + sv_class1.sum():.3f}",
        'y': 0.95, 'x': 0.5, 'xanchor': 'center', 'yanchor': 'top'
    },
    showlegend=False,
    height=600,  # figure height
    xaxis=dict(title="SHAP Value (기여도)"),
)

fig.show()

## 7. 최종 모델 저장

In [None]:
# Save the best-performing model (highest test-set F1) together with the
# scaler and the tuning metadata.
best_idx = final_df['f1'].idxmax()
best_model_name = final_df.loc[best_idx, 'model']
best_model_obj, best_model_test_input = final_models[best_model_name]

# The tuned tree models were trained on raw features while LR and the
# ensembles were trained on scaled features. Record which input the saved
# model expects so that inference code knows whether to apply the scaler.
requires_scaling = best_model_test_input is X_test_scaled

os.makedirs('../models', exist_ok=True)

# Persist the model and the scaler. The scaler is always written; whether it
# must be applied is recorded in tuning_results['requires_scaling'].
model_filename = f'abusing_detector_tuned_{best_model_name.lower().replace(" ", "_")}.pkl'
joblib.dump(best_model_obj, f'../models/{model_filename}')
joblib.dump(scaler, '../models/scaler_tuned.pkl')

# Persist the tuning results. All previous keys are unchanged;
# 'requires_scaling' and 'feature_columns' are additions.
tuning_results = {
    'rf_params': rf_grid.best_params_,
    'gb_params': gb_grid.best_params_,
    'lr_params': lr_grid.best_params_,
    'best_model': best_model_name,
    'best_f1': final_df.loc[best_idx, 'f1'],
    'cv_results': cv_results,
    'requires_scaling': requires_scaling,
    'feature_columns': list(feature_columns),
}
joblib.dump(tuning_results, '../models/tuning_results.pkl')

print(f"최종 모델 저장 완료: models/{model_filename}")
print(f"스케일러 저장 완료: models/scaler_tuned.pkl")
print(f"튜닝 결과 저장 완료: models/tuning_results.pkl")

## 8. 요약

In [None]:
# Final report: cross-validation ranking, tuned parameters, test-set metrics
# and the selected model.
banner = "=" * 70
divider = "-" * 50

print(banner)
print("어뷰징 탐지 모델 개선 완료")
print(banner)

print("\n[1] 교차검증 결과")
print(divider)
cv_summary = (
    pd.DataFrame(cv_results)
    .loc[:, ['model', 'f1_mean', 'f1_std']]
    .sort_values('f1_mean', ascending=False)
)
print(cv_summary.to_string(index=False))

print("\n[2] 하이퍼파라미터 튜닝 결과")
print(divider)
tuned_searches = (
    ("Random Forest 최적 파라미터:", rf_grid),
    ("\nGradient Boosting 최적 파라미터:", gb_grid),
)
for heading, search in tuned_searches:
    print(heading)
    for param_name, param_value in search.best_params_.items():
        print(f"  - {param_name}: {param_value}")

print("\n[3] 최종 모델 성능 (테스트 세트)")
print(divider)
print(final_df.to_string(index=False))

selected_row = final_df.loc[best_idx]
print(f"\n[4] 최종 선택 모델: {best_model_name}")
print(f"    F1-Score: {selected_row['f1']:.4f}")
print(f"    ROC-AUC: {selected_row['roc_auc']:.4f}")