# 어뷰징 판매자 탐지 모델 개발

쿠팡 판매자 데이터를 기반으로 어뷰징 판매자를 탐지하는 머신러닝 모델을 개발합니다.

이 노트북은 `03_feature_engineering.ipynb`에서 생성된 피처를 로드하여 모델을 학습합니다.

In [None]:
import os
import pandas as pd
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, roc_curve
)
import joblib
import warnings
# NOTE: with no category/message/module filter this silences every warning
# raised for the rest of the session, not just noisy ones. Comment it out
# while debugging so library warnings stay visible.
warnings.filterwarnings('ignore')

# Banner confirming the import cell ran to completion.
print("라이브러리 로드 완료")

## 1. 데이터 로드

`03_feature_engineering.ipynb`에서 생성된 피처 데이터를 로드합니다.

In [None]:
# Load the engineered feature table written by 03_feature_engineering.ipynb.
FEATURE_PATH = '../data/processed/features.csv'

# Columns used as model inputs. Defined unconditionally so the name exists
# regardless of whether the file check below succeeds.
feature_columns = [
    'satisfaction_score', 'review_count', 'total_product_count',
    'product_count_actual', 'price_mean', 'price_std', 'price_min', 'price_max',
    'rating_mean', 'rating_std', 'review_sum', 'review_mean',
    'discount_mean', 'discount_max', 'shipping_fee_mean', 'shipping_days_mean',
    'review_count_actual', 'review_rating_mean', 'review_rating_std',
    'review_length_mean', 'review_length_std', 'review_length_max',
    'question_count', 'answer_rate',
]

# Fail fast: merely printing the problem would let execution continue and
# surface later as an unrelated NameError on `features_df`.
if not os.path.exists(FEATURE_PATH):
    raise FileNotFoundError(
        f"오류: {FEATURE_PATH} 파일이 없습니다. 03_feature_engineering.ipynb를 먼저 실행해주세요."
    )

features_df = pd.read_csv(FEATURE_PATH)
print(f"최종 피처 데이터 로드 완료: {features_df.shape}")
print(f"총 피처 수: {len(feature_columns)}개")

## 2. 데이터 분할 및 전처리

In [None]:
# Model inputs (selected feature columns) and the label cast to integers.
X = features_df[feature_columns]
y = features_df['is_abusing_seller'].astype(int)

# 80/20 hold-out split; stratifying on the label keeps the class ratio of
# the full dataset in both parts, and the fixed seed makes it reproducible.
split_kwargs = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_kwargs)

# Report the size of each part and how many positive (abusing) rows it holds.
print(f"훈련 세트: {X_train.shape[0]}개 (어뷰징: {y_train.sum()}개)")
print(f"테스트 세트: {X_test.shape[0]}개 (어뷰징: {y_test.sum()}개)")

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so test rows never influence its statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("스케일링 완료")

## 3. 모델 학습

In [None]:
# Model evaluation helper
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Score an already-fitted classifier on the train and test splits.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_train: Training features, in the representation the model was fit on.
        X_test: Test features, in the same representation as ``X_train``.
        y_train: Training labels.
        y_test: Test labels.
        model_name: Display name stored under the ``'model'`` key.

    Returns:
        A ``(results, y_test_pred, y_test_proba)`` tuple. ``results`` is a dict
        with the keys ``model``, ``train_accuracy``, ``test_accuracy``,
        ``precision``, ``recall``, ``f1`` and ``roc_auc``. ``y_test_proba`` is
        column 1 of ``predict_proba`` on the test split, or ``None`` when the
        model has no ``predict_proba``; in that case ``roc_auc`` is ``None`` too.
    """
    train_pred = model.predict(X_train)
    test_pred = model.predict(X_test)

    # Probability scores are optional: not every estimator provides them.
    if hasattr(model, 'predict_proba'):
        test_proba = model.predict_proba(X_test)[:, 1]
    else:
        test_proba = None

    results = {
        'model': model_name,
        'train_accuracy': accuracy_score(y_train, train_pred),
        'test_accuracy': accuracy_score(y_test, test_pred),
    }

    # zero_division=0 makes these metrics report 0 instead of warning when a
    # denominator is empty (e.g. no positive predictions).
    threshold_metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_key, metric_fn in threshold_metrics:
        results[metric_key] = metric_fn(y_test, test_pred, zero_division=0)

    results['roc_auc'] = (
        None if test_proba is None else roc_auc_score(y_test, test_proba)
    )

    return results, test_pred, test_proba

In [None]:
# Registries filled by the training cells: fitted estimators keyed by display
# name, and one metrics dict per evaluated model.
models = {}
results_list = []

# 1. Logistic Regression, trained and evaluated on the standardized features.
print("1. Logistic Regression 학습 중...")
lr_model = LogisticRegression(random_state=42, max_iter=1000).fit(X_train_scaled, y_train)
models['Logistic Regression'] = lr_model

results, _, _ = evaluate_model(
    lr_model,
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    'Logistic Regression',
)
results_list.append(results)
print(f"   완료 - Test Accuracy: {results['test_accuracy']:.4f}, F1: {results['f1']:.4f}")

In [None]:
# 2. Random Forest, fitted on the raw (unscaled) features; the original
# author's note is that this model does not need scaling.
print("2. Random Forest 학습 중...")
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train, y_train)
models['Random Forest'] = rf_model

results, _, _ = evaluate_model(
    rf_model,
    X_train,
    X_test,
    y_train,
    y_test,
    'Random Forest',
)
results_list.append(results)
print(f"   완료 - Test Accuracy: {results['test_accuracy']:.4f}, F1: {results['f1']:.4f}")

In [None]:
# 3. Gradient Boosting, fitted on the raw (unscaled) features like the
# Random Forest above.
print("3. Gradient Boosting 학습 중...")
gb_model = GradientBoostingClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
models['Gradient Boosting'] = gb_model

results, _, _ = evaluate_model(
    gb_model,
    X_train,
    X_test,
    y_train,
    y_test,
    'Gradient Boosting',
)
results_list.append(results)
print(f"   완료 - Test Accuracy: {results['test_accuracy']:.4f}, F1: {results['f1']:.4f}")

In [None]:
# Collect the per-model metric dicts into one table (one row per model)
# and print it without the index column.
results_df = pd.DataFrame(results_list)
print(
    "\n=== 모델 성능 비교 ===",
    results_df.to_string(index=False),
    sep="\n",
)

## 4. 모델 평가 시각화

In [None]:
# Grouped bar chart of the test-set metrics: one bar series per model.
# `metrics` holds the results_df column names and `metric_names` the matching
# axis labels, in the same order.
metrics = ['test_accuracy', 'precision', 'recall', 'f1']
metric_names = ['정확도', '정밀도', '재현율', 'F1-Score']

fig = go.Figure(
    data=[
        go.Bar(
            name=row['model'],
            x=metric_names,
            y=[row[m] for m in metrics],
        )
        for _, row in results_df.iterrows()
    ]
)

fig.update_layout(
    title='모델별 성능 비교',
    barmode='group',
    yaxis_title='Score',
    template='plotly_white',
)
fig.show()

In [None]:
# Select the best model: the row with the highest test-set F1 score.
best_model_name = results_df.loc[results_df['f1'].idxmax(), 'model']
print(f"최고 성능 모델: {best_model_name}")

# Every fitted estimator is registered in `models` under its display name,
# so a single lookup covers all cases.
best_model = models[best_model_name]

# Only Logistic Regression was trained on standardized features; the other
# models were fitted on the raw feature frame and must be scored on it.
X_test_final = X_test_scaled if best_model_name == 'Logistic Regression' else X_test

# Confusion matrix. The label order is pinned so the matrix is always 2x2 and
# lines up with the two hard-coded axis labels below, even when one class is
# absent from both the test labels and the predictions.
y_pred = best_model.predict(X_test_final)
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Rows are actual classes and columns are predicted classes; each cell is
# annotated with its count.
fig = go.Figure(data=go.Heatmap(
    z=cm,
    x=['정상 예측', '어뷰징 예측'],
    y=['정상 실제', '어뷰징 실제'],
    text=cm,
    texttemplate='%{text}',
    colorscale='Blues'
))

fig.update_layout(
    title=f'{best_model_name} - 혼동 행렬',
    template='plotly_white'
)
fig.show()

In [None]:
# ROC curves for every trained model, drawn on one figure.
fig = go.Figure()

for name, model in models.items():
    # Score each model on the same feature representation it was trained on:
    # standardized for Logistic Regression, raw for the others.
    X_eval = X_test_scaled if name == 'Logistic Regression' else X_test
    y_proba = model.predict_proba(X_eval)[:, 1]

    auc = roc_auc_score(y_test, y_proba)
    fpr, tpr, _ = roc_curve(y_test, y_proba)

    curve = go.Scatter(
        x=fpr,
        y=tpr,
        mode='lines',
        name=f'{name} (AUC={auc:.3f})',
    )
    fig.add_trace(curve)

# Dashed diagonal as the chance-level reference line.
chance_line = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random',
    line={'dash': 'dash', 'color': 'gray'},
)
fig.add_trace(chance_line)

fig.update_layout(
    title='ROC Curve 비교',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    template='plotly_white',
)
fig.show()

## 5. 피처 중요도 분석

In [None]:
# Random Forest feature importances, paired with their feature names.
importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_,
})
# Ascending order puts the most important features last, which is why the
# top-10 printout below reads from the tail.
feature_importance = importance_table.sort_values('importance', ascending=True)

# Horizontal bar chart with one bar per feature.
fig = go.Figure()
fig.add_trace(go.Bar(
    x=feature_importance['importance'],
    y=feature_importance['feature'],
    orientation='h',
    marker_color='#636EFA',
))

fig.update_layout(
    title='Random Forest - 피처 중요도',
    xaxis_title='Importance',
    yaxis_title='Feature',
    height=600,
    template='plotly_white',
)
fig.show()

print(
    "\n=== Top 10 중요 피처 ===",
    feature_importance.tail(10).to_string(index=False),
    sep="\n",
)

## 6. 모델 저장

In [None]:
# Create the output directory if it does not exist yet.
os.makedirs('../models', exist_ok=True)

# Build the model file name once so the file written and the message printed
# below cannot drift apart.
model_filename = f'abusing_detector_{best_model_name.lower().replace(" ", "_")}.pkl'

# Persist the best model. The scaler is saved unconditionally, although only
# Logistic Regression was trained on scaled inputs.
joblib.dump(best_model, f'../models/{model_filename}')
joblib.dump(scaler, '../models/scaler.pkl')

# Save the feature list, one name per line, in training column order.
# The encoding is explicit so the file does not depend on the platform default.
with open('../models/feature_columns.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(feature_columns))

print(f"모델 저장 완료: models/{model_filename}")
print("스케일러 저장 완료: models/scaler.pkl")
print("피처 목록 저장 완료: models/feature_columns.txt")