# 어뷰징 판매자 탐지 모델 개발

쿠팡 판매자 데이터를 기반으로 어뷰징 판매자를 탐지하는 머신러닝 모델을 개발합니다.

In [None]:
import sys
import os
sys.path.append(os.path.abspath('..'))

import pandas as pd
import plotly.graph_objects as go

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, confusion_matrix, classification_report, roc_curve
)
import joblib
import warnings
warnings.filterwarnings('ignore')

print("라이브러리 로드 완료")

## 1. 데이터 로드

In [None]:
from src.database.supabase_client import get_supabase_client

# Shared Supabase client; load_all_data reads it as a module-level global.
client = get_supabase_client()

def load_all_data(table_name, page_size=1000):
    """Load every row of a table into a DataFrame.

    A single ``select("*")`` request is capped by the API's max-rows setting
    (1000 rows by default on Supabase), so tables larger than that would be
    silently truncated. Rows are therefore fetched page by page with
    ``range()`` until an empty page comes back.

    Args:
        table_name: Name of the table to read through the module-level
            ``client``.
        page_size: Number of rows requested per call.

    Returns:
        A pandas DataFrame built from all fetched rows; empty when the table
        has no rows.

    Raises:
        ValueError: If ``page_size`` is smaller than 1.
    """
    if page_size < 1:
        raise ValueError("page_size must be a positive integer")

    # NOTE(review): no explicit ordering is applied, so pages could overlap or
    # skip rows if the table is modified while it is being read - confirm
    # whether a stable order column should be used.
    rows = []
    offset = 0
    while True:
        response = (
            client.table(table_name)
            .select("*")
            .range(offset, offset + page_size - 1)  # both bounds are inclusive
            .execute()
        )
        batch = response.data or []
        if not batch:
            break
        rows.extend(batch)
        # Advance by what was actually returned so a server-side cap smaller
        # than page_size cannot cause rows to be skipped.
        offset += len(batch)
    return pd.DataFrame(rows)

# Fetch the four source tables in a fixed order.
sellers_df, products_df, reviews_df, questions_df = (
    load_all_data(table)
    for table in ('sellers', 'products', 'reviews', 'questions')
)

# Report how many rows each table returned.
for _label, _frame in (
    ("판매자", sellers_df),
    ("상품", products_df),
    ("리뷰", reviews_df),
    ("질문", questions_df),
):
    print(f"{_label}: {len(_frame)}개")

In [None]:
# Inspect the class balance of the target column.
_abuse_flags = sellers_df['is_abusing_seller']
_abuse_ratio = _abuse_flags.mean()

print("=== 어뷰징 판매자 분포 ===")
print(_abuse_flags.value_counts())
print(f"\n어뷰징 비율: {_abuse_ratio*100:.1f}%")

## 2. 피처 엔지니어링

판매자별로 다양한 피처를 생성합니다.

In [None]:
# Per-seller product statistics. Named aggregation yields the final column
# names directly, so no positional renaming of a MultiIndex is needed.
product_stats = products_df.groupby('vendor_name').agg(
    product_count_actual=('product_id', 'count'),
    price_mean=('price', 'mean'),
    price_std=('price', 'std'),
    price_min=('price', 'min'),
    price_max=('price', 'max'),
    rating_mean=('product_rating', 'mean'),
    rating_std=('product_rating', 'std'),
    review_sum=('review_count', 'sum'),
    review_mean=('review_count', 'mean'),
    discount_mean=('discount_rate', 'mean'),
    discount_max=('discount_rate', 'max'),
    shipping_fee_mean=('shipping_fee', 'mean'),
    shipping_days_mean=('shipping_days', 'mean'),
)

# Expose the group key under the name used as the join key on the seller table.
product_stats.index.name = 'company_name'
product_stats = product_stats.reset_index()

print(f"상품 통계 생성: {len(product_stats)}개 판매자")
product_stats.head()

In [None]:
# Per-seller review statistics.
# Reviews only carry a product_id, so attach the seller through a
# product -> vendor lookup first.
product_vendor_map = products_df[['product_id', 'vendor_name']].drop_duplicates()
reviews_with_vendor = reviews_df.merge(product_vendor_map, on='product_id', how='left')


def _review_text_length(text):
    """Return the character count of a review text, or 0 when it is missing."""
    if pd.isna(text):
        return 0
    return len(str(text))


reviews_with_vendor['text_length'] = reviews_with_vendor['review_text'].apply(
    _review_text_length
)

review_stats = reviews_with_vendor.groupby('vendor_name').agg(
    review_count_actual=('id', 'count'),
    review_rating_mean=('review_rating', 'mean'),
    review_rating_std=('review_rating', 'std'),
    review_length_mean=('text_length', 'mean'),
    review_length_std=('text_length', 'std'),
    review_length_max=('text_length', 'max'),
)
review_stats.index.name = 'company_name'
review_stats = review_stats.reset_index()

print(f"리뷰 통계 생성: {len(review_stats)}개 판매자")

In [None]:
# Per-seller question statistics.
questions_with_vendor = questions_df.merge(product_vendor_map, on='product_id', how='left')


def _answer_flag(answer):
    """Return 1 when the answer is present and not blank, otherwise 0."""
    if pd.isna(answer):
        return 0
    return int(str(answer).strip() != '')


questions_with_vendor['has_answer'] = questions_with_vendor['answer'].apply(_answer_flag)

question_stats = questions_with_vendor.groupby('vendor_name').agg(
    question_count=('id', 'count'),
    answer_rate=('has_answer', 'mean'),  # share of questions that were answered
)
question_stats.index.name = 'company_name'
question_stats = question_stats.reset_index()

print(f"질문 통계 생성: {len(question_stats)}개 판매자")

In [None]:
# Start from the seller-level columns, then left-join each statistics table
# so that sellers without products/reviews/questions are kept.
_seller_columns = [
    'company_name',
    'satisfaction_score',
    'review_count',
    'total_product_count',
    'is_abusing_seller',
]
features_df = sellers_df[_seller_columns].copy()

for _stats_frame in (product_stats, review_stats, question_stats):
    features_df = features_df.merge(_stats_frame, on='company_name', how='left')

# Missing statistics become 0.
# NOTE(review): this fills every column, including 'is_abusing_seller' -
# confirm the target never contains missing values.
features_df = features_df.fillna(0)

print(f"최종 피처 데이터: {features_df.shape}")
features_df.head()

In [None]:
# Model input columns, grouped by the table they were derived from.
_seller_features = ['satisfaction_score', 'review_count', 'total_product_count']
_product_features = [
    'product_count_actual',
    'price_mean', 'price_std', 'price_min', 'price_max',
    'rating_mean', 'rating_std',
    'review_sum', 'review_mean',
    'discount_mean', 'discount_max',
    'shipping_fee_mean', 'shipping_days_mean',
]
_review_features = [
    'review_count_actual',
    'review_rating_mean', 'review_rating_std',
    'review_length_mean', 'review_length_std', 'review_length_max',
]
_question_features = ['question_count', 'answer_rate']

feature_columns = (
    _seller_features + _product_features + _review_features + _question_features
)

print(f"총 피처 수: {len(feature_columns)}개")
print("\n피처 목록:")
for position, column in enumerate(feature_columns, start=1):
    print(f"  {position}. {column}")

## 3. 데이터 분할 및 전처리

In [None]:
# Separate the model inputs from the binary target.
X = features_df[feature_columns]
y = features_df['is_abusing_seller'].astype(int)

# 80/20 hold-out split, stratified on the target so both parts keep the
# same class ratio.
_split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **_split_options)

for _label, _inputs, _targets in (
    ("훈련 세트", X_train, y_train),
    ("테스트 세트", X_test, y_test),
):
    print(f"{_label}: {_inputs.shape[0]}개 (어뷰징: {_targets.sum()}개)")

In [None]:
# Standardize the features: statistics are learned on the training split
# only and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("스케일링 완료")

## 4. 모델 학습

In [None]:
# Model evaluation helper
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Score a fitted classifier on the train and test splits.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        X_train: Training features passed to ``model.predict``.
        X_test: Test features passed to ``model.predict``.
        y_train: True labels for the training features.
        y_test: True labels for the test features.
        model_name: Label stored under the ``'model'`` key of the result.

    Returns:
        A tuple ``(results, y_test_pred, y_test_proba)`` where ``results`` is
        a dict of metrics, ``y_test_pred`` holds the test predictions and
        ``y_test_proba`` holds column 1 of ``predict_proba`` on the test
        features (``None`` when the model has no ``predict_proba``).
    """
    train_predictions = model.predict(X_train)
    test_predictions = model.predict(X_test)

    # Probability scores are optional; ROC-AUC is only defined when present.
    if hasattr(model, 'predict_proba'):
        positive_scores = model.predict_proba(X_test)[:, 1]
    else:
        positive_scores = None

    results = {
        'model': model_name,
        'train_accuracy': accuracy_score(y_train, train_predictions),
        'test_accuracy': accuracy_score(y_test, test_predictions),
    }
    # zero_division=0 makes these metrics return 0 instead of warning when a
    # denominator is empty.
    for metric_key, scorer in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    ):
        results[metric_key] = scorer(y_test, test_predictions, zero_division=0)

    if positive_scores is None:
        results['roc_auc'] = None
    else:
        results['roc_auc'] = roc_auc_score(y_test, positive_scores)

    return results, test_predictions, positive_scores

In [None]:
# Registries shared by the training cells: fitted models by display name and
# one metrics dict per model.
models = {}
results_list = []

# 1. Logistic Regression (trained on the standardized features)
print("1. Logistic Regression 학습 중...")
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_scaled, y_train
)
models['Logistic Regression'] = lr_model

results = evaluate_model(
    lr_model, X_train_scaled, X_test_scaled, y_train, y_test, 'Logistic Regression'
)[0]
results_list.append(results)
print(f"   완료 - Test Accuracy: {results['test_accuracy']:.4f}, F1: {results['f1']:.4f}")

In [None]:
# 2. Random Forest (fitted on the unscaled features)
print("2. Random Forest 학습 중...")
rf_model = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42).fit(
    X_train, y_train
)
models['Random Forest'] = rf_model

results = evaluate_model(rf_model, X_train, X_test, y_train, y_test, 'Random Forest')[0]
results_list.append(results)
print(f"   완료 - Test Accuracy: {results['test_accuracy']:.4f}, F1: {results['f1']:.4f}")

In [None]:
# 3. Gradient Boosting (fitted on the unscaled features)
print("3. Gradient Boosting 학습 중...")
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42).fit(
    X_train, y_train
)
models['Gradient Boosting'] = gb_model

results = evaluate_model(gb_model, X_train, X_test, y_train, y_test, 'Gradient Boosting')[0]
results_list.append(results)
print(f"   완료 - Test Accuracy: {results['test_accuracy']:.4f}, F1: {results['f1']:.4f}")

In [None]:
# Collect the per-model metric dicts into one table and print it.
results_df = pd.DataFrame(data=results_list)
print("\n=== 모델 성능 비교 ===", results_df.to_string(index=False), sep="\n")

## 5. 모델 평가 시각화

In [None]:
# Grouped bar chart: one bar series per model, one group per metric.
metrics = ['test_accuracy', 'precision', 'recall', 'f1']
metric_names = ['정확도', '정밀도', '재현율', 'F1-Score']

_model_bars = [
    go.Bar(
        name=scores['model'],
        x=metric_names,
        y=[scores[metric] for metric in metrics],
    )
    for _, scores in results_df.iterrows()
]

fig = go.Figure(data=_model_bars)
fig.update_layout(
    barmode='group',
    template='plotly_white',
    title='모델별 성능 비교',
    yaxis_title='Score',
)
fig.show()

In [None]:
# Pick the model with the highest test F1 score.
_best_row = results_df['f1'].idxmax()
best_model_name = results_df.loc[_best_row, 'model']
print(f"최고 성능 모델: {best_model_name}")

# Logistic Regression was trained on standardized inputs, so it must be
# evaluated on the standardized test set; the other models use raw features.
best_model = models[best_model_name]
X_test_final = X_test_scaled if best_model_name == 'Logistic Regression' else X_test

# Confusion matrix of the selected model on the test split.
y_pred = best_model.predict(X_test_final)
cm = confusion_matrix(y_test, y_pred)

_heatmap = go.Heatmap(
    z=cm,
    x=['정상 예측', '어뷰징 예측'],
    y=['정상 실제', '어뷰징 실제'],
    text=cm,
    texttemplate='%{text}',
    colorscale='Blues',
)
fig = go.Figure(data=_heatmap)
fig.update_layout(
    template='plotly_white',
    title=f'{best_model_name} - 혼동 행렬',
)
fig.show()

In [None]:
# ROC curves of all trained models on the test split.
fig = go.Figure()

for model_label, classifier in models.items():
    # Logistic Regression expects the standardized features.
    test_inputs = X_test_scaled if model_label == 'Logistic Regression' else X_test
    y_proba = classifier.predict_proba(test_inputs)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)

    fig.add_trace(go.Scatter(
        x=fpr,
        y=tpr,
        mode='lines',
        name=f'{model_label} (AUC={auc:.3f})',
    ))

# Diagonal reference line for a random classifier.
_chance_line = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random',
    line=dict(dash='dash', color='gray'),
)
fig.add_trace(_chance_line)

fig.update_layout(
    template='plotly_white',
    title='ROC Curve 비교',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
)
fig.show()

## 6. 피처 중요도 분석

In [None]:
# Random Forest feature importances, sorted ascending so the most important
# feature ends up at the top of the horizontal bar chart.
_importance_table = pd.DataFrame({
    'feature': feature_columns,
    'importance': rf_model.feature_importances_,
})
feature_importance = _importance_table.sort_values(by='importance', ascending=True)

_importance_bars = go.Bar(
    x=feature_importance['importance'],
    y=feature_importance['feature'],
    marker_color='#636EFA',
    orientation='h',
)
fig = go.Figure(data=_importance_bars)
fig.update_layout(
    height=600,
    template='plotly_white',
    title='Random Forest - 피처 중요도',
    xaxis_title='Importance',
    yaxis_title='Feature',
)
fig.show()

# Ascending sort means the last ten rows are the ten largest importances.
print("\n=== Top 10 중요 피처 ===")
print(feature_importance.tail(10).to_string(index=False))

## 7. 모델 저장

In [None]:
import os

# Make sure the output directory exists.
os.makedirs('../models', exist_ok=True)

# File-name friendly form of the winning model's name.
_model_slug = best_model_name.lower().replace(" ", "_")

# Persist the selected model and the fitted scaler.
joblib.dump(best_model, f'../models/abusing_detector_{_model_slug}.pkl')
joblib.dump(scaler, '../models/scaler.pkl')

# Persist the feature order, one column name per line.
with open('../models/feature_columns.txt', 'w') as columns_file:
    columns_file.write('\n'.join(feature_columns))

print(f"모델 저장 완료: models/abusing_detector_{_model_slug}.pkl")
print("스케일러 저장 완료: models/scaler.pkl")
print("피처 목록 저장 완료: models/feature_columns.txt")

## 8. 예측 테스트

In [None]:
# Reload the persisted model to verify that it round-trips.
_model_slug = best_model_name.lower().replace(" ", "_")
loaded_model = joblib.load(f'../models/abusing_detector_{_model_slug}.pkl')

# Use the first test row as a smoke-test sample (kept as a one-row frame).
sample_idx = 0
sample = X_test.iloc[[sample_idx]]
actual = y_test.iloc[sample_idx]

# Only Logistic Regression was trained on standardized inputs.
if best_model_name == 'Logistic Regression':
    _model_input = scaler.transform(sample)
else:
    _model_input = sample

pred = loaded_model.predict(_model_input)[0]
proba = loaded_model.predict_proba(_model_input)[0]


def _class_label(flag):
    """Map a truthy/falsy class value to its display label."""
    return '어뷰징' if flag else '정상'


print("샘플 예측 테스트:")
print(f"  실제 값: {_class_label(actual)}")
print(f"  예측 값: {_class_label(pred)}")
print(f"  확률: 정상 {proba[0]:.2%}, 어뷰징 {proba[1]:.2%}")

## 9. 요약

### 모델 성능

In [None]:
# Final summary: selected model, its test metrics and the files written.
_divider = "=" * 60
print(_divider)
print("어뷰징 판매자 탐지 모델 개발 완료")
print(_divider)
print(f"\n최종 모델: {best_model_name}")
print("\n성능 지표:")

# Metrics row of the selected model.
best_results = results_df[results_df['model'] == best_model_name].iloc[0]
for _metric_label, _metric_key in (
    ("정확도", 'test_accuracy'),
    ("정밀도", 'precision'),
    ("재현율", 'recall'),
    ("F1-Score", 'f1'),
    ("ROC-AUC", 'roc_auc'),
):
    print(f"  - {_metric_label}: {best_results[_metric_key]:.4f}")

_model_slug = best_model_name.lower().replace(' ', '_')
print("\n저장된 파일:")
print(f"  - models/abusing_detector_{_model_slug}.pkl")
print("  - models/scaler.pkl")
print("  - models/feature_columns.txt")