# 피처 엔지니어링

In [None]:
import os
import pandas as pd

# Where the engineered feature table is written. The parent directory is
# created up front (no error if it already exists).
OUTPUT_PATH = '../data/processed/features.csv'
os.makedirs(os.path.split(OUTPUT_PATH)[0], exist_ok=True)

print("라이브러리 로드 완료")

## 1. 데이터 로드

In [None]:
# Read the four preprocessed tables the features are derived from.
sellers_df = pd.read_csv('../data/processed/ml_sellers.csv')
products_df = pd.read_csv('../data/processed/ml_products.csv')
reviews_df = pd.read_csv('../data/processed/ml_reviews.csv')
questions_df = pd.read_csv('../data/processed/ml_questions.csv')

# Report the row count of every table, one per line.
print(
    f"판매자: {len(sellers_df)}개",
    f"상품: {len(products_df)}개",
    f"리뷰: {len(reviews_df)}개",
    f"질문: {len(questions_df)}개",
    sep="\n",
)

## 2. 피처 엔지니어링

In [None]:
# Per-vendor product statistics.
#
# Named aggregation binds every output column to its (source column, function)
# pair. The result has flat column names directly, so the labels cannot drift
# out of step with the aggregation spec the way a positional relabel of
# MultiIndex columns can when an entry is added, removed or reordered.
product_stats = (
    products_df
    .groupby('vendor_name')
    .agg(
        product_count_actual=('product_id', 'count'),
        price_mean=('price', 'mean'),
        price_std=('price', 'std'),
        price_min=('price', 'min'),
        price_max=('price', 'max'),
        rating_mean=('product_rating', 'mean'),
        rating_std=('product_rating', 'std'),
        review_sum=('review_count', 'sum'),
        review_mean=('review_count', 'mean'),
        discount_mean=('discount_rate', 'mean'),
        discount_max=('discount_rate', 'max'),
        shipping_fee_mean=('shipping_fee', 'mean'),
        shipping_days_mean=('shipping_days', 'mean'),
    )
    .reset_index()
    # The grouping key is exposed as 'company_name', the key used when the
    # per-vendor tables are merged onto the seller table.
    .rename(columns={'vendor_name': 'company_name'})
)

# Replace NaN with 0 (e.g. the std of a vendor with a single product is NaN).
product_stats = product_stats.fillna(0)

print(f"상품 통계 생성: {len(product_stats)}개 판매자")
product_stats.head()

In [None]:
# Per-vendor review statistics.

# Reviews are keyed by product, so attach each review's vendor through a
# de-duplicated product -> vendor lookup. The left join keeps every review;
# reviews whose product is not in the lookup get a NaN vendor and therefore
# fall out of the groupby below.
product_vendor_map = products_df[['product_id', 'vendor_name']].drop_duplicates()
reviews_with_vendor = reviews_df.merge(product_vendor_map, on='product_id', how='left')

# Character length of the review text; a missing text counts as length 0.
# str() is applied so non-string cell values are measured by their string form.
reviews_with_vendor['text_length'] = reviews_with_vendor['review_text'].apply(
    lambda text: 0 if pd.isna(text) else len(str(text))
)

# Named aggregation gives each output column an explicit
# (source column, function) pair instead of relabelling MultiIndex columns
# by position, which silently mislabels features if the spec is edited.
review_stats = (
    reviews_with_vendor
    .groupby('vendor_name')
    .agg(
        review_count_actual=('id', 'count'),
        review_rating_mean=('review_rating', 'mean'),
        review_rating_std=('review_rating', 'std'),
        review_length_mean=('text_length', 'mean'),
        review_length_std=('text_length', 'std'),
        review_length_max=('text_length', 'max'),
    )
    .reset_index()
    .rename(columns={'vendor_name': 'company_name'})
)

# Replace NaN with 0 (e.g. the std of a vendor with a single review is NaN).
review_stats = review_stats.fillna(0)

print(f"리뷰 통계 생성: {len(review_stats)}개 판매자")

In [None]:
# Per-vendor question statistics.

# Attach the vendor to each question through the product -> vendor lookup
# (product_vendor_map is built in the review-statistics cell).
questions_with_vendor = questions_df.merge(product_vendor_map, on='product_id', how='left')

# 1 when the answer is present and not blank after stripping whitespace,
# otherwise 0; the mean of this flag per vendor is the answer rate.
questions_with_vendor['has_answer'] = questions_with_vendor['answer'].apply(
    lambda answer: 0 if pd.isna(answer) or str(answer).strip() == '' else 1
)

# Named aggregation states each output column's source and function
# explicitly instead of relabelling the aggregated columns by position.
question_stats = (
    questions_with_vendor
    .groupby('vendor_name')
    .agg(
        question_count=('id', 'count'),
        answer_rate=('has_answer', 'mean'),
    )
    .reset_index()
    .rename(columns={'vendor_name': 'company_name'})
)

print(f"질문 통계 생성: {len(question_stats)}개 판매자")

In [None]:
# Assemble the final feature table: start from the seller-level columns and
# left-join the product, review and question statistics on 'company_name',
# so every seller row is kept even when a statistics table has no match.
# Whatever is still missing after the joins is filled with 0.
features_df = (
    sellers_df[[
        'company_name',
        'satisfaction_score',
        'review_count',
        'total_product_count',
        'is_abusing_seller',
    ]]
    .copy()
    .merge(product_stats, on='company_name', how='left')
    .merge(review_stats, on='company_name', how='left')
    .merge(question_stats, on='company_name', how='left')
    .fillna(0)
)

print(f"최종 피처 데이터: {features_df.shape}")
features_df.head()

## 3. 결과 저장

In [None]:
# Persist the feature table as CSV, leaving the row index out of the file.
features_df.to_csv(path_or_buf=OUTPUT_PATH, index=False)
print(f"피처 데이터 저장 완료: {OUTPUT_PATH}")