# PHASE 3: E-commerce Fraud Detection System
## Advanced ML for Marketplace Fraud Detection

**Goal:** Build a production-ready fraud detection system with 5 fraud indicators

In [None]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    roc_auc_score, roc_curve, precision_recall_curve, auc,
    confusion_matrix, classification_report
)
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from imblearn.over_sampling import SMOTE
import warnings
# Notebook-wide setup: silence all warnings, seed NumPy's global RNG, and
# pick the plotting style and colour palette.
warnings.filterwarnings(action='ignore')

np.random.seed(seed=42)
# The matplotlib style is applied before the seaborn palette is chosen.
plt.style.use('default')
sns.set_palette(palette='husl')

In [None]:
# Load the product table from a CSV file in the working directory and report
# its size and column names.
df = pd.read_csv('all_products.csv')
# The multiplication sign was previously stored mis-encoded ("√ó").
print(f"Dataset: {df.shape[0]:,} rows × {df.shape[1]} columns")
print(f"\nColumns: {df.columns.tolist()}")
# Final expression of the cell: displays the first rows when run in a notebook.
df.head()

---
# PART A: SUPERVISED CLASSIFICATION
---

## Step 1: Data Analysis & Fraud Label Creation

In [None]:
# Summarise the key columns: how many values are present or missing, their
# range and central tendency, and the share of exact zeros among the
# non-missing values.
print("DATA ANALYSIS")
print("="*60)

key_cols = ['price_rub', 'seller_rating', 'seller_total_sold', 'seller_age_months', 'feedbacks']

for col in key_cols:
    # Columns that this dataset does not contain are skipped silently.
    if col not in df.columns:
        continue

    column = df[col]
    n_present = column.notna().sum()
    n_missing = column.isna().sum()

    print(f"\n{col}:")
    print(f"  Count: {n_present:,}")
    print(f"  Missing: {n_missing:,} ({n_missing/len(df)*100:.1f}%)")

    # The remaining statistics need at least one non-missing value.
    if not n_present > 0:
        continue

    print(f"  Min: {column.min():.2f}")
    print(f"  Max: {column.max():.2f}")
    print(f"  Mean: {column.mean():.2f}")
    print(f"  Median: {column.median():.2f}")
    n_zero = (column == 0).sum()
    print(f"  Zeros: {n_zero:,} ({n_zero/n_present*100:.1f}%)")

In [None]:
# Start the labelling work on a copy so the loaded frame `df` is not modified.
df_fraud = df.copy()

rule = "="*60
print("\n" + rule)
print("CREATING FRAUD LABELS - STRICT CRITERIA")
print(rule)

# Per-category mean and standard deviation of price, attached to every row
# through a left merge on `category`.
category_stats = (
    df_fraud.groupby('category')['price_rub']
    .agg(cat_mean='mean', cat_std='std')
    .reset_index()
)
df_fraud = df_fraud.merge(category_stats, how='left', on='category')

# Absolute distance from the category mean, divided by (std + 1); the +1
# keeps the denominator away from zero when a category has no price spread.
price_distance = df_fraud['price_rub'].sub(df_fraud['cat_mean']).abs()
df_fraud['price_zscore'] = price_distance.div(df_fraud['cat_std'] + 1)

# ============================================================
# 1. is_fake_reviews: Suspicious review patterns
# ============================================================
# Default: nothing is flagged. The rules below only run when the feedbacks
# column contains at least one non-missing, positive value.
df_fraud['is_fake_reviews'] = 0

age_young = 6  # sellers younger than this many months count as "new"
# Initialised before the guard so the summary below cannot raise NameError
# when the rules are skipped.
feedback_high = None

if df_fraud['feedbacks'].notna().sum() > 0 and df_fraud['feedbacks'].max() > 0:
    feedback_high = df_fraud['feedbacks'].quantile(0.9)  # Top 10%

    df_fraud['is_fake_reviews'] = (
        # Many reviews but very new seller (suspicious growth)
        ((df_fraud['feedbacks'] > feedback_high) &
         (df_fraud['seller_age_months'] < age_young)) |
        # Above-median number of reviews but a rating below 3.5
        ((df_fraud['feedbacks'] > df_fraud['feedbacks'].median()) &
         (df_fraud['seller_rating'] < 3.5))
    ).astype(int)

print(f"\n1. is_fake_reviews:")
if feedback_high is not None:
    print(f"   Logic: High feedbacks (>{feedback_high:.0f}) + new seller (<{age_young} months)")
    # The second rule compares against the median, not the 90th percentile.
    print(f"          OR feedbacks > median + low rating (<3.5)")
else:
    print(f"   Logic: skipped (no usable feedbacks data), nothing flagged")
print(f"   Result: {df_fraud['is_fake_reviews'].sum():,} / {len(df_fraud):,} ({df_fraud['is_fake_reviews'].mean()*100:.2f}%)")

# ============================================================
# 2. is_fraud_seller: MAIN TARGET - Low rating sellers
# ============================================================
RATING_THRESHOLD = 3.8  # ratings below this flag the row
AGE_THRESHOLD = 12      # months; younger accounts are treated as risky

seller_rating = df_fraud['seller_rating']
seller_age = df_fraud['seller_age_months']

# The three rules are OR-ed together. With the thresholds above, the second
# and third rules only select rows the first rule already selects.
low_rating = seller_rating < RATING_THRESHOLD
low_rating_and_young = low_rating & (seller_age < AGE_THRESHOLD)
extremely_low_rating = seller_rating < 3.0

df_fraud['is_fraud_seller'] = (
    low_rating | low_rating_and_young | extremely_low_rating
).astype(int)

print(f"\n2. is_fraud_seller (MAIN TARGET):")
print(f"   Logic: seller_rating < {RATING_THRESHOLD}")
print(f"          OR (rating < {RATING_THRESHOLD} AND age < {AGE_THRESHOLD} months)")
print(f"          OR rating < 3.0")
print(f"   Result: {df_fraud['is_fraud_seller'].sum():,} / {len(df_fraud):,} ({df_fraud['is_fraud_seller'].mean()*100:.2f}%)")

# ============================================================
# 3. is_low_quality: Proven bad quality (many complaints)
# ============================================================
feedbacks = df_fraud['feedbacks']
# The rule applies only when at least one non-missing, positive feedback
# count exists; otherwise every row keeps the default of 0.
has_feedback_data = feedbacks.notna().sum() > 0 and feedbacks.max() > 0

if has_feedback_data:
    feedback_median = feedbacks.median()

    # Rating below 4.0 combined with an above-median feedback count.
    below_four = df_fraud['seller_rating'] < 4.0
    many_feedbacks = feedbacks > feedback_median
    df_fraud['is_low_quality'] = (below_four & many_feedbacks).astype(int)
else:
    df_fraud['is_low_quality'] = 0

print(f"\n3. is_low_quality:")
print(f"   Logic: rating < 4.0 AND feedbacks > median")
print(f"   Result: {df_fraud['is_low_quality'].sum():,} / {len(df_fraud):,} ({df_fraud['is_low_quality'].mean()*100:.2f}%)")

# ============================================================
# 4. is_price_manipulation: Extreme price outliers only
# ============================================================
ZSCORE_THRESHOLD = 3  # cut-off applied to the price_zscore column

# Rows whose price_zscore exceeds the cut-off are flagged; a missing
# z-score compares as False and is therefore not flagged.
is_extreme_price = df_fraud['price_zscore'].gt(ZSCORE_THRESHOLD)
df_fraud['is_price_manipulation'] = is_extreme_price.astype(int)

print(f"\n4. is_price_manipulation:")
print(f"   Logic: price z-score > {ZSCORE_THRESHOLD} (extreme outlier)")
print(f"   Result: {df_fraud['is_price_manipulation'].sum():,} / {len(df_fraud):,} ({df_fraud['is_price_manipulation'].mean()*100:.2f}%)")

# ============================================================
# 5. fraud_score: Composite score (0-100)
# ============================================================
# Points contributed by each binary flag; is_fraud_seller carries the most.
flag_weights = {
    'is_fake_reviews': 15,
    'is_fraud_seller': 50,
    'is_low_quality': 15,
    'is_price_manipulation': 10,
}
weighted_flags = sum(df_fraud[flag] * points for flag, points in flag_weights.items())

# Continuous penalty: 0 points at rating 5, up to 10 points at rating 1.
rating_penalty = (5 - df_fraud['seller_rating'].clip(1, 5)) / 4 * 10

df_fraud['fraud_score'] = (weighted_flags + rating_penalty).clip(0, 100)

high_risk = df_fraud['fraud_score'] > 50

print(f"\n5. fraud_score (0-100):")
print(f"   Weights: fake_reviews(15%) + fraud_seller(50%) + low_quality(15%) + price(10%) + rating_penalty(10%)")
print(f"   Mean: {df_fraud['fraud_score'].mean():.2f}")
print(f"   Median: {df_fraud['fraud_score'].median():.2f}")
print(f"   High risk (>50): {high_risk.sum():,} ({high_risk.mean()*100:.2f}%)")

In [None]:
# Summarise the four binary fraud flags, derive an "any flag set" column and
# check that the main target has a workable number of positive cases.
fraud_cols = ['is_fake_reviews', 'is_fraud_seller', 'is_low_quality', 'is_price_manipulation']

print("\n" + "="*60)
print("FRAUD LABEL SUMMARY")
print("="*60)

for col in fraud_cols:
    count = df_fraud[col].sum()
    pct = df_fraud[col].mean() * 100
    print(f"{col:25s}: {count:5,} ({pct:5.2f}%)")

# A row counts as "any fraud" when at least one of the four flags is set.
df_fraud['any_fraud'] = (df_fraud[fraud_cols].sum(axis=1) > 0).astype(int)
print(f"\n{'ANY fraud flag':25s}: {df_fraud['any_fraud'].sum():5,} ({df_fraud['any_fraud'].mean()*100:5.2f}%)")

# Check if we have enough fraud cases for ML: fewer than 50 positives or a
# positive share above 40% triggers a warning.
# The status symbols below were previously stored mis-encoded.
fraud_count = df_fraud['is_fraud_seller'].sum()
if fraud_count < 50:
    print(f"\n⚠️ WARNING: Only {fraud_count} fraud cases! May need to adjust thresholds.")
elif fraud_count > len(df_fraud) * 0.4:
    print(f"\n⚠️ WARNING: Too many fraud cases ({fraud_count}/{len(df_fraud)} = {fraud_count/len(df_fraud)*100:.1f}%)!")
    print("   Thresholds may be too lenient.")
else:
    print(f"\n✅ Good fraud rate: {fraud_count} cases ({fraud_count/len(df_fraud)*100:.1f}%)")

In [None]:
# Visualize fraud labels on a 2x3 grid: one bar chart per flag, then the
# fraud-score histogram and the overall "any fraud" bar chart.
def _plot_flag_counts(ax, flags, tick_labels, title):
    """Draw a two-bar chart of flag == 0 vs flag == 1 with count/percent labels."""
    tally = flags.value_counts()
    heights = [tally.get(0, 0), tally.get(1, 0)]
    ax.bar(tick_labels, heights, color=['green', 'red'], alpha=0.7, edgecolor='black')
    ax.set_title(title, fontweight='bold', fontsize=12)
    ax.set_ylabel('Count')
    ax.grid(True, alpha=0.3, axis='y')

    # Annotate only non-empty bars, slightly above the bar top.
    for x_pos, height in enumerate(heights):
        if height > 0:
            ax.text(x_pos, height + 50, f'{height:,}\n({height/len(df_fraud)*100:.1f}%)',
                    ha='center', fontsize=10, fontweight='bold')


fig, axes = plt.subplots(2, 3, figsize=(16, 10))

# The four flags fill the grid row by row: the whole first row plus the
# first panel of the second row.
for panel, col in zip(axes.flat, fraud_cols):
    _plot_flag_counts(panel, df_fraud[col], ['Normal', 'Fraud'],
                      col.replace('_', ' ').title())

# Fraud score
score_ax = axes[1, 1]
score_ax.hist(df_fraud['fraud_score'], bins=50, color='orange', alpha=0.7, edgecolor='black')
score_ax.axvline(50, color='red', linestyle='--', linewidth=2, label='High Risk (>50)')
score_ax.set_title('Fraud Score Distribution', fontweight='bold', fontsize=12)
score_ax.set_xlabel('Score (0-100)')
score_ax.set_ylabel('Count')
score_ax.legend()
score_ax.grid(True, alpha=0.3)

# Any fraud
_plot_flag_counts(axes[1, 2], df_fraud['any_fraud'], ['Clean', 'Fraud'],
                  'Overall Fraud Status')

plt.tight_layout()
plt.show()

## Step 2: Feature Analysis

In [None]:
# Feature distributions by fraud status: overlaid density histograms of each
# usable feature for TARGET == 0 and TARGET == 1.
TARGET = 'is_fraud_seller'

# Keep only candidates that exist in the frame AND have more than 100
# non-missing values. The membership test avoids a KeyError when a column is
# absent from the dataset.
features = ['price_rub', 'seller_rating', 'seller_age_months']
features = [
    f for f in features
    if f in df_fraud.columns and df_fraud[f].notna().sum() > 100
]

if features:
    # squeeze=False always returns a 2-D axes array, so a single feature
    # needs no special handling.
    fig, axes = plt.subplots(1, len(features), figsize=(6*len(features), 5), squeeze=False)
    axes = axes[0]

    for ax, col in zip(axes, features):
        fraud = df_fraud[df_fraud[TARGET] == 1][col].dropna()
        normal = df_fraud[df_fraud[TARGET] == 0][col].dropna()

        # A panel is drawn only when both classes have data to compare.
        if len(fraud) > 0 and len(normal) > 0:
            ax.hist(normal, bins=50, alpha=0.6, label='Normal', color='green', density=True)
            ax.hist(fraud, bins=50, alpha=0.6, label='Fraud', color='red', density=True)
            ax.set_title(col, fontweight='bold')
            ax.set_xlabel(col)
            ax.set_ylabel('Density')
            ax.legend()
            ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()
else:
    print("Not enough feature data for distribution plots")

In [None]:
# Correlation analysis over the usable features, the four flags and the
# composite score, using only rows where all of them are present.
corr_cols = features + fraud_cols + ['fraud_score']
df_corr = df_fraud[corr_cols].dropna()

if len(df_corr) > 0:
    corr_matrix = df_corr.corr()

    heatmap_options = dict(
        annot=True, fmt='.2f', cmap='RdYlGn_r', center=0,
        square=True, linewidths=1, cbar_kws={"shrink": 0.8},
    )
    plt.figure(figsize=(12, 10))
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.title('Feature Correlation Matrix', fontsize=14, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Columns ranked by the absolute strength of their correlation with TARGET.
    print("\nTop correlations with fraud:")
    fraud_corr = corr_matrix[TARGET].abs().sort_values(ascending=False)
    print(fraud_corr.iloc[:8])

## Step 3: Model Training

In [None]:
# Prepare data: build the modelling frame, verify both classes are present,
# split 80/20 with stratification and standardise the features.
# NOTE(review): TARGET is computed directly from seller_rating (rating below
# RATING_THRESHOLD) and seller_rating is also a candidate model feature, so
# the classifiers can recover the labelling rule from that single column.
# Confirm this leakage is intended before relying on the reported metrics.
model_features = [f for f in features if f in df_fraud.columns]
# Rows with any missing feature or target value are dropped.
df_model = df_fraud[model_features + [TARGET]].dropna()

X = df_model[model_features]
y = df_model[TARGET]

print(f"\n{'='*60}")
print("MODEL TRAINING")
print(f"{'='*60}")
print(f"\nDataset: {X.shape}")
print(f"Features: {model_features}")
print(f"\nTarget distribution:")
print(y.value_counts())

# Check if we can proceed: training needs both classes.
# The status symbols below were previously stored mis-encoded.
if len(y.value_counts()) < 2:
    print("\n❌ ERROR: Only one class! Cannot train models.")
    raise ValueError("Insufficient fraud cases for training")

if y.sum() < 20:
    print(f"\n⚠️ WARNING: Very few fraud cases ({y.sum()}). Results may be unreliable.")

# Split (stratified so train and test keep the same class ratio)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain: {X_train.shape} ({y_train.sum()} fraud, {(y_train==0).sum()} normal)")
print(f"Test:  {X_test.shape} ({y_test.sum()} fraud, {(y_test==0).sum()} normal)")

# Scale: the scaler is fitted on the training split only and then applied
# to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Handle imbalance: oversample the training split with SMOTE when the
# smaller class is large enough, otherwise fall back to the unmodified
# training data (downstream models then rely on class weighting).
print(f"\nClass balance strategy:")

# Fallback values; replaced only when SMOTE succeeds.
X_train_balanced = X_train_scaled
y_train_balanced = y_train
use_smote = False

# Need at least 6 samples in the minority class for SMOTE. The size of the
# smaller class is checked explicitly instead of assuming class 1 is the
# minority.
minority_count = y_train.value_counts().min()

if minority_count >= 6:
    try:
        smote = SMOTE(random_state=42)
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)
    except Exception as exc:
        # Best-effort fallback, but report the cause instead of hiding it;
        # unlike a bare `except`, this lets KeyboardInterrupt propagate.
        print(f"⚠️ SMOTE failed ({exc}), using class weights")
    else:
        use_smote = True
        print(f"✅ SMOTE applied")
        print(f"   Before: Normal={(y_train==0).sum()}, Fraud={y_train.sum()}")
        print(f"   After:  Normal={(y_train_balanced==0).sum()}, Fraud={y_train_balanced.sum()}")
else:
    print(f"⚠️ Too few minority samples, using class weights")

In [None]:
# Train models: fit three classifiers on the (possibly SMOTE-balanced)
# training data and collect them in `models`, keyed by display name.
# sklearn is already a dependency of this notebook; the helper is imported
# here because only this cell needs it.
from sklearn.utils.class_weight import compute_sample_weight

models = {}

# When SMOTE was not applied, X_train_balanced / y_train_balanced hold the
# unmodified scaled training data, so the same arrays serve both cases and
# only the imbalance handling differs.
class_weight = None if use_smote else 'balanced'

# Logistic Regression
print("\nTraining Logistic Regression...")
lr = LogisticRegression(random_state=42, max_iter=1000, class_weight=class_weight)
lr.fit(X_train_balanced, y_train_balanced)
models['Logistic Regression'] = lr

# Random Forest
print("Training Random Forest...")
rf = RandomForestClassifier(n_estimators=100, max_depth=10, class_weight=class_weight,
                            random_state=42, n_jobs=-1)
rf.fit(X_train_balanced, y_train_balanced)
models['Random Forest'] = rf

# Gradient Boosting
# GradientBoostingClassifier has no class_weight parameter, so without SMOTE
# the balancing is passed as per-sample weights; previously this model was
# trained with no imbalance handling at all in that case.
print("Training Gradient Boosting...")
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
gb_sample_weight = None if use_smote else compute_sample_weight('balanced', y_train_balanced)
gb.fit(X_train_balanced, y_train_balanced, sample_weight=gb_sample_weight)
models['Gradient Boosting'] = gb

print("\n✅ All models trained")

## Step 4: Evaluation

In [None]:
# Evaluate all models on the held-out test split and pick the one with the
# highest F1 score.
results = []

for name, model in models.items():
    y_pred = model.predict(X_test_scaled)
    # Probability of the positive class, used by the ranking metrics.
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    # zero_division=0 reports 0 instead of warning when a model predicts no
    # positives.
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, zero_division=0)
    rec = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc = roc_auc_score(y_test, y_proba)

    # PR-AUC: area under the precision-recall curve (recall on the x axis).
    precision_curve, recall_curve, _ = precision_recall_curve(y_test, y_proba)
    pr_auc = auc(recall_curve, precision_curve)

    results.append({
        'Model': name,
        'Accuracy': acc,
        'Precision': prec,
        'Recall': rec,
        'F1': f1,
        'ROC-AUC': roc,
        'PR-AUC': pr_auc
    })

results_df = pd.DataFrame(results)
print("\n" + "="*80)
print("MODEL PERFORMANCE")
print("="*80)
print(results_df.to_string(index=False))

# `best` is the results row of the top-F1 model; later cells read it.
best_idx = results_df['F1'].idxmax()
best = results_df.loc[best_idx]
# The trophy symbol was previously stored mis-encoded.
print(f"\n🏆 Best Model: {best['Model']} (F1={best['F1']:.4f})")

In [None]:
# Confusion Matrices: one heatmap per trained model, side by side.
class_names = ['Normal', 'Fraud']
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel, (name, model) in zip(axes, models.items()):
    cm = confusion_matrix(y_test, model.predict(X_test_scaled))
    # Rows are the true classes, columns the predicted classes.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=panel,
                xticklabels=class_names, yticklabels=class_names)
    panel.set_title(name, fontweight='bold')
    panel.set_ylabel('True')
    panel.set_xlabel('Predicted')

plt.tight_layout()
plt.show()

In [None]:
# ROC & PR Curves: left panel ROC, right panel precision-recall, one line
# per model on each.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
roc_ax, pr_ax = axes

colors = {'Logistic Regression': 'blue', 'Random Forest': 'green', 'Gradient Boosting': 'red'}

# A single pass per model feeds both panels from the same probabilities.
for name, model in models.items():
    y_proba = model.predict_proba(X_test_scaled)[:, 1]
    line_style = {'color': colors[name], 'linewidth': 2}

    # ROC
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = roc_auc_score(y_test, y_proba)
    roc_ax.plot(fpr, tpr, label=f'{name} (AUC={roc_auc:.3f})', **line_style)

    # PR
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    pr_auc = auc(recall, precision)
    pr_ax.plot(recall, precision, label=f'{name} (AUC={pr_auc:.3f})', **line_style)

# Diagonal reference line for the ROC panel.
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random', linewidth=1)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves', fontweight='bold')
roc_ax.legend()
roc_ax.grid(True, alpha=0.3)

# Horizontal reference for the PR panel: the positive rate of the test set.
baseline = y_test.mean()
pr_ax.axhline(y=baseline, color='k', linestyle='--', label=f'Baseline ({baseline:.3f})', linewidth=1)
pr_ax.set_xlabel('Recall')
pr_ax.set_ylabel('Precision')
pr_ax.set_title('Precision-Recall Curves', fontweight='bold')
pr_ax.legend()
pr_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Feature importance (Random Forest): tabulate and plot the importances the
# fitted forest exposes, most important first.
if hasattr(rf, 'feature_importances_'):
    importance_df = pd.DataFrame(
        {'Feature': model_features, 'Importance': rf.feature_importances_}
    )
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    print("\nFeature Importance (Random Forest):")
    print(importance_df.to_string(index=False))

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.barh(importance_df['Feature'], importance_df['Importance'], color='steelblue', alpha=0.7)
    ax.set_xlabel('Importance')
    ax.set_title('Feature Importance', fontweight='bold')
    # Flip the axis so the first (largest) bar is drawn at the top.
    ax.invert_yaxis()
    ax.grid(True, alpha=0.3, axis='x')
    fig.tight_layout()
    plt.show()

## Step 5: Business Insights

In [None]:
# Final report for the classification part: the labelling thresholds, how
# many rows each flag marks, and the metrics of the best model.
# The emoji and bullet symbols in this cell were previously stored
# mis-encoded.
print("\n" + "="*80)
print("FRAUD DETECTION SYSTEM - RESULTS")
print("="*80)

print(f"\n🎯 FRAUD DETECTION CRITERIA:")
print(f"   • Low rating: seller_rating < {RATING_THRESHOLD}")
print(f"   • Young account: seller_age_months < {AGE_THRESHOLD}")
print(f"   • Price outlier: z-score > {ZSCORE_THRESHOLD}")

print(f"\n📊 FRAUD STATISTICS:")
print(f"   Total products: {len(df_fraud):,}")
print(f"   Fraud sellers: {df_fraud['is_fraud_seller'].sum():,} ({df_fraud['is_fraud_seller'].mean()*100:.2f}%)")
print(f"   Fake reviews: {df_fraud['is_fake_reviews'].sum():,} ({df_fraud['is_fake_reviews'].mean()*100:.2f}%)")
print(f"   Low quality: {df_fraud['is_low_quality'].sum():,} ({df_fraud['is_low_quality'].mean()*100:.2f}%)")
print(f"   Price manipulation: {df_fraud['is_price_manipulation'].sum():,} ({df_fraud['is_price_manipulation'].mean()*100:.2f}%)")

# `best` is the results row of the model with the highest F1 score.
print(f"\n🏆 BEST MODEL: {best['Model']}")
print(f"   F1-Score:  {best['F1']:.4f}")
print(f"   Precision: {best['Precision']:.4f} ({best['Precision']*100:.1f}% of flagged are real fraud)")
print(f"   Recall:    {best['Recall']:.4f} (catches {best['Recall']*100:.1f}% of frauds)")
print(f"   ROC-AUC:   {best['ROC-AUC']:.4f}")

print(f"\n💡 KEY INSIGHTS:")
# importance_df only exists when the forest exposes feature importances,
# hence the same guard as in the feature-importance cell.
if hasattr(rf, 'feature_importances_'):
    print(f"   Most important: {importance_df.iloc[0]['Feature']} ({importance_df.iloc[0]['Importance']:.3f})")
print(f"   Main fraud indicator: seller_rating < {RATING_THRESHOLD}")
print(f"   System ready for production")

print("\n✅ CLASSIFICATION COMPLETE")

---
# PART B: CLUSTERING - SELLER SEGMENTATION
---

In [None]:
# Prepare clustering data: select usable features, drop incomplete rows,
# standardise, and project to two principal components for plotting.
cluster_features = []
for candidate in ['seller_rating', 'seller_age_months', 'price_rub']:
    # A feature is kept only if the column exists and has more than 100
    # non-missing values.
    if candidate in df_fraud.columns and df_fraud[candidate].notna().sum() > 100:
        cluster_features.append(candidate)

df_cluster = df_fraud[cluster_features].dropna()

rule = '='*60
print(f"\n{rule}")
print("PART B: SELLER CLUSTERING")
print(f"{rule}")
print(f"\nDataset: {df_cluster.shape}")
print(f"Features: {cluster_features}")

# Scale
scaler_c = StandardScaler()
X_cluster = scaler_c.fit_transform(df_cluster)

# PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_cluster)

explained_pct = pca.explained_variance_ratio_.sum()*100
print(f"\nPCA variance explained: {explained_pct:.1f}%")

In [None]:
# Find optimal k: fit K-Means for k = 2..8, record inertia and silhouette
# score for each, and keep the k with the highest silhouette.
K_range = range(2, 9)
inertias = []
silhouettes = []

for k in K_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = km.fit_predict(X_cluster)
    inertias.append(km.inertia_)
    silhouettes.append(silhouette_score(X_cluster, labels))

# argmax returns the position of the first maximum, which maps back to k.
best_position = int(np.argmax(silhouettes))
optimal_k = K_range[best_position]
print(f"\nOptimal k: {optimal_k} (silhouette={max(silhouettes):.3f})")

# Plot: inertia (elbow) on the left, silhouette on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
elbow_ax, silhouette_ax = axes

elbow_ax.plot(K_range, inertias, 'bo-', linewidth=2)
elbow_ax.set_xlabel('k')
elbow_ax.set_ylabel('Inertia')
elbow_ax.set_title('Elbow Method')
elbow_ax.grid(True, alpha=0.3)

silhouette_ax.plot(K_range, silhouettes, 'ro-', linewidth=2)
silhouette_ax.axvline(optimal_k, color='g', linestyle='--', label=f'Optimal k={optimal_k}')
silhouette_ax.set_xlabel('k')
silhouette_ax.set_ylabel('Silhouette Score')
silhouette_ax.set_title('Silhouette Analysis')
silhouette_ax.legend()
silhouette_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Final clustering: refit K-Means with the selected k, store the assignment
# on df_cluster and show the clusters in the 2-D PCA projection.
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
clusters = kmeans.fit_predict(X_cluster)
df_cluster['cluster'] = clusters

cluster_sizes = df_cluster['cluster'].value_counts().sort_index()
print(f"\nClusters:")
print(cluster_sizes)

# Visualize
pc1_pct, pc2_pct = pca.explained_variance_ratio_[0]*100, pca.explained_variance_ratio_[1]*100
plt.figure(figsize=(10, 7))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='viridis', alpha=0.6, s=10)
plt.xlabel(f'PC1 ({pc1_pct:.1f}%)')
plt.ylabel(f'PC2 ({pc2_pct:.1f}%)')
plt.title(f'K-Means Clustering (k={optimal_k})', fontweight='bold')
plt.colorbar(scatter, label='Cluster')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Cluster profiles: mean of every clustering feature per cluster.
profiles = df_cluster.groupby('cluster')[cluster_features].mean()
print("\nCluster Profiles:")
print(profiles.round(2))

# Add fraud rates: bring the main target back in by index, then aggregate
# it per cluster (rate in percent, number of flagged rows, cluster size).
df_cluster['is_fraud'] = df_fraud.loc[df_cluster.index, 'is_fraud_seller']
fraud_rates = df_cluster.groupby('cluster')['is_fraud'].agg(
    Fraud_Rate='mean', Fraud_Count='sum', Total='count'
)
fraud_rates['Fraud_Rate'] = fraud_rates['Fraud_Rate'] * 100
print("\nFraud Rates:")
print(fraud_rates.round(2))

# Heatmap of the profiles, standardised per feature across clusters so
# features on different scales can share one colour map.
profiles_norm = profiles.sub(profiles.mean()).div(profiles.std())
plt.figure(figsize=(10, 5))
sns.heatmap(profiles_norm.transpose(), annot=True, fmt='.2f', cmap='RdYlGn', center=0, linewidths=1)
plt.title('Cluster Profiles (Normalized)', fontweight='bold')
plt.xlabel('Cluster')
plt.ylabel('Feature')
plt.tight_layout()
plt.show()

In [None]:
# Business interpretation: print each cluster's averages and fraud rate and
# attach a risk label.
# The emoji in the labels and banners were previously stored mis-encoded.
print("\n" + "="*60)
print("CLUSTER INTERPRETATION")
print("="*60)

for c in range(optimal_k):
    data = df_cluster[df_cluster['cluster'] == c]
    # Averages default to 0 when a feature was not used for clustering.
    rating = data['seller_rating'].mean() if 'seller_rating' in cluster_features else 0
    age = data['seller_age_months'].mean() if 'seller_age_months' in cluster_features else 0
    price = data['price_rub'].mean() if 'price_rub' in cluster_features else 0
    fraud_rate = data['is_fraud'].mean()

    # NOTE(review): df_cluster is built row by row from df_fraud, whose rows
    # are reported as products elsewhere in this notebook, so the "sellers"
    # count below is presumably a count of product rows. Confirm.
    print(f"\nCluster {c} ({len(data):,} sellers):")
    if 'seller_rating' in cluster_features:
        print(f"  Rating: {rating:.2f}")
    if 'seller_age_months' in cluster_features:
        print(f"  Age: {age:.1f} months")
    if 'price_rub' in cluster_features:
        print(f"  Avg Price: {price:.0f} RUB")
    print(f"  Fraud rate: {fraud_rate*100:.1f}%")

    # Fraud rate decides the label first; the average rating only separates
    # the two low-risk labels.
    if fraud_rate > 0.3:
        label = "🚨 HIGH RISK"
    elif fraud_rate > 0.15:
        label = "⚠️ MEDIUM RISK"
    elif rating >= 4.5:
        label = "🌟 TRUSTED"
    else:
        label = "✅ RELIABLE"

    print(f"  Label: {label}")

print("\n✅ CLUSTERING COMPLETE")
print("\n" + "="*60)
print("🎉 PHASE 3 COMPLETE")
print("="*60)