In [6]:
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np
import matplotlib.patches as mpatches

class FraudVisualization:
    """Render and save diagnostic charts for the spam-to-fraud text pipeline.

    Every ``plot_*`` method draws one figure, writes it as a PNG inside
    ``output_dir`` and returns the path of the file it wrote.  Figures are
    always closed afterwards, including when drawing or saving fails.
    """

    # Fixed colours so each class looks the same in every chart.
    LEGIT_COLOR = '#2E8B57'
    FRAUD_COLOR = '#DC143C'

    def __init__(self, output_dir='./outputs/text'):
        """Create the output directory and apply plot-wide defaults.

        Args:
            output_dir: Directory the PNG files are written to; it is created
                when missing.

        Note:
            This mutates the process-wide ``plt.rcParams`` and the seaborn
            palette, so figures created elsewhere afterwards inherit them.
        """
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
        plt.rcParams['figure.figsize'] = (10, 6)
        plt.rcParams['font.size'] = 10
        sns.set_palette("viridis")

    def _save_figure(self, fig, basename):
        """Write *fig* to ``output_dir/basename`` and return the file path."""
        filename = os.path.join(self.output_dir, basename)
        fig.savefig(filename, dpi=300, bbox_inches='tight')
        return filename

    def plot_class_distribution(self, df):
        """Plot legitimate-vs-fraud sample counts as a bar chart and a pie chart.

        Args:
            df: DataFrame with an ``is_fraud`` column whose values are 0
                (legitimate) or 1 (fraud).  Rows with any other value are
                ignored.

        Returns:
            Path of the saved ``class_distribution.png``.

        Raises:
            ValueError: If no row is labelled 0 or 1.
        """
        labels = df['is_fraud']
        # Count each class explicitly.  ``value_counts()`` orders by frequency,
        # so using its values positionally would swap the labels whenever
        # fraud is the majority class and would drop an absent class entirely.
        counts = [int((labels == 0).sum()), int((labels == 1).sum())]
        if sum(counts) == 0:
            raise ValueError("df['is_fraud'] contains no samples labelled 0 or 1")

        colors = [self.LEGIT_COLOR, self.FRAUD_COLOR]
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        try:
            ax1.bar(['Legitimate (0)', 'Fraud (1)'], counts, color=colors, alpha=0.8)
            ax1.set_title('Sample Count Distribution')
            ax1.set_ylabel('Number of Samples')
            # Print the exact count just above each bar.
            for i, v in enumerate(counts):
                ax1.text(i, v + 10, str(v), ha='center', va='bottom', fontweight='bold')

            ax2.pie(counts, labels=['Legitimate', 'Fraud'], colors=colors,
                    autopct='%1.1f%%', startangle=90)
            ax2.set_title('Class Distribution Percentage')

            fig.suptitle('Dataset Class Distribution', fontsize=16, fontweight='bold')
            fig.tight_layout()
            return self._save_figure(fig, 'class_distribution.png')
        finally:
            # Close this specific figure even on failure so it is not leaked.
            plt.close(fig)

    def plot_fraud_features(self, df):
        """Plot the per-class mean of each fraud-indicator feature side by side.

        Args:
            df: DataFrame with an ``is_fraud`` column (0 or 1) and the six
                ``has_*`` indicator columns listed in the method body.

        Returns:
            Path of the saved ``fraud_features_comparison.png``.
        """
        fraud_features = ['has_money_mention', 'has_urgency', 'has_action_request',
                          'has_caps_abuse', 'has_free_offer', 'has_excessive_exclamation']
        labels = df['is_fraud']
        legit_rows = labels == 0
        fraud_rows = labels == 1

        def prevalence(feature, rows):
            # A class with no rows is plotted as 0 instead of raising KeyError.
            if not rows.any():
                return 0.0
            return df.loc[rows, feature].mean()

        legit_values = [prevalence(f, legit_rows) for f in fraud_features]
        fraud_values = [prevalence(f, fraud_rows) for f in fraud_features]

        fig, ax = plt.subplots(figsize=(12, 8))
        try:
            x = np.arange(len(fraud_features))
            width = 0.35
            # Offset the two series by half a bar width so they sit side by side.
            bars1 = ax.bar(x - width/2, legit_values, width, label='Legitimate',
                           color=self.LEGIT_COLOR, alpha=0.8)
            bars2 = ax.bar(x + width/2, fraud_values, width, label='Fraud',
                           color=self.FRAUD_COLOR, alpha=0.8)
            ax.set_xlabel('Fraud Indicator Features')
            ax.set_ylabel('Prevalence Rate')
            ax.set_title('Fraud Indicator Features Comparison')
            ax.set_xticks(x)
            # 'has_money_mention' -> 'Money Mention'
            ax.set_xticklabels(
                [f.replace('has_', '').replace('_', ' ').title() for f in fraud_features],
                rotation=45, ha='right')
            ax.legend()
            ax.grid(True, alpha=0.3)

            # Annotate every bar of both series with its height.
            for bar in (*bars1, *bars2):
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                        f'{height:.2f}', ha='center', va='bottom', fontsize=9)

            fig.tight_layout()
            return self._save_figure(fig, 'fraud_features_comparison.png')
        finally:
            plt.close(fig)

    def plot_performance_metrics(self, metrics):
        """Plot one bar per metric on a fixed 0 to 1.1 score axis.

        Args:
            metrics: Mapping of metric name to numeric score.  Bars appear in
                the mapping's iteration order.

        Returns:
            Path of the saved ``performance_metrics.png``.
        """
        metric_names = list(metrics.keys())
        metric_values = list(metrics.values())
        colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            bars = ax.bar(metric_names, metric_values, color=colors, alpha=0.8)
            ax.set_title('Model Performance Metrics')
            ax.set_ylabel('Score')
            # Headroom above 1.0 keeps the value labels inside the axes.
            ax.set_ylim(0, 1.1)
            ax.grid(True, alpha=0.3)
            for bar, value in zip(bars, metric_values):
                ax.text(bar.get_x() + bar.get_width()/2., bar.get_height() + 0.01,
                        f'{value:.3f}', ha='center', va='bottom', fontweight='bold')
            fig.tight_layout()
            return self._save_figure(fig, 'performance_metrics.png')
        finally:
            plt.close(fig)


def add_visualization_to_pipeline(spam_csv_path='spam.csv'):
    """Run the spam-to-fraud text pipeline and save charts along the way.

    Steps, in order: load and adapt the spam CSV, derive fraud features, plot
    the class distribution and feature comparison, make a stratified 70/30
    train/test split, fit a ``SpamToFraudClassifier``, score it on the test
    split, plot the scores and save the test-set predictions.

    Args:
        spam_csv_path: Path handed to ``load_and_adapt_spam_data``.

    Returns:
        A ``(predictions_df, metrics, classifier)`` tuple, where
        ``predictions_df`` is whatever ``save_predictions_for_multimodal``
        returns, ``metrics`` maps ``accuracy``/``precision``/``recall``/
        ``f1``/``auc`` to their scores, and ``classifier`` is the fitted model.
    """
    # Data preparation: raw CSV -> adapted frame -> fraud feature frame.
    features = extract_fraud_features_from_spam(load_and_adapt_spam_data(spam_csv_path))

    # Dataset-level charts are drawn before any model is trained.
    charts = FraudVisualization()
    charts.plot_class_distribution(features)
    charts.plot_fraud_features(features)

    # Model inputs: raw message text and the binary fraud label.
    messages = features['message'].values
    labels = features['is_fraud'].values
    X_train, X_test, y_train, y_test = train_test_split(
        messages, labels, test_size=0.3, random_state=RANDOM_STATE, stratify=labels)

    model = SpamToFraudClassifier()
    model.fit(X_train, y_train)

    predicted = model.predict(X_test)
    # Second column of predict_proba; presumably the fraud-class score (confirm
    # against SpamToFraudClassifier).
    fraud_scores = model.predict_proba(X_test)[:, 1]

    # Build the scores in a fixed order; the order drives the chart's bar order.
    scores = {'accuracy': accuracy_score(y_test, predicted)}
    for key, scorer in (('precision', precision_score),
                        ('recall', recall_score),
                        ('f1', f1_score)):
        scores[key] = scorer(y_test, predicted, zero_division=0)
    scores['auc'] = roc_auc_score(y_test, fraud_scores)

    charts.plot_performance_metrics(scores)

    saved_predictions = save_predictions_for_multimodal(y_test, predicted, fraud_scores)
    return saved_predictions, scores, model

# Execute pipeline with visualization.  The three results are kept as
# notebook-level names so later cells can reuse them.
predictions, performance, model = add_visualization_to_pipeline('spam.csv')

# Report every metric to four decimals, in the order the pipeline produced them.
print("\n🎯 Performance Metrics with Visualization:")
for metric, value in performance.items():
    print(f"{metric.upper()}: {value:.4f}")

print("\n✅ Visualization-enhanced spam-to-fraud pipeline complete. See './outputs' for charts.")

📧 Loading and adapting spam.csv for fraud detection...
✅ Loaded real spam.csv with 5169 samples
   Fraud (spam): 653
   Legitimate (ham): 4516
💾 Saved predictions to: ./outputs/text\Linear_SVM_predictions.csv
   Format: y_true, y_pred, y_prob
   Samples: 1551

🎯 Performance Metrics with Visualization:
ACCURACY: 0.9736
PRECISION: 0.8934
RECALL: 0.8980
F1: 0.8957
AUC: 0.9900

✅ Visualization-enhanced spam-to-fraud pipeline complete. See './outputs' for charts.
