In [4]:
import gdown
import zipfile
import pandas as pd

# Download ZIP file
zip_path = "/content/sample_data/user-wallet-transactions.json.zip"
url = "https://drive.google.com/uc?id=14ceBCLQ-BTcydDrFJauVA_PKAZ7VtDor"
gdown.download(url, zip_path, quiet=False)

# Extract only the main JSON file
with zipfile.ZipFile(zip_path, 'r') as z:
    # Get list of files and select the primary JSON file
    file_list = z.namelist()
    json_file = [f for f in file_list
                 if "user-wallet-transactions.json" in f and "__MACOSX" not in f][0]

    # Extract the target JSON file
    z.extract(json_file)

# Load data from extracted JSON
df = pd.read_json(json_file, convert_dates=['timestamp'])

print(f"Successfully loaded {len(df):,} transactions")
print("Columns:", df.columns.tolist())

Downloading...
From: https://drive.google.com/uc?id=14ceBCLQ-BTcydDrFJauVA_PKAZ7VtDor
To: /content/sample_data/user-wallet-transactions.json.zip
100%|██████████| 10.4M/10.4M [00:00<00:00, 61.3MB/s]


Successfully loaded 100,000 transactions
Columns: ['_id', 'userWallet', 'network', 'protocol', 'txHash', 'logId', 'timestamp', 'blockNumber', 'action', 'actionData', '__v', 'createdAt', 'updatedAt']


In [5]:
!pip install gradio pandas numpy matplotlib seaborn scikit-learn yellowbrick fpdf yfinance networkx shap diffprivlib joblib



In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, silhouette_score  # Added silhouette_score here
from sklearn.calibration import calibration_curve
from sklearn.decomposition import PCA
from yellowbrick.cluster import KElbowVisualizer
from sklearn.manifold import TSNE
import json
import os
from fpdf import FPDF
import yfinance as yf
import networkx as nx
import shap
import joblib
from diffprivlib.models import KMeans as DPKMeans
from sklearn.cluster import DBSCAN
from scipy.stats import zscore
import warnings

warnings.filterwarnings('ignore')

# Rest of your code remains the same...

# Configuration
RANDOM_STATE = 42
MAX_CLUSTERS = 10  # Max clusters for elbow method
N_CLUSTERS = None  # Will be determined dynamically
MIN_SAMPLES = 5     # Minimum samples per cluster
REPORT_PATH = "reports/"
os.makedirs(REPORT_PATH, exist_ok=True)

def process_transactions(df):
    """Process transaction data and extract wallet-level features using vectorized operations"""
    print("Processing transactions...")

    # Use 'userWallet' as wallet identifier
    wallet_col = 'userWallet'

    # Convert timestamp to datetime
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    df['date'] = df['timestamp'].dt.date

    # Parse actionData - handle both string and dict formats
    if isinstance(df['actionData'].iloc[0], str):
        try:
            df['actionData'] = df['actionData'].apply(json.loads)
        except:
            df['actionData'] = df['actionData'].apply(lambda x: {} if pd.isna(x) else json.loads(x))

    # Extract relevant fields from actionData
    df['actionType'] = df['actionData'].apply(lambda x: x.get('type', '').lower() if isinstance(x, dict) else '')
    df['amount'] = df['actionData'].apply(lambda x: float(x.get('amount', 0)) if isinstance(x, dict) else 0)
    df['assetPriceUSD'] = df['actionData'].apply(lambda x: float(x.get('assetPriceUSD', 0)) if isinstance(x, dict) else 0)
    df['amountUSD'] = df['amount'] * df['assetPriceUSD']

    # Add protocol information if available
    df['protocol'] = df['actionData'].apply(
        lambda x: x.get('protocol', 'unknown') if isinstance(x, dict) else 'unknown'
    )

    # Feature engineering using vectorized operations
    print("Performing vectorized feature engineering...")

    # Group by wallet
    grouped = df.groupby(wallet_col)

    # Basic transaction counts
    features = grouped.agg(
        total_tx=('actionType', 'size'),
        deposit_count=('actionType', lambda x: (x == 'deposit').sum()),
        borrow_count=('actionType', lambda x: (x == 'borrow').sum()),
        repay_count=('actionType', lambda x: (x == 'repay').sum()),
        redeem_count=('actionType', lambda x: (x == 'redeemunderlying').sum()),
        liquidation_count=('actionType', lambda x: (x == 'liquidationcall').sum()),
        first_tx=('timestamp', 'min'),
        last_tx=('timestamp', 'max'),
        unique_protocols=('protocol', pd.Series.nunique)
    )

    # Time-based features
    features['activity_duration_days'] = (
        features['last_tx'] - features['first_tx']
    ).dt.total_seconds() / 86400
    features['tx_frequency'] = features['total_tx'] / features['activity_duration_days'].replace(0, 1)
    features['recency_days'] = (datetime.now() - features['last_tx']).dt.days

    # FIXED: Create financial features using pivot_table instead of looping
    financial_features = df.pivot_table(
        index=wallet_col,
        columns='actionType',
        values='amountUSD',
        aggfunc=['sum', 'mean', 'max', 'min'],
        fill_value=0
    )

    # Flatten multi-index columns
    financial_features.columns = [f'{agg}_{action}' for agg, action in financial_features.columns]

    # Select only the actions we need
    actions = ['deposit', 'borrow', 'repay', 'redeemunderlying']
    action_cols = [col for col in financial_features.columns if any(action in col for action in actions)]
    financial_features = financial_features[action_cols]

    # Merge with main features
    features = features.join(financial_features)

    # Ratios and risk metrics
    features['deposit_borrow_ratio'] = (features['sum_deposit'] + 1) / (features['sum_borrow'] + 1)
    features['repay_borrow_ratio'] = (features['sum_repay'] + 1) / (features['sum_borrow'] + 1)
    features['redeem_deposit_ratio'] = (features['sum_redeemunderlying'] + 1) / (features['sum_deposit'] + 1)
    features['net_utilization'] = (features['sum_deposit'] - features['sum_redeemunderlying']) / (features['sum_deposit'] + 1)

    # Risk flags
    features['has_liquidation'] = (features['liquidation_count'] > 0).astype(int)
    features['high_frequency'] = (features['tx_frequency'] > 100).astype(int)

    # Transaction trend analysis
    def calculate_trend(group):
        if len(group) > 1:
            x = np.arange(len(group))
            y = group['amountUSD'].values
            return np.polyfit(x, y, 1)[0]  # Return slope
        return 0

    trends = df.groupby(wallet_col).apply(calculate_trend)
    features['tx_trend'] = trends

    # Fill NA values
    features.fillna(0, inplace=True)

    # Replace infinities
    features.replace([np.inf, -np.inf], 1e6, inplace=True)

    # Create interaction features
    features['borrow_recency_ratio'] = features['sum_borrow'] / (features['recency_days'] + 1)
    features['deposit_frequency_ratio'] = features['sum_deposit'] / (features['tx_frequency'] + 1)

    # Protocol diversity
    features['protocol_diversity'] = features['unique_protocols'] / features['total_tx']

    print(f"Processed features for {len(features)} wallets")
    return features

def optimize_clusters(X_scaled):
    """Determine optimal number of clusters using elbow method and silhouette score"""
    print("Optimizing cluster count...")

    # Visual elbow method
    visualizer = KElbowVisualizer(
        KMeans(random_state=RANDOM_STATE),
        k=(2, MAX_CLUSTERS),
        metric='distortion',
        timings=False
    )

    plt.figure(figsize=(10, 6))
    visualizer.fit(X_scaled)
    visualizer.show(outpath="elbow_plot.png", clear_figure=True)
    plt.close()

    # Silhouette score analysis
    silhouette_scores = []
    cluster_range = range(2, min(MAX_CLUSTERS, X_scaled.shape[0]//MIN_SAMPLES))

    for n_clusters in cluster_range:
        clusterer = KMeans(n_clusters=n_clusters, random_state=RANDOM_STATE, n_init=10)
        cluster_labels = clusterer.fit_predict(X_scaled)
        score = silhouette_score(X_scaled, cluster_labels)
        silhouette_scores.append(score)
        print(f"Silhouette score for {n_clusters} clusters: {score:.4f}")

    # Find best cluster count
    best_n = cluster_range[np.argmax(silhouette_scores)]
    print(f"Optimal clusters: {best_n}")

    return best_n

def generate_credit_scores(features_df):
    """Generate credit scores using supervised learning and clustering"""
    print("Generating credit scores...")

    # Feature selection
    feature_cols = [
        'total_tx', 'deposit_count', 'borrow_count', 'repay_count',
        'tx_frequency', 'deposit_borrow_ratio', 'repay_borrow_ratio',
        'redeem_deposit_ratio', 'net_utilization', 'has_liquidation',
        'high_frequency', 'recency_days', 'tx_trend', 'protocol_diversity',
        'borrow_recency_ratio', 'deposit_frequency_ratio'
    ]

    # Preprocessing
    X = features_df[feature_cols].fillna(0)

    # Handle infinities
    for col in ['deposit_borrow_ratio', 'repay_borrow_ratio', 'redeem_deposit_ratio']:
        X[col] = X[col].replace([np.inf, -np.inf], X[col].max())

    # Scaling
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # Determine optimal clusters
    global N_CLUSTERS
    if N_CLUSTERS is None:
        N_CLUSTERS = optimize_clusters(X_scaled)

    # Differential Privacy Clustering
    print(f"Clustering with {N_CLUSTERS} clusters using differential privacy...")
    dp_kmeans = DPKMeans(
        n_clusters=N_CLUSTERS,
        random_state=RANDOM_STATE,
        epsilon=0.5
    )
    cluster_labels = dp_kmeans.fit_predict(X_scaled)
    features_df['cluster'] = cluster_labels

    # Supervised Learning with Gradient Boosting
    print("Training supervised credit model...")
    y = features_df['has_liquidation']
    features_df = features_df.sort_values('last_tx')
    split_idx = int(0.8 * len(features_df))
    X_train, X_test = X_scaled[:split_idx], X_scaled[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]

    gbm = GradientBoostingClassifier(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=5,
        subsample=0.8,
        random_state=RANDOM_STATE
    )
    gbm.fit(X_train, y_train)

    # Evaluate model
    y_pred = gbm.predict(X_test)
    y_proba = gbm.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_proba)
    print(f"Model AUC: {auc:.4f}")
    print(classification_report(y_test, y_pred))

    # Save model
    joblib.dump(gbm, 'credit_model.pkl')
    joblib.dump(scaler, 'credit_scaler.pkl')

    # Predict on entire dataset
    full_proba = gbm.predict_proba(X_scaled)[:, 1]
    features_df['default_probability'] = full_proba
    features_df['credit_score'] = (1 - full_proba) * 1000

    # Cluster quality metrics
    cluster_stats = []
    for i in range(N_CLUSTERS):
        cluster_data = features_df[features_df['cluster'] == i]
        stats = {
            'cluster': i,
            'size': len(cluster_data),
            'risk_score': (
                cluster_data['default_probability'].mean() * 0.6 +
                cluster_data['high_frequency'].mean() * 0.2 +
                (1 - cluster_data['repay_borrow_ratio'].mean()) * 0.2
            ),
            'avg_score': cluster_data['credit_score'].mean(),
            'avg_tx_freq': cluster_data['tx_frequency'].mean(),
            'avg_deposit_borrow_ratio': cluster_data['deposit_borrow_ratio'].mean(),
            'avg_repay_borrow_ratio': cluster_data['repay_borrow_ratio'].mean(),
            'liquidation_rate': cluster_data['has_liquidation'].mean(),
            'high_freq_rate': cluster_data['high_frequency'].mean()
        }
        cluster_stats.append(stats)

    cluster_stats_df = pd.DataFrame(cluster_stats)
    cluster_stats_df = cluster_stats_df.sort_values('risk_score').reset_index(drop=True)
    cluster_stats_df['quality_rank'] = cluster_stats_df.index
    features_df['quality_rank'] = features_df['cluster'].map(cluster_stats_df.set_index('cluster')['quality_rank'])
    # In generate_credit_scores() function:

    print("Calculating SHAP values...")
    explainer = shap.TreeExplainer(gbm)
    shap_values = explainer(X_scaled)

    # Create Explanation object
    shap_exp = shap.Explanation(
        values=shap_values.values,
        base_values=shap_values.base_values,
        data=X_scaled,
        feature_names=feature_cols
    )

    # Store the Explanation object for each wallet
    features_df['shap_explanation'] = [shap_exp[i] for i in range(len(shap_exp))]



    # Create SHAP plots
    plt.figure()
    shap.summary_plot(shap_exp, X_scaled, feature_names=feature_cols, show=False)
    plt.savefig(f'{REPORT_PATH}shap_summary.png', bbox_inches='tight')
    plt.close()

    # Create results dataframe
    results = pd.DataFrame({
        'wallet': features_df.index,
        'credit_score': features_df['credit_score'].astype(int),
        'default_probability': features_df['default_probability'],
        'cluster': features_df['cluster'],
        'quality_rank': features_df['quality_rank']
    }).sort_values('credit_score', ascending=False)

    return results, cluster_stats_df, X_scaled, features_df


def generate_tsne_visualization(X_scaled, scores_df):
    """Generate t-SNE visualization of wallet behavior clusters"""
    print("Generating t-SNE visualization...")

    # Reduce dimensionality with PCA first for efficiency
    n_components = min(10, X_scaled.shape[1])  # Use min of 10 or number of features
    pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X_scaled)

    # t-SNE visualization
    tsne = TSNE(
        n_components=2,
        perplexity=30,
        learning_rate=200,
        random_state=RANDOM_STATE,
        n_iter=1000
    )

    # Sample if too many points
    if X_pca.shape[0] > 5000:
        sample_idx = np.random.choice(X_pca.shape[0], 5000, replace=False)
        X_sample = X_pca[sample_idx]
        score_sample = scores_df.iloc[sample_idx]
        print("Sampled 5000 wallets for t-SNE visualization")
    else:
        X_sample = X_pca
        score_sample = scores_df

    # Fit and transform
    tsne_results = tsne.fit_transform(X_sample)

    # Create DataFrame for plotting
    tsne_df = pd.DataFrame({
        'x': tsne_results[:, 0],
        'y': tsne_results[:, 1],
        'credit_score': score_sample['credit_score'].values,
        'cluster': score_sample['cluster'].values,
        'score_category': pd.cut(
            score_sample['credit_score'],
            bins=[0, 300, 700, 1000],
            labels=['High Risk (0-300)', 'Medium Risk (301-700)', 'Low Risk (701-1000)']
        )
    })

    # Plot
    plt.figure(figsize=(16, 12))

    # Cluster visualization
    plt.subplot(2, 1, 1)
    sns.scatterplot(
        x='x', y='y',
        hue='cluster',
        palette='viridis',
        data=tsne_df,
        alpha=0.7,
        s=50
    )
    plt.title('Behavior Cluster Visualization (t-SNE Projection)')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.legend(title='Cluster ID')

    # Risk category visualization
    plt.subplot(2, 1, 2)
    sns.scatterplot(
        x='x', y='y',
        hue='score_category',
        palette={'High Risk (0-300)': 'red',
                 'Medium Risk (301-700)': 'orange',
                 'Low Risk (701-1000)': 'green'},
        data=tsne_df,
        alpha=0.7,
        s=50
    )
    plt.title('Credit Risk Categories (t-SNE Projection)')
    plt.xlabel('t-SNE Dimension 1')
    plt.ylabel('t-SNE Dimension 2')
    plt.legend(title='Risk Category')

    plt.tight_layout()
    plt.savefig(f'{REPORT_PATH}kmeans_clustering.png', dpi=300)
    plt.close()

    print("t-SNE visualization saved")
    return tsne_df


def analyze_results(scores_df, cluster_stats_df, features_df):
    """Generate analysis of credit score distribution and clusters"""
    print("Analyzing results...")

    # Score distribution
    bins = list(range(0, 1001, 100))
    labels = [f"{i}-{i+99}" for i in range(0, 1000, 100)]
    score_distribution = pd.cut(
        scores_df['credit_score'],
        bins=bins,
        labels=labels,
        right=False
    ).value_counts().sort_index()

    # Plot distribution
    plt.figure(figsize=(12, 6))
    score_distribution.plot(kind='bar', color='skyblue')
    plt.title('Credit Score Distribution')
    plt.xlabel('Score Range')
    plt.ylabel('Number of Wallets')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig(f'{REPORT_PATH}score_distribution.png')
    plt.close()

    # Cluster distribution
    plt.figure(figsize=(10, 6))
    cluster_dist = scores_df['quality_rank'].value_counts().sort_index()
    cluster_dist.plot(kind='bar', color='lightgreen')
    plt.title('Wallet Distribution by Behavior Cluster')
    plt.xlabel('Cluster Quality Rank (0 = Best)')
    plt.ylabel('Number of Wallets')
    plt.xticks(rotation=0)
    plt.tight_layout()
    plt.savefig(f'{REPORT_PATH}cluster_distribution.png')
    plt.close()

    # Calculate statistics
    stats = {
        'mean_score': scores_df['credit_score'].mean(),
        'median_score': scores_df['credit_score'].median(),
        'min_score': scores_df['credit_score'].min(),
        'max_score': scores_df['credit_score'].max(),
        'top_10_percentile': scores_df['credit_score'].quantile(0.9),
        'bottom_10_percentile': scores_df['credit_score'].quantile(0.1),
        'default_rate': features_df['has_liquidation'].mean()
    }

    # Generate behavior analysis
    cluster_descriptions = {}
    for _, row in cluster_stats_df.iterrows():
        rank = row['quality_rank']
        description = f"**Cluster {row['cluster']} (Rank {rank})**\n"
        description += f"- Size: {row['size']} wallets ({row['size']/len(scores_df)*100:.1f}%)\n"
        description += f"- Avg Credit Score: {row['avg_score']:.0f}\n"
        description += f"- Avg TX Frequency: {row['avg_tx_freq']:.1f} tx/day\n"
        description += f"- Avg Deposit/Borrow Ratio: {row['avg_deposit_borrow_ratio']:.2f}\n"
        description += f"- Avg Repay/Borrow Ratio: {row['avg_repay_borrow_ratio']:.2f}\n"
        description += f"- Liquidation Rate: {row['liquidation_rate']*100:.1f}%\n"
        description += f"- High Frequency Rate: {row['high_freq_rate']*100:.1f}%\n"
        description += f"- Risk Score: {row['risk_score']:.3f}\n\n"

        # Behavioral characteristics
        if rank == 0:
            description += "**Characteristics:**\n- Responsible long-term users\n- High deposit/borrow ratios\n- Consistent repayments\n- No liquidations\n- Low transaction frequency"
        elif rank == 1:
            description += "**Characteristics:**\n- Moderate users\n- Balanced activity\n- Occasional borrowing\n- Rare liquidations"
        elif rank == N_CLUSTERS - 1:
            description += "**Characteristics:**\n- High-risk behavior\n- Frequent liquidations\n- High transaction frequency\n- Low repay/borrow ratios\n- Potential bot activity"
        else:
            description += "**Characteristics:**\n- Mixed behavior patterns\n- Moderate risk profile\n- Variable financial ratios"

        cluster_descriptions[rank] = description

    # Model calibration plot
    prob_true, prob_pred = calibration_curve(
        features_df['has_liquidation'],
        features_df['default_probability'],
        n_bins=10
    )

    plt.figure(figsize=(10, 8))
    plt.plot(prob_pred, prob_true, 's-', label='Model')
    plt.plot([0, 1], [0, 1], '--', color='gray', label='Perfect calibration')
    plt.xlabel('Predicted Probability')
    plt.ylabel('Actual Probability')
    plt.title('Model Calibration')
    plt.legend()
    plt.grid(True)
    plt.savefig(f'{REPORT_PATH}calibration_plot.png')
    plt.close()

    return score_distribution, stats, cluster_descriptions

def build_transaction_network(df):
    """Build transaction network graph for network analysis"""
    print("Building transaction network...")
    G = nx.MultiGraph()

    # Add nodes and edges
    for _, row in df.iterrows():
        wallet = row['userWallet']
        protocol = row.get('protocol', 'unknown')

        # Add wallet node
        G.add_node(wallet, type='wallet')

        # Add protocol node
        G.add_node(protocol, type='protocol')

        # Add transaction edge
        G.add_edge(wallet, protocol,
                   amount=row['amountUSD'],
                   type=row['actionType'],
                   timestamp=row['timestamp'])

    return G

# def calculate_network_metrics(G, features_df):
#     """Calculate network metrics for each wallet"""
#     print("Calculating network metrics...")

#     # Centrality measures
#     degree_centrality = nx.degree_centrality(G)
#     betweenness_centrality = nx.betweenness_centrality(G, weight='amount')

#     # Create metrics dataframe
#     wallets = [n for n in G.nodes if G.nodes[n]['type'] == 'wallet']
#     network_metrics = pd.DataFrame(index=wallets)
#     network_metrics['degree_centrality'] = network_metrics.index.map(degree_centrality)
#     network_metrics['betweenness_centrality'] = network_metrics.index.map(betweenness_centrality)

#     # Community detection
#     communities = nx.algorithms.community.louvain_communities(G, weight='amount', seed=RANDOM_STATE)
#     for i, comm in enumerate(communities):
#         for node in comm:
#             if G.nodes[node]['type'] == 'wallet':
#                 network_metrics.loc[node, 'community'] = i

#     # Merge with existing features
#     features_df = features_df.merge(network_metrics, left_index=True, right_index=True, how='left')
#     features_df.fillna({'degree_centrality': 0, 'betweenness_centrality': 0, 'community': -1}, inplace=True)

#     return features_df
def calculate_network_metrics(G, features_df):
    """Calculate network metrics for each wallet with optimizations for large networks"""
    print("Calculating network metrics...")

    # First get all wallet nodes
    wallets = [n for n in G.nodes if G.nodes[n]['type'] == 'wallet']
    network_metrics = pd.DataFrame(index=wallets)

    # 1. Degree centrality is fast even for large networks
    print("Calculating degree centrality...")
    degree_centrality = nx.degree_centrality(G)
    network_metrics['degree_centrality'] = network_metrics.index.map(degree_centrality)

    # 2. Betweenness centrality - use approximation for large networks
    print("Calculating approximate betweenness centrality...")
    if len(G.nodes) > 1000:
        # Use k=100 random nodes for approximation
        betweenness = nx.betweenness_centrality(G, k=100, weight='amount', seed=RANDOM_STATE)
    else:
        betweenness = nx.betweenness_centrality(G, weight='amount')
    network_metrics['betweenness_centrality'] = network_metrics.index.map(betweenness)

    # 3. Community detection - use Louvain with resolution tuning
    print("Detecting communities...")
    communities = nx.algorithms.community.louvain_communities(
        G,
        weight='amount',
        resolution=0.8,  # Adjust for more/less communities
        seed=RANDOM_STATE
    )

    for i, comm in enumerate(communities):
        for node in comm:
            if G.nodes[node]['type'] == 'wallet':
                network_metrics.loc[node, 'community'] = i

    # Merge with existing features
    features_df = features_df.merge(
        network_metrics,
        left_index=True,
        right_index=True,
        how='left'
    )
    features_df.fillna({
        'degree_centrality': 0,
        'betweenness_centrality': 0,
        'community': -1
    }, inplace=True)

    return features_df


class RiskReportGenerator:
    def __init__(self, wallet_data, features_df, cluster_stats_df):
        self.wallet = wallet_data.name
        self.data = wallet_data
        self.features_df = features_df
        self.cluster_stats_df = cluster_stats_df
        self.report_path = f"{REPORT_PATH}risk_report_{self.wallet[:8]}.pdf"
        self.temp_files = []

    def generate_report(self):
        """Generate comprehensive risk assessment report"""
        pdf = FPDF()
        pdf.add_page()
        pdf.set_font("Arial", 'B', 24)

        # Header
        self._add_header(pdf)

        # Credit Score Summary
        self._add_score_summary(pdf)

        # Risk Factors
        self._add_risk_factors(pdf)

        # Behavioral Analysis
        self._add_behavioral_analysis(pdf)

        # SHAP Explanation
        self._add_shap_explanation(pdf)

        # Stress Test Results
        self._add_stress_tests(pdf)

        # Cluster Comparison
        self._add_cluster_comparison(pdf)

        # Save report
        pdf.output(self.report_path)

        # Clean up temporary files
        for file in self.temp_files:
            if os.path.exists(file):
                os.remove(file)

        return self.report_path

    def _add_header(self, pdf):
        """Add report header section"""
        pdf.cell(0, 10, "DeFi Wallet Risk Assessment Report", ln=1, align='C')
        pdf.set_font("Arial", '', 12)
        pdf.cell(0, 10, f"Wallet: {self.wallet}", ln=1, align='C')
        pdf.cell(0, 5, f"Generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}", ln=1, align='C')
        pdf.ln(10)

        # Add risk summary box
        pdf.set_fill_color(230, 230, 230)
        pdf.cell(0, 8, "Risk Summary", ln=1, fill=True)
        pdf.set_font("Arial", 'B', 14)

        risk_level = "Low Risk" if self.data['credit_score'] > 700 else \
                    "Medium Risk" if self.data['credit_score'] > 400 else "High Risk"

        risk_color = (0, 128, 0) if risk_level == "Low Risk" else \
                    (255, 165, 0) if risk_level == "Medium Risk" else (220, 0, 0)

        pdf.set_text_color(*risk_color)
        pdf.cell(0, 10, f"Risk Level: {risk_level}", ln=1, align='C')
        pdf.set_text_color(0, 0, 0)
        pdf.ln(5)

    def _add_score_summary(self, pdf):
        """Add credit score summary section"""
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(0, 10, "Credit Score Summary", ln=1)
        pdf.set_font("Arial", '', 12)

        # Score visualization
        fig, ax = plt.subplots(figsize=(6, 2))
        ax.barh(0, self.data['credit_score'], color='skyblue', height=0.5)
        ax.set_xlim(0, 1000)
        ax.set_xticks([0, 250, 500, 750, 1000])
        ax.set_yticks([])
        ax.set_title('Credit Score')
        ax.text(self.data['credit_score'] + 10, 0,
                f"{self.data['credit_score']}", va='center', fontsize=12)

        # Add score ranges
        ax.axvline(300, color='red', linestyle='--', alpha=0.3)
        ax.axvline(700, color='green', linestyle='--', alpha=0.3)
        ax.text(150, 0.7, "High Risk", fontsize=8, color='red')
        ax.text(500, 0.7, "Medium Risk", fontsize=8, color='orange')
        ax.text(850, 0.7, "Low Risk", fontsize=8, color='green')

        # Save to PDF
        self._add_figure_to_pdf(pdf, fig)
        plt.close(fig)

        # Score details
        pdf.ln(5)
        pdf.cell(0, 8, f"Score: {self.data['credit_score']}/1000", ln=1)
        pdf.cell(0, 8, f"Default Probability: {self.data['default_probability']*100:.1f}%", ln=1)
        pdf.cell(0, 8, f"Percentile: {self._calculate_percentile():.1f}%", ln=1)
        pdf.cell(0, 8, f"Behavior Cluster: #{self.data['quality_rank']} of {len(self.cluster_stats_df)}", ln=1)
        pdf.ln(10)

    def _add_risk_factors(self, pdf):
        """Add key risk factors section"""
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(0, 10, "Key Risk Factors", ln=1)
        pdf.set_font("Arial", '', 12)

        risk_factors = []

        # Liquidation risk
        if self.data['liquidation_count'] > 0:
            risk_factors.append(
                f"WARNING: History of liquidations ({self.data['liquidation_count']} events)"
            )

        # High frequency
        if self.data['high_frequency']:
            risk_factors.append(
                f"WARNING: High transaction frequency ({self.data['tx_frequency']:.1f} tx/day)"
            )

        # Borrowing risk
        if self.data['deposit_borrow_ratio'] < 0.5:
            risk_factors.append(
                f"WARNING: High borrowing relative to deposits (ratio: {self.data['deposit_borrow_ratio']:.2f})"
            )

        # Repayment risk
        if self.data['repay_borrow_ratio'] < 0.7:
            risk_factors.append(
                f"WARNING: Low repayment ratio (only {self.data['repay_borrow_ratio']*100:.1f}% of borrows repaid)"
            )

        # Utilization risk
        if self.data['net_utilization'] < 0.3:
            risk_factors.append(
                f"WARNING: Low capital utilization ({self.data['net_utilization']*100:.1f}%)"
            )

        # Protocol concentration
        if self.data['unique_protocols'] < 2:
            risk_factors.append(
                f"WARNING: Limited protocol diversity ({self.data['unique_protocols']} protocols)"
            )

        # Add risk factors to report
        if risk_factors:
            for factor in risk_factors:
                pdf.cell(0, 8, factor, ln=1)
        else:
            pdf.cell(0, 8, "No significant risk factors identified", ln=1)

        pdf.ln(5)

        # Risk factor visualization
        fig, ax = plt.subplots(figsize=(6, 3))
        metrics = ['deposit_borrow_ratio', 'repay_borrow_ratio', 'net_utilization', 'protocol_diversity']
        values = [self.data[m] for m in metrics]
        labels = ['Deposit/Borrow', 'Repay/Borrow', 'Net Utilization', 'Protocol Diversity']

        # Calculate reference values
        ref_values = [1.5, 1.0, 0.7, 0.5]  # Ideal targets

        # Plot comparison
        x = np.arange(len(metrics))
        width = 0.35
        ax.bar(x - width/2, values, width, label='Wallet')
        ax.bar(x + width/2, ref_values, width, label='Target', alpha=0.5)

        ax.set_ylabel('Ratio')
        ax.set_title('Key Financial Ratios')
        ax.set_xticks(x)
        ax.set_xticklabels(labels, rotation=45)
        ax.legend()

        self._add_figure_to_pdf(pdf, fig)
        plt.close(fig)
        pdf.ln(10)

    def _add_behavioral_analysis(self, pdf):
        """Add behavioral analysis section"""
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(0, 10, "Behavioral Analysis", ln=1)
        pdf.set_font("Arial", '', 12)

        cluster_id = self.data['cluster']
        cluster_data = self.cluster_stats_df[self.cluster_stats_df['cluster'] == cluster_id].iloc[0]

        # Cluster description
        pdf.cell(0, 8, f"Behavior Cluster #{self.data['quality_rank']} (of {len(self.cluster_stats_df)} clusters)", ln=1)
        pdf.cell(0, 8, f"- Cluster Risk Score: {cluster_data['risk_score']:.2f}/1.0", ln=1)
        pdf.cell(0, 8, f"- Liquidation Rate: {cluster_data['liquidation_rate']*100:.1f}%", ln=1)
        pdf.cell(0, 8, f"- High Frequency Rate: {cluster_data['high_freq_rate']*100:.1f}%", ln=1)
        pdf.ln(5)

        # Cluster characteristics
        pdf.cell(0, 8, "Typical Behavior Patterns:", ln=1)
        if self.data['quality_rank'] == 0:
            pdf.cell(0, 8, "- Responsible, long-term user", ln=1)
            pdf.cell(0, 8, "- Consistent deposit/borrow patterns", ln=1)
            pdf.cell(0, 8, "- High repayment ratios", ln=1)
        elif self.data['quality_rank'] == len(self.cluster_stats_df) - 1:
            pdf.cell(0, 8, "- High-risk behavior patterns", ln=1)
            pdf.cell(0, 8, "- Frequent liquidations", ln=1)
            pdf.cell(0, 8, "- Bot-like transaction patterns", ln=1)
        else:
            pdf.cell(0, 8, "- Moderate risk profile", ln=1)
            pdf.cell(0, 8, "- Mixed behavior patterns", ln=1)
            pdf.cell(0, 8, "- Variable financial ratios", ln=1)

        # Cluster comparison visualization
        fig, ax = plt.subplots(figsize=(6, 4))
        clusters = self.cluster_stats_df.sort_values('quality_rank')['quality_rank']
        risk_scores = self.cluster_stats_df.sort_values('quality_rank')['risk_score']

        colors = []
        for rank in clusters:
            if rank == 0:
                colors.append('green')
            elif rank == len(clusters) - 1:
                colors.append('red')
            else:
                colors.append('orange')

        ax.bar(clusters.astype(str), risk_scores, color=colors)
        ax.set_title('Cluster Risk Comparison')
        ax.set_xlabel('Cluster Quality Rank')
        ax.set_ylabel('Risk Score')
        ax.set_ylim(0, 1)

        # Highlight current cluster
        current_idx = list(clusters).index(self.data['quality_rank'])
        ax.patches[current_idx].set_alpha(0.7)
        ax.patches[current_idx].set_edgecolor('black')
        ax.patches[current_idx].set_linewidth(2)

        self._add_figure_to_pdf(pdf, fig)
        plt.close(fig)
        pdf.ln(10)

    def _add_shap_explanation(self, pdf):
        """Add SHAP explanation for credit score"""
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(0, 10, "Credit Score Explanation", ln=1)
        pdf.set_font("Arial", '', 12)
        pdf.cell(0, 8, "Factors influencing this wallet's credit score:", ln=1)

        # Get SHAP values
        if 'shap_values' in self.data:
            shap_values = self.data['shap_values']
            feature_names = [
                'Total TX', 'Deposit Count', 'Borrow Count', 'Repay Count',
                'TX Frequency', 'Deposit/Borrow', 'Repay/Borrow',
                'Redeem/Deposit', 'Net Utilization', 'Has Liquidation',
                'High Frequency', 'Recency Days', 'TX Trend', 'Protocol Diversity',
                'Borrow/Recency', 'Deposit/Frequency'
            ]

            # Create waterfall plot
            plt.figure()
            shap.plots.waterfall(shap_values, max_display=10, show=False)
            plt.title("Credit Score Factors", fontsize=14)
            plt.tight_layout()

            # Save to PDF
            temp_path = f"{REPORT_PATH}temp_shap_{self.wallet[:6]}.png"
            plt.savefig(temp_path, dpi=150, bbox_inches='tight')
            plt.close()
            self.temp_files.append(temp_path)

            pdf.image(temp_path, x=10, w=190)
            pdf.ln(5)
        else:
            pdf.cell(0, 8, "SHAP explanation not available", ln=1)

        pdf.ln(10)

    def _add_stress_tests(self, pdf):
        """Add stress test results section with real market data"""
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(0, 10, "Stress Test Results", ln=1)
        pdf.set_font("Arial", '', 12)

        # Get current market data
        try:
            eth = yf.Ticker("ETH-USD")
            eth_history = eth.history(period="1d")
            eth_price = eth_history['Close'].iloc[0]
            btc = yf.Ticker("BTC-USD")
            btc_history = btc.history(period="1d")
            btc_price = btc_history['Close'].iloc[0]
            vix = yf.Ticker("^VIX")
            vix_history = vix.history(period="1d")
            vix_value = vix_history['Close'].iloc[0]
        except:
            eth_price = 3500
            btc_price = 60000
            vix_value = 20

        # Simulate different market conditions
        scenarios = [
            {"name": "Normal Market",
             "volatility": 0.0,
             "collateral_factor": 1.0,
             "eth_price": eth_price,
             "btc_price": btc_price,
             "vix": vix_value},

            {"name": "Moderate Volatility",
             "volatility": 0.2 * (vix_value/20),
             "collateral_factor": 0.85,
             "eth_price": eth_price * 0.9,
             "btc_price": btc_price * 0.9,
             "vix": vix_value * 1.5},

            {"name": "Market Crash",
             "volatility": 0.5 * (vix_value/20),
             "collateral_factor": 0.6,
             "eth_price": eth_price * 0.7,
             "btc_price": btc_price * 0.65,
             "vix": vix_value * 2.5},

            {"name": "Extreme Conditions",
             "volatility": 0.8 * (vix_value/20),
             "collateral_factor": 0.4,
             "eth_price": eth_price * 0.5,
             "btc_price": btc_price * 0.45,
             "vix": vix_value * 3.5}
        ]

        results = []
        for scenario in scenarios:
            health, prob = self._simulate_stress_test(scenario)
            results.append({
                "Scenario": scenario['name'],
                "ETH Price": f"${scenario['eth_price']:,.0f}",
                "VIX Index": f"{scenario['vix']:.1f}",
                "Health Factor": health,
                "Liquidation Prob": prob
            })

        # Create table
        pdf.set_fill_color(240, 240, 240)
        pdf.cell(50, 8, "Scenario", border=1, fill=True)
        pdf.cell(30, 8, "ETH Price", border=1, fill=True)
        pdf.cell(25, 8, "VIX", border=1, fill=True)
        pdf.cell(35, 8, "Health Factor", border=1, fill=True)
        pdf.cell(35, 8, "Liquidation Prob", border=1, fill=True, ln=1)

        for res in results:
            health_color = (0, 100, 0) if res['Health Factor'] > 1.5 else \
                          (205, 133, 0) if res['Health Factor'] > 1.0 else (220, 0, 0)

            prob_color = (220, 0, 0) if res['Liquidation Prob'] > 0.7 else \
                        (205, 133, 0) if res['Liquidation Prob'] > 0.3 else (0, 100, 0)

            pdf.set_text_color(0, 0, 0)
            pdf.cell(50, 8, res['Scenario'], border=1)

            pdf.cell(30, 8, res['ETH Price'], border=1)
            pdf.cell(25, 8, res['VIX Index'], border=1)

            pdf.set_text_color(*health_color)
            pdf.cell(35, 8, f"{res['Health Factor']:.2f}", border=1)

            pdf.set_text_color(*prob_color)
            pdf.cell(35, 8, f"{res['Liquidation Prob']:.0%}", border=1, ln=1)

        pdf.set_text_color(0, 0, 0)
        pdf.ln(5)

        # Add scenario description
        pdf.cell(0, 8, "Scenario Definitions:", ln=1)
        pdf.cell(0, 8, "- Normal Market: Stable market conditions", ln=1)
        pdf.cell(0, 8, "- Moderate Volatility: 10-20% price decline, VIX 30-40", ln=1)
        pdf.cell(0, 8, "- Market Crash: 30-50% price decline, VIX 50+", ln=1)
        pdf.cell(0, 8, "- Extreme Conditions: >50% price decline, VIX 70+", ln=1)
        pdf.ln(10)

    def _add_cluster_comparison(self, pdf):
        """Add cluster comparison section"""
        pdf.set_font("Arial", 'B', 16)
        pdf.cell(0, 10, "Peer Group Comparison", ln=1)
        pdf.set_font("Arial", '', 12)

        cluster_id = self.data['cluster']
        cluster_data = self.cluster_stats_df[self.cluster_stats_df['cluster'] == cluster_id].iloc[0]

        # Create comparison table
        metrics = [
            ('Credit Score', self.data['credit_score'], cluster_data.get('avg_score', 0)),
            ('Default Probability', self.data['default_probability'], cluster_data.get('default_probability', 0)),
            ('TX Frequency', self.data['tx_frequency'], cluster_data.get('avg_tx_freq', 0)),
            ('Deposit/Borrow Ratio', self.data['deposit_borrow_ratio'], cluster_data.get('avg_deposit_borrow_ratio', 0)),
            ('Liquidation Events', self.data['liquidation_count'], cluster_data.get('avg_liquidation', 0)),
            ('Protocol Diversity', self.data['protocol_diversity'], cluster_data.get('avg_protocol_diversity', 0))
        ]

        # Create table
        pdf.set_fill_color(240, 240, 240)
        pdf.cell(70, 8, "Metric", border=1, fill=True)
        pdf.cell(40, 8, "Wallet", border=1, fill=True)
        pdf.cell(40, 8, "Cluster Avg", border=1, fill=True, ln=1)

        for metric, wallet_val, cluster_avg in metrics:
            # Determine value colors
            if metric == 'Credit Score':
                wallet_color = (0, 100, 0) if wallet_val > 700 else \
                              (205, 133, 0) if wallet_val > 400 else (220, 0, 0)
                cluster_color = (0, 100, 0) if cluster_avg > 700 else \
                               (205, 133, 0) if cluster_avg > 400 else (220, 0, 0)
            elif metric == 'Default Probability':
                wallet_color = (220, 0, 0) if wallet_val > 0.7 else \
                              (205, 133, 0) if wallet_val > 0.3 else (0, 100, 0)
                cluster_color = (220, 0, 0) if cluster_avg > 0.7 else \
                               (205, 133, 0) if cluster_avg > 0.3 else (0, 100, 0)
            else:
                wallet_color = (0, 0, 0)
                cluster_color = (0, 0, 0)

            pdf.set_text_color(0, 0, 0)
            pdf.cell(70, 8, metric, border=1)

            pdf.set_text_color(*wallet_color)
            pdf.cell(40, 8, f"{wallet_val:.2f}", border=1)

            pdf.set_text_color(*cluster_color)
            pdf.cell(40, 8, f"{cluster_avg:.2f}", border=1, ln=1)

        pdf.set_text_color(0, 0, 0)
        pdf.ln(10)

    def _simulate_stress_test(self, scenario):
        """Simulate wallet performance under market stress"""
        # Base health factor estimate
        health_factor = max(1.0, 1.5 * self.data['deposit_borrow_ratio'])

        # Apply volatility impact
        volatility_impact = scenario['volatility'] * (1 - self.data['protocol_diversity'])
        stressed_health = health_factor * (1 - volatility_impact)

        # Apply collateral haircut
        stressed_health *= scenario['collateral_factor']

        # Calculate liquidation probability
        if stressed_health < 1.0:
            liquidation_prob = min(1.0, (1.0 - stressed_health) * 2)
        else:
            liquidation_prob = max(0, (1.5 - stressed_health) / 5)

        return stressed_health, liquidation_prob

    def _calculate_percentile(self):
        """Calculate wallet's percentile rank"""
        scores = self.features_df['credit_score']
        return (sum(scores < self.data['credit_score']) / len(scores)) * 100

    def _add_figure_to_pdf(self, pdf, figure):
        """Add matplotlib figure to PDF"""
        # Save as temp image
        temp_path = f"{REPORT_PATH}temp_{self.wallet[:6]}.png"
        figure.savefig(temp_path, dpi=100, bbox_inches='tight')
        self.temp_files.append(temp_path)

        # Add to PDF
        pdf.image(temp_path, x=10, w=190)
        pdf.ln(5)

def backtest_model(features_df):
    """Backtest model performance using temporal splits"""
    print("Backtesting model performance...")

    # Sort by time
    features_df = features_df.sort_values('last_tx')

    # Create results storage
    backtest_results = []

    # Get numeric features only (exclude timestamps and other non-numeric columns)
    numeric_cols = features_df.select_dtypes(include=[np.number]).columns.tolist()
    if 'has_liquidation' not in numeric_cols:
        numeric_cols.append('has_liquidation')

    # Temporal cross-validation
    for test_size in [0.1, 0.2, 0.3]:
        train_size = 1 - test_size
        split_idx = int(train_size * len(features_df))

        X_train = features_df[numeric_cols].iloc[:split_idx]
        X_test = features_df[numeric_cols].iloc[split_idx:]

        # Train new model
        gbm = GradientBoostingClassifier(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=RANDOM_STATE
        )

        # Fit and predict
        gbm.fit(X_train.drop('has_liquidation', axis=1), X_train['has_liquidation'])
        y_pred = gbm.predict_proba(X_test.drop('has_liquidation', axis=1))[:, 1]
        auc = roc_auc_score(X_test['has_liquidation'], y_pred)

        backtest_results.append({
            'train_size': train_size,
            'test_size': test_size,
            'auc': auc
        })

    # Plot results
    plt.figure(figsize=(10, 6))
    sizes = [res['test_size'] for res in backtest_results]
    aucs = [res['auc'] for res in backtest_results]
    plt.plot(sizes, aucs, 'o-')
    plt.title('Model Performance by Test Size')
    plt.xlabel('Test Set Proportion')
    plt.ylabel('AUC Score')
    plt.grid(True)
    plt.savefig(f'{REPORT_PATH}backtest_performance.png')
    plt.close()

    return pd.DataFrame(backtest_results)

def detect_anomalies(features_df):
    """Detect anomalous wallets using isolation forest"""
    from sklearn.ensemble import IsolationForest

    print("Detecting anomalous wallets...")

    # Select features for anomaly detection
    anomaly_features = [
        'tx_frequency', 'deposit_borrow_ratio', 'repay_borrow_ratio',
        'net_utilization', 'tx_trend', 'protocol_diversity'
    ]
    X_anom = features_df[anomaly_features].fillna(0)

    # Train isolation forest
    iso = IsolationForest(contamination=0.01, random_state=RANDOM_STATE)
    anomalies = iso.fit_predict(X_anom)

    # Add to features
    features_df['is_anomaly'] = (anomalies == -1).astype(int)

    # Plot anomalies
    plt.figure(figsize=(10, 8))
    plt.scatter(
        features_df['tx_frequency'],
        features_df['deposit_borrow_ratio'],
        c=features_df['is_anomaly'],
        cmap='coolwarm',
        alpha=0.6
    )
    plt.xlabel('Transaction Frequency')
    plt.ylabel('Deposit/Borrow Ratio')
    plt.title('Anomaly Detection')
    plt.colorbar(label='Anomaly (1=True)')
    plt.savefig(f'{REPORT_PATH}anomaly_detection.png')
    plt.close()

    print(f"Detected {features_df['is_anomaly'].sum()} anomalous wallets")
    return features_df

def main():
    # Load data
    print("Loading data...")
    try:
        df = pd.read_json('user-wallet-transactions.json')
        print(f"Loaded {len(df):,} transactions")
    except Exception as e:
        print(f"Error loading data: {e}")
        return

    # Process transactions and extract features
    features_df = process_transactions(df)

    # Build transaction network
    G = build_transaction_network(df)
    features_df = calculate_network_metrics(G, features_df)

    # Detect anomalies
    features_df = detect_anomalies(features_df)

    # Generate credit scores
    scores_df, cluster_stats_df, X_scaled, full_features_df = generate_credit_scores(features_df)

    # Save results
    scores_df.to_csv(f'{REPORT_PATH}wallet_credit_scores.csv', index=False)
    cluster_stats_df.to_csv(f'{REPORT_PATH}cluster_statistics.csv', index=False)
    full_features_df.to_csv(f'{REPORT_PATH}full_features.csv', index=True)
    print(f"Saved scores for {len(scores_df)} wallets")

    # Generate t-SNE visualization
    try:
        tsne_df = generate_tsne_visualization(X_scaled, scores_df)
    except Exception as e:
        print(f"Could not generate t-SNE: {e}")

    # Analyze results
    distribution, stats, cluster_descriptions = analyze_results(scores_df, cluster_stats_df, full_features_df)

    # Backtest model
    backtest_results = backtest_model(full_features_df)
    backtest_results.to_csv(f'{REPORT_PATH}backtest_results.csv', index=False)

    # Generate analysis report
    with open(f'{REPORT_PATH}analysis.md', 'w') as f:
        f.write("# DeFi Credit Score Analysis Report\n\n")

        f.write("## Summary Statistics\n")
        f.write(pd.Series(stats).to_markdown() + "\n\n")

        f.write("## Score Distribution\n")
        f.write(distribution.to_markdown() + "\n")
        f.write("![Score Distribution](score_distribution.png)\n\n")

        f.write("## Model Performance\n")
        f.write(backtest_results.to_markdown() + "\n")
        f.write("![Backtest Performance](backtest_performance.png)\n\n")

        f.write("## Cluster Characteristics\n")
        for rank in sorted(cluster_descriptions.keys()):
            f.write(cluster_descriptions[rank] + "\n\n")

        f.write("## Risk Analysis\n")
        f.write("### High-Score Wallets (700-1000)\n")
        f.write("- Typically in top behavior clusters (Rank 0-1)\n")
        f.write("- Consistent deposit/repay patterns\n")
        f.write("- High repay/borrow ratios (>1.5)\n")
        f.write("- No liquidation history\n")
        f.write("- Moderate transaction frequency\n\n")

        f.write("### Medium-Score Wallets (300-699)\n")
        f.write("- Mixed behavior patterns\n")
        f.write("- Moderate deposit/borrow ratios (0.5-1.5)\n")
        f.write("- Occasional borrowing without full repayment\n")
        f.write("- Rare liquidation events\n\n")

        f.write("### Low-Score Wallets (0-299)\n")
        f.write("- High-risk behavior patterns\n")
        f.write("- Frequent liquidations\n")
        f.write("- Extremely high or low transaction frequency\n")
        f.write("- Low repay/borrow ratios (<0.5)\n")
        f.write("- Often show bot-like behavior patterns\n")

    # Generate sample risk reports
    print("Generating sample risk assessment reports...")
    reports = []
    for wallet in list(scores_df.head(3)['wallet']) + list(scores_df.tail(3)['wallet']):
        wallet_data = full_features_df.loc[wallet]
        report_gen = RiskReportGenerator(wallet_data, full_features_df, cluster_stats_df)
        report_path = report_gen.generate_report()
        reports.append(report_path)
        print(f"Generated report for {wallet[:8]}...")

    print("\nProcessing complete! Results saved to:")
    print(f"- {REPORT_PATH}wallet_credit_scores.csv")
    print(f"- {REPORT_PATH}cluster_statistics.csv")
    print(f"- {REPORT_PATH}analysis.md")
    print(f"- {REPORT_PATH}backtest_results.csv")
    print(f"- Various visualizations and reports")
    print(f"- {len(reports)} sample risk assessment reports")

if __name__ == "__main__":
    main()

Loading data...
Loaded 100,000 transactions
Processing transactions...
Performing vectorized feature engineering...
Processed features for 3497 wallets
Building transaction network...
Calculating network metrics...
Calculating degree centrality...
Calculating approximate betweenness centrality...
Detecting communities...
Detecting anomalous wallets...
Detected 34 anomalous wallets
Generating credit scores...
Optimizing cluster count...
Silhouette score for 2 clusters: 0.9637
Silhouette score for 3 clusters: 0.7128
Silhouette score for 4 clusters: 0.5917
Silhouette score for 5 clusters: 0.4699
Silhouette score for 6 clusters: 0.3567
Silhouette score for 7 clusters: 0.3592
Silhouette score for 8 clusters: 0.4397
Silhouette score for 9 clusters: 0.4767
Optimal clusters: 2
Clustering with 2 clusters using differential privacy...
Training supervised credit model...
Model AUC: 1.0000
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       683
