In [1]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ProcessPoolExecutor, as_completed
import os
import numpy as np

# A decimal number with optional sign and optional exponent, e.g. "0.98",
# "-0.25", "94312.3192" or "1.5e3".  The digits-dot-digits shape means a
# sentence-ending period after the value is not captured.
_NUMBER = r'[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?'

_METRICS_PATTERN = re.compile(
    r'method: (\w+).*metric: (\w+).*norm: (\w+)'
    r'.*Silhouette Score: (' + _NUMBER + r')'
    r'.*Calinski-Harabasz Index: (' + _NUMBER + r')'
    r'.*Davies-Bouldin Index: (' + _NUMBER + r')',
    re.DOTALL,  # the fields are spread over several lines of one entry
)


def extract_metrics(log_entry, reduction_technique):
    """Parse one log entry into a flat record of clustering metrics.

    Args:
        log_entry: Text of a single log section. It must contain, in this
            order, ``method:``, ``metric:``, ``norm:``, ``Silhouette Score:``,
            ``Calinski-Harabasz Index:`` and ``Davies-Bouldin Index:`` fields.
        reduction_technique: Label stored verbatim under the
            ``'Reduction Technique'`` key of the result.

    Returns:
        A dict with the keys ``'Reduction Technique'``, ``'Method'``,
        ``'Metric'``, ``'Norm'`` (strings) and ``'Silhouette Score'``,
        ``'Calinski-Harabasz Index'``, ``'Davies-Bouldin Index'`` (floats),
        or ``None`` when the entry does not contain all fields.

    Negative values and scientific notation are accepted for the three
    scores; Silhouette scores in particular may be negative.
    """
    match = _METRICS_PATTERN.search(log_entry)
    if match is None:
        return None

    method, metric, norm, silhouette, calinski, davies = match.groups()
    return {
        'Reduction Technique': reduction_technique,
        'Method': method,
        'Metric': metric,
        'Norm': norm,
        'Silhouette Score': float(silhouette),
        'Calinski-Harabasz Index': float(calinski),
        'Davies-Bouldin Index': float(davies),
    }

def process_log_file(log_file):
    """Collect the clustering-metric records found in one log file.

    The name of the directory that directly contains ``log_file`` is used as
    the reduction-technique label. The file is split on the separator line of
    '=' characters, and every section mentioning 'Clustering Metrics' is
    passed to ``extract_metrics``; sections that fail to parse are dropped.

    Returns:
        A list of the dicts produced by ``extract_metrics``.
    """
    technique = os.path.basename(os.path.dirname(log_file))

    with open(log_file, 'r') as handle:
        sections = handle.read().split('==================================================')

    records = []
    for section in sections:
        if 'Clustering Metrics' not in section:
            continue
        record = extract_metrics(section, technique)
        if record is not None:
            records.append(record)
    return records

def analyze_metrics(df):
    """Print the best-scoring row for each of the three clustering metrics.

    "Best" is the row with the maximum Silhouette Score, the maximum
    Calinski-Harabasz Index and the minimum Davies-Bouldin Index. For each
    one the identifying columns and the winning score are printed.

    Args:
        df: DataFrame holding the columns 'Reduction Technique', 'Method',
            'Metric', 'Norm' and the three score columns.
    """
    id_columns = ['Reduction Technique', 'Method', 'Metric', 'Norm']
    # (score column, Series method that returns the index label of the best row)
    criteria = (
        ('Silhouette Score', 'idxmax'),
        ('Calinski-Harabasz Index', 'idxmax'),
        ('Davies-Bouldin Index', 'idxmin'),
    )

    # Resolve every winner before printing so a lookup failure emits nothing.
    winners = [
        (score, df.loc[getattr(df[score], picker)()])
        for score, picker in criteria
    ]

    print("Best combinations:")
    for score, row in winners:
        print(f"{score}: {row[id_columns + [score]]}")

def plot_combined_heatmap(df, output_dir):
    """Write one three-panel heatmap PNG per reduction technique.

    Each figure stacks one heatmap per score (Silhouette, Calinski-Harabasz,
    Davies-Bouldin). Rows are the 'Method' values, columns the
    ('Metric', 'Norm') pairs, and each cell is the mean score of that
    combination. Files are saved to
    ``<output_dir>/<technique>_clustering_performance_heatmap.png``.

    Args:
        df: DataFrame with 'Reduction Technique', 'Method', 'Metric', 'Norm'
            and the three score columns.
        output_dir: Directory the PNG files are written into.
    """
    score_columns = ['Silhouette Score', 'Calinski-Harabasz Index', 'Davies-Bouldin Index']

    for technique in df['Reduction Technique'].unique():
        subset = df[df['Reduction Technique'] == technique]

        fig, axes = plt.subplots(3, 1, figsize=(20, 30))
        # NOTE(review): y=0. anchors the title at the bottom edge of the
        # figure -- confirm this is intended rather than a truncated value.
        fig.suptitle(f'Clustering Performance Heatmaps for {technique}', fontsize=24, y=0.)

        for ax, score in zip(axes, score_columns):
            mean_scores = subset.groupby(['Method', 'Metric', 'Norm'])[score].mean()
            # Move 'Metric' and 'Norm' into the columns; 'Method' stays as rows.
            pivot = mean_scores.unstack(level=[1, 2])

            # Davies-Bouldin is the only score where smaller means better,
            # so it gets the reversed colormap.
            if score == 'Davies-Bouldin Index':
                cmap, interpretation = 'YlOrRd_r', 'Lower is better'
            else:
                cmap, interpretation = 'YlGnBu', 'Higher is better'

            sns.heatmap(
                pivot,
                annot=True,
                cmap=cmap,
                fmt='.2f',
                cbar_kws={'label': score},
                ax=ax,
                annot_kws={'size': 10},
            )

            ax.set_title(f'{score}\n{interpretation}', fontsize=18)
            ax.set_xlabel('Metric, Norm', fontsize=14)
            ax.set_ylabel('Method', fontsize=14)

            # Slant the column labels so they do not overlap.
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')

            # Apply the cell-annotation font size to every text artist on the axes.
            for cell_text in ax.texts:
                cell_text.set_fontsize(10)

        fig.tight_layout()
        target = os.path.join(output_dir, f'{technique}_clustering_performance_heatmap.png')
        fig.savefig(target, dpi=300, bbox_inches='tight')
        plt.close(fig)

def plot_normalized_scores(df_normalized, output_dir):
    """Save a Silhouette-vs-Calinski-Harabasz scatter plot per reduction technique.

    Every point is one (Method, Metric, Norm) combination. Its fill colour
    encodes the Davies-Bouldin Index and its edge colour the method. The rows
    with the highest Silhouette Score, the highest Calinski-Harabasz Index and
    the lowest Davies-Bouldin Index are annotated. A standalone legend image
    for the method colours is written as ``method_legend.png``.

    Args:
        df_normalized: DataFrame with 'Reduction Technique', 'Method',
            'Metric', 'Norm' and the three score columns. The
            Davies-Bouldin column is treated as "lower is better".
        output_dir: Directory the PNG files are written into.
    """
    # Edge colour per method; methods not listed here fall back to black
    # instead of raising KeyError.
    method_colors = {'single': 'red', 'complete': 'blue', 'average': 'green', 'weighted': 'purple'}
    fallback_color = 'black'

    for technique in df_normalized['Reduction Technique'].unique():
        plt.figure(figsize=(15, 10))

        df_tech = df_normalized[df_normalized['Reduction Technique'] == technique]

        # One colour scale for the whole figure. Without explicit limits each
        # scatter() call autoscales to its own data, so colours would not be
        # comparable across methods and the colorbar would only describe the
        # last method drawn.
        davies = df_tech['Davies-Bouldin Index']
        color_min, color_max = davies.min(), davies.max()

        for method in df_tech['Method'].unique():
            df_method = df_tech[df_tech['Method'] == method]
            plt.scatter(df_method['Silhouette Score'],
                        df_method['Calinski-Harabasz Index'],
                        c=df_method['Davies-Bouldin Index'],
                        s=200,  # Increased marker size
                        cmap='viridis_r',
                        vmin=color_min,
                        vmax=color_max,
                        marker='o',
                        edgecolors=method_colors.get(method, fallback_color),
                        linewidth=2,
                        alpha=0.7,
                        label=method)

        plt.colorbar(label='Davies-Bouldin Index (Lower is better)')

        plt.title(f'Normalized Clustering Performance - {technique}', fontsize=20)
        plt.xlabel('Silhouette Score (Higher is better)', fontsize=14)
        plt.ylabel('Calinski-Harabasz Index (Higher is better)', fontsize=14)

        plt.legend(title='Method', title_fontsize=12, fontsize=10)

        plt.grid(True, linestyle='--', alpha=0.7)

        # Annotate best performers
        best_silhouette = df_tech.loc[df_tech['Silhouette Score'].idxmax()]
        best_calinski = df_tech.loc[df_tech['Calinski-Harabasz Index'].idxmax()]
        best_davies = df_tech.loc[df_tech['Davies-Bouldin Index'].idxmin()]

        for best in [best_silhouette, best_calinski, best_davies]:
            plt.annotate(f"{best['Method']}\n{best['Metric']}-{best['Norm']}",
                         (best['Silhouette Score'], best['Calinski-Harabasz Index']),
                         xytext=(5, 5), textcoords='offset points', fontsize=10,
                         bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=0.8),
                         arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.3"))

        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'normalized_clustering_performance_{technique}.png'), dpi=300, bbox_inches='tight')
        plt.close()

    # Create a legend figure: the known methods first, then any extra method
    # present in the data (drawn with the fallback colour).
    legend_colors = dict(method_colors)
    for method in df_normalized['Method'].unique():
        legend_colors.setdefault(method, fallback_color)

    plt.figure(figsize=(8, 4))
    for method, color in legend_colors.items():
        plt.scatter([], [], c=color, label=method, s=100)
    plt.legend(title='Method', title_fontsize=14, fontsize=12, loc='center')
    plt.axis('off')
    plt.savefig(os.path.join(output_dir, 'method_legend.png'), dpi=300, bbox_inches='tight')
    plt.close()
    
    
def _min_max_normalize(series):
    """Rescale *series* linearly onto [0, 1]; a constant series becomes all zeros."""
    low = series.min()
    span = series.max() - low
    shifted = series - low
    # A zero span would turn every value into NaN through 0/0.
    return shifted if span == 0 else shifted / span


def run():
    """Parse the reduction logs, summarise the metrics and write all plots.

    Steps:
      1. Parse each log file in a worker process via ``process_log_file``.
      2. Average duplicate (technique, method, metric, norm) rows and write
         ``clustering_metrics_summary.csv`` to the output directory.
      3. Print the best combinations and save the heatmaps.
      4. Min-max normalise the three scores and save the scatter plots.

    Returns early, after printing a message, when no metrics were parsed.
    """
    log_files = [
        'Analysis/MDS/dimensionality_reduction_log.txt',
        'Analysis/PCA/None/dimensionality_reduction_log.txt',
        'Analysis/SVD/dimensionality_reduction_log.txt',
        'Analysis/t-sne/dimensionality_reduction_log.txt',
        'Analysis/UMAP/dimensionality_reduction_log.txt'
    ]

    output_dir = './Analysis/reduction-techniques'
    os.makedirs(output_dir, exist_ok=True)

    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(process_log_file, log_file) for log_file in log_files]
        all_metrics = [metric for future in as_completed(futures) for metric in future.result()]

    # An empty list yields a DataFrame without columns, on which the groupby
    # below would fail with a KeyError.
    if not all_metrics:
        print("No clustering metrics found in the log files; nothing to analyze.")
        return

    df = pd.DataFrame(all_metrics)

    # Handle duplicate entries
    df = df.groupby(['Reduction Technique', 'Method', 'Metric', 'Norm']).mean().reset_index()

    df.to_csv(os.path.join(output_dir, 'clustering_metrics_summary.csv'), index=False)

    analyze_metrics(df)

    plot_combined_heatmap(df, output_dir)

    # Plot normalized scores.
    # Davies-Bouldin is scaled but deliberately NOT inverted:
    # plot_normalized_scores labels it "Lower is better" and selects its best
    # row with idxmin, so flipping it here would highlight the worst
    # configuration instead of the best one.
    df_normalized = df.copy()
    for col in ['Silhouette Score', 'Calinski-Harabasz Index', 'Davies-Bouldin Index']:
        df_normalized[col] = _min_max_normalize(df[col])

    plot_normalized_scores(df_normalized, output_dir)

# run() starts a ProcessPoolExecutor. Under the "spawn" start method the
# worker processes re-import this module, so an unguarded call would execute
# run() again inside every worker.
if __name__ == "__main__":
    run()

Best combinations:
Silhouette Score: Reduction Technique            None
Method                     weighted
Metric                 log2_contact
Norm                             vc
Silhouette Score           0.983039
Name: 91, dtype: object
Calinski-Harabasz Index: Reduction Technique                UMAP
Method                           single
Metric                     log2_contact
Norm                                ice
Calinski-Harabasz Index      94312.3192
Name: 228, dtype: object
Davies-Bouldin Index: Reduction Technique             None
Method                      weighted
Metric                  log2_contact
Norm                              vc
Davies-Bouldin Index        0.006492
Name: 91, dtype: object
