# In [None]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from concurrent.futures import ProcessPoolExecutor, as_completed
import os
import numpy as np
import matplotlib.gridspec as gridspec

def extract_metrics(log_entry, reduction_technique):
    """Parse one clustering log entry into a flat metrics record.

    The entry must contain, in this order, ``method: <word>``,
    ``metric: <word>``, ``norm: <word>`` and the three labelled scores
    ``Silhouette Score``, ``Calinski-Harabasz Index`` and
    ``Davies-Bouldin Index``. Arbitrary text (including newlines) may sit
    between the fields.

    Args:
        log_entry: Text of a single log section.
        reduction_technique: Label stored under ``'Reduction Technique'``.

    Returns:
        A dict with the keys ``'Reduction Technique'``, ``'Method'``,
        ``'Metric'``, ``'Norm'`` and the three score names (scores as
        ``float``), or ``None`` when the entry does not match.
    """
    # Signed decimal with an optional exponent. A digits-and-dots-only
    # pattern cannot match a negative Silhouette Score (the score ranges
    # over [-1, 1]), which silently discards the entry, and it cuts
    # ``1.5e+02`` down to ``1.5``.
    number = r'([-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?)'
    pattern = (
        r'method: (\w+).*metric: (\w+).*norm: (\w+)'
        r'.*Silhouette Score: ' + number +
        r'.*Calinski-Harabasz Index: ' + number +
        r'.*Davies-Bouldin Index: ' + number
    )
    # DOTALL lets ``.*`` span the line breaks between fields.
    match = re.search(pattern, log_entry, re.DOTALL)
    if match is None:
        return None

    method, metric, norm, silhouette, calinski, davies = match.groups()
    return {
        'Reduction Technique': reduction_technique,
        'Method': method,
        'Metric': metric,
        'Norm': norm,
        'Silhouette Score': float(silhouette),
        'Calinski-Harabasz Index': float(calinski),
        'Davies-Bouldin Index': float(davies),
    }

def process_log_file(log_file):
    """Collect the metric records found in one log file.

    The reduction technique label is the name of the directory that
    contains ``log_file``. The file is split on the 50-character ``=``
    separator line; only sections mentioning ``'Clustering Metrics'`` are
    handed to ``extract_metrics``, and sections it cannot parse are
    skipped.

    Args:
        log_file: Path to the log file.

    Returns:
        A list of metric dicts (possibly empty).
    """
    technique = os.path.basename(os.path.dirname(log_file))

    with open(log_file, 'r') as handle:
        sections = handle.read().split('==================================================')

    records = []
    for section in sections:
        if 'Clustering Metrics' not in section:
            continue
        parsed = extract_metrics(section, technique)
        if parsed is not None:
            records.append(parsed)
    return records

def analyze_metrics(df):
    """Print the best-scoring configuration for each clustering metric.

    "Best" is the row with the maximum Silhouette Score, the maximum
    Calinski-Harabasz Index and the minimum Davies-Bouldin Index. For each
    one the identifying columns and the winning score are printed.

    Args:
        df: DataFrame with the columns ``'Reduction Technique'``,
            ``'Method'``, ``'Metric'``, ``'Norm'`` and the three score
            columns.
    """
    id_columns = ['Reduction Technique', 'Method', 'Metric', 'Norm']
    # (score column, Series method that returns the index of the best row)
    selectors = (
        ('Silhouette Score', 'idxmax'),
        ('Calinski-Harabasz Index', 'idxmax'),
        ('Davies-Bouldin Index', 'idxmin'),
    )

    # Resolve every winner before printing so a lookup failure produces no
    # partial output.
    winners = [df.loc[getattr(df[column], picker)()] for column, picker in selectors]

    print("Best combinations:")
    for (column, _), winner in zip(selectors, winners):
        print(f"{column}: {winner[id_columns + [column]]}")


def plot_combined_heatmap(df, output_dir, metrics_to_plot='all'):
    """Save one annotated heatmap figure per reduction technique.

    Each figure stacks one heatmap per selected metric. Rows are linkage
    methods, columns are (Metric, Norm) pairs and each cell is the mean
    score for that combination. The best cell(s) are outlined in lime and
    the worst in red. A separate ``heatmap_legend.png`` explaining the
    colours is written as well.

    Args:
        df: Metrics DataFrame (identifier columns plus the three scores).
        output_dir: Directory that receives the PNG files.
        metrics_to_plot: ``'all'`` or the name of a single score column.

    Raises:
        ValueError: If ``metrics_to_plot`` is not a recognised choice.
    """
    known_metrics = ['Silhouette Score', 'Calinski-Harabasz Index', 'Davies-Bouldin Index']

    if metrics_to_plot != 'all' and metrics_to_plot not in known_metrics:
        raise ValueError("Invalid metric specified. Choose 'all' or one of 'Silhouette Score', 'Calinski-Harabasz Index', 'Davies-Bouldin Index'")
    selected = known_metrics if metrics_to_plot == 'all' else [metrics_to_plot]
    panel_count = len(selected)

    for technique in df['Reduction Technique'].unique():
        subset = df[df['Reduction Technique'] == technique]

        fig, axes = plt.subplots(panel_count, 1, figsize=(20, 8 * panel_count))
        # plt.subplots returns a bare Axes (not a sequence) for one panel.
        panels = [axes] if panel_count == 1 else axes

        for ax, metric in zip(panels, selected):
            grouped = subset.groupby(['Method', 'Metric', 'Norm'])[metric].mean()
            pivot = grouped.unstack(level=[1, 2])
            lowest = pivot.min().min()
            highest = pivot.max().max()

            if metric == 'Davies-Bouldin Index':
                cmap, interpretation = 'YlOrRd_r', 'Lower is better'
                # Upper bound comes from the full frame, not this subset.
                vmin, vmax = 0, df[metric].max()
                best_value, worst_value = lowest, highest
            else:
                cmap, interpretation = 'YlGnBu', 'Higher is better'
                if metric == 'Silhouette Score':
                    vmin, vmax = 0, 1
                else:  # Calinski-Harabasz Index: let seaborn pick the range
                    vmin, vmax = None, None
                best_value, worst_value = highest, lowest

            sns.heatmap(
                pivot,
                annot=True,
                cmap=cmap,
                fmt='.2f',
                cbar_kws={'label': metric},
                ax=ax,
                annot_kws={'size': 10},
                vmin=vmin,
                vmax=vmax,
            )

            # Outline the best cell(s) first, then the worst.
            for target, colour in ((best_value, 'lime'), (worst_value, 'red')):
                hit_rows, hit_cols = np.where(pivot == target)
                for row, col in zip(hit_rows, hit_cols):
                    ax.add_patch(plt.Rectangle((col, row), 1, 1, fill=False, edgecolor=colour, lw=3))

            ax.set_title(f'{metric}\n{interpretation}', fontsize=20)
            ax.set_xlabel('Metric, Norm', fontsize=16)
            ax.set_ylabel('Method', fontsize=16)

            ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
            ax.set_yticklabels(ax.get_yticklabels(), fontsize=12)

            # Enlarge the text objects already attached to the axes.
            for label in ax.texts:
                label.set_fontsize(12)

        plt.tight_layout()
        stem = f'{technique}_clustering_performance_heatmap'
        if metrics_to_plot != 'all':
            stem += f'_{metrics_to_plot.replace(" ", "_").lower()}'
        plt.savefig(os.path.join(output_dir, f'{stem}.png'), dpi=300, bbox_inches='tight')
        plt.close()

    # Stand-alone legend explaining the score direction and border colours.
    legend_lines = (
        (0.8, "Interpretation:", {'fontweight': 'bold', 'fontsize': 14}),
        (0.6, "Silhouette Score & Calinski-Harabasz Index: Higher is better", {'fontsize': 12}),
        (0.4, "Davies-Bouldin Index: Lower is better", {'fontsize': 12}),
        (0.2, "Green border: Best score for each metric", {'fontsize': 12}),
        (0.0, "Red border: Worst score for each metric", {'fontsize': 12}),
    )
    legend_fig = plt.figure(figsize=(10, 5))
    legend_ax = legend_fig.add_subplot(111)
    legend_ax.axis('off')
    for y_pos, message, text_kwargs in legend_lines:
        legend_ax.text(0.1, y_pos, message, **text_kwargs)
    plt.savefig(os.path.join(output_dir, 'heatmap_legend.png'), dpi=300, bbox_inches='tight')
    plt.close()

def plot_normalized_scores(df_normalized, output_dir):
    """Save a scatter plot of the three scores for each reduction technique.

    Silhouette Score is drawn on the x axis, Calinski-Harabasz Index on the
    y axis and Davies-Bouldin Index as the marker fill colour. The marker
    edge colour identifies the linkage method. The row with the highest
    Silhouette Score, the highest Calinski-Harabasz Index and the lowest
    Davies-Bouldin Index are each annotated. A stand-alone
    ``method_legend.png`` is written as well.

    Args:
        df_normalized: DataFrame with ``'Reduction Technique'``,
            ``'Method'``, ``'Metric'``, ``'Norm'`` and the three score
            columns. Davies-Bouldin Index is treated as lower-is-better.
        output_dir: Directory that receives the PNG files.
    """
    # Edge colour per linkage method; anything not listed falls back to
    # black instead of raising KeyError.
    method_colors = {'single': 'red', 'complete': 'blue', 'average': 'green', 'weighted': 'purple'}
    fallback_color = 'black'

    for technique in df_normalized['Reduction Technique'].unique():
        plt.figure(figsize=(30, 20))

        df_tech = df_normalized[df_normalized['Reduction Technique'] == technique]

        # Share one colour scale across every scatter call. Otherwise each
        # call auto-scales to its own data and the colorbar, which is
        # built from the last call, does not describe the other methods.
        db_min = df_tech['Davies-Bouldin Index'].min()
        db_max = df_tech['Davies-Bouldin Index'].max()

        for method in df_tech['Method'].unique():
            df_method = df_tech[df_tech['Method'] == method]
            plt.scatter(df_method['Silhouette Score'],
                        df_method['Calinski-Harabasz Index'],
                        c=df_method['Davies-Bouldin Index'],
                        s=200,
                        cmap='viridis_r',
                        vmin=db_min,
                        vmax=db_max,
                        marker='o',
                        edgecolors=method_colors.get(method, fallback_color),
                        linewidth=2,
                        alpha=0.7,
                        label=method)

        plt.colorbar(label='Davies-Bouldin Index (Lower is better)')

        plt.title(f'Normalized Clustering Performance - {technique}', fontsize=20)
        plt.xlabel('Silhouette Score (Higher is better)', fontsize=20)
        plt.ylabel('Calinski-Harabasz Index (Higher is better)', fontsize=14)

        plt.legend(title='Method', title_fontsize=12, fontsize=10)

        plt.grid(True, linestyle='--', alpha=0.7)

        # Annotate the best performer for each of the three scores.
        best_silhouette = df_tech.loc[df_tech['Silhouette Score'].idxmax()]
        best_calinski = df_tech.loc[df_tech['Calinski-Harabasz Index'].idxmax()]
        best_davies = df_tech.loc[df_tech['Davies-Bouldin Index'].idxmin()]

        for best in [best_silhouette, best_calinski, best_davies]:
            plt.annotate(f"{best['Method']}\n{best['Metric']}-{best['Norm']}",
                         (best['Silhouette Score'], best['Calinski-Harabasz Index']),
                         xytext=(5, 5), textcoords='offset points', fontsize=16,
                         bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="gray", alpha=1.0),
                         arrowprops=dict(arrowstyle="->", connectionstyle="arc3,rad=0.3"))

        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, f'normalized_clustering_performance_{technique}.png'), dpi=300, bbox_inches='tight')
        plt.close()

    # Stand-alone legend mapping each known method to its colour; the empty
    # scatter calls exist only to create legend handles.
    plt.figure(figsize=(8, 4))
    for method, color in method_colors.items():
        plt.scatter([], [], c=color, label=method, s=100)
    plt.legend(title='Method', title_fontsize=14, fontsize=12, loc='center')
    plt.axis('off')
    plt.savefig(os.path.join(output_dir, 'method_legend.png'), dpi=300, bbox_inches='tight')
    plt.close()
    


def plot_reduction_technique_summary(df, output_dir, metrics_to_plot='all', central_tendency='median'):
    """Save box plots comparing the reduction techniques on each metric.

    One panel is drawn per selected metric, with one box per reduction
    technique and whiskers spanning the full data range. The chosen central
    tendency is written next to each box. The finished figure is saved
    twice, as ``reduction_technique_summary_boxplot[...]`` and
    ``reduction_technique_summary[...]``, and then shown.

    Args:
        df: Metrics DataFrame (identifier columns plus the three scores).
        output_dir: Directory that receives the PNG files.
        metrics_to_plot: ``'all'`` or the name of a single score column.
        central_tendency: ``'mean'`` or ``'median'``.

    Raises:
        ValueError: If ``metrics_to_plot`` or ``central_tendency`` is not a
            recognised choice.
    """
    all_metrics = ['Silhouette Score', 'Calinski-Harabasz Index', 'Davies-Bouldin Index']

    if metrics_to_plot == 'all':
        metrics = all_metrics
    elif metrics_to_plot in all_metrics:
        metrics = [metrics_to_plot]
    else:
        raise ValueError("Invalid metric specified. Choose 'all' or one of 'Silhouette Score', 'Calinski-Harabasz Index', 'Davies-Bouldin Index'")

    if central_tendency not in ['mean', 'median']:
        raise ValueError("Invalid central tendency specified. Choose 'mean' or 'median'")

    fig, axes = plt.subplots(1, len(metrics), figsize=(8 * len(metrics), 8))
    if len(metrics) == 1:
        axes = [axes]  # plt.subplots returns a bare Axes for a single panel
    fig.suptitle(f'Summary of Clustering Performance Across Reduction Techniques\n(Central Tendency: {central_tendency.capitalize()})', fontsize=24)

    techniques = df['Reduction Technique'].unique()
    labels = list(techniques)

    for ax, metric in zip(axes, metrics):
        # One sample of scores per reduction technique.
        plot_data = [df[df['Reduction Technique'] == technique][metric] for technique in techniques]

        # whis=[0, 100] stretches the whiskers to the min and max.
        bp = ax.boxplot(plot_data, tick_labels=labels, patch_artist=True, whis=[0, 100])

        for box in bp['boxes']:
            box.set(facecolor='lightblue', edgecolor='blue', alpha=0.7)

        for median in bp['medians']:
            median.set(color='red', linewidth=2)

        # Keep the cap lines visible.
        for cap in bp['caps']:
            cap.set(visible=True)

        for whisker in bp['whiskers']:
            whisker.set(linestyle='-', color='black')

        ax.set_ylabel('Score', fontsize=14)
        ax.set_title(metric, fontsize=18)
        ax.set_xticklabels(labels, rotation=45, ha='right', fontsize=12)

        if metric == 'Davies-Bouldin Index':
            ax.invert_yaxis()  # Lower is better for Davies-Bouldin Index

        # Box positions are 1-based, hence start=1.
        for position, values in enumerate(plot_data, start=1):
            value = np.mean(values) if central_tendency == 'mean' else np.median(values)
            ax.text(position, value, f'{value:.2f}', horizontalalignment='center',
                    verticalalignment='bottom', fontweight='bold')

    fig.tight_layout()

    suffix = ''
    if metrics_to_plot != 'all':
        suffix = f'_{metrics_to_plot.replace(" ", "_").lower()}'
    suffix += f'_{central_tendency}'

    # Save both files from the live figure before show()/close(). Saving
    # after either one can write a blank image.
    for stem in ('reduction_technique_summary_boxplot', 'reduction_technique_summary'):
        fig.savefig(os.path.join(output_dir, f'{stem}{suffix}.png'), dpi=300, bbox_inches='tight')

    plt.show()
    plt.close(fig)
    
def plot_combined_visualization(df, output_dir, metrics_to_plot='all', central_tendency='median'):
    """Build the heatmaps and summary plot, then tile them into one image.

    The per-technique heatmap PNGs and the summary PNG are first written by
    ``plot_combined_heatmap`` and ``plot_reduction_technique_summary``.
    They are then read back from ``output_dir`` and arranged on a two-row
    grid: one heatmap per technique on top, the summary across the bottom.

    Args:
        df: Metrics DataFrame (identifier columns plus the three scores).
        output_dir: Directory holding the intermediate and combined PNGs.
        metrics_to_plot: ``'all'`` or the name of a single score column.
        central_tendency: ``'mean'`` or ``'median'``.

    Raises:
        ValueError: If ``metrics_to_plot`` is not a recognised choice.
    """
    # Produce the component images that are re-loaded below.
    plot_combined_heatmap(df, output_dir, metrics_to_plot)
    plot_reduction_technique_summary(df, output_dir, metrics_to_plot, central_tendency)

    techniques = df['Reduction Technique'].unique()

    known_metrics = ['Silhouette Score', 'Calinski-Harabasz Index', 'Davies-Bouldin Index']
    if metrics_to_plot == 'all':
        metric_count = len(known_metrics)
    elif metrics_to_plot in known_metrics:
        metric_count = 1
    else:
        raise ValueError("Invalid metric specified. Choose 'all' or one of 'Silhouette Score', 'Calinski-Harabasz Index', 'Davies-Bouldin Index'")

    # File-name fragment shared by every image for this metric selection.
    metric_suffix = ''
    if metrics_to_plot != 'all':
        metric_suffix = f'_{metrics_to_plot.replace(" ", "_").lower()}'

    fig = plt.figure(figsize=(30, 10 + 5 * metric_count))
    grid = fig.add_gridspec(2, len(techniques))

    # Top row: one saved heatmap image per technique.
    for column, technique in enumerate(techniques):
        heatmap_path = os.path.join(
            output_dir,
            f'{technique}_clustering_performance_heatmap' + metric_suffix + '.png',
        )
        heatmap_ax = fig.add_subplot(grid[0, column])
        heatmap_ax.imshow(plt.imread(heatmap_path))
        heatmap_ax.axis('off')
        heatmap_ax.set_title(technique, fontsize=20)

    # Bottom row: the summary image spanning every column.
    summary_path = os.path.join(
        output_dir,
        'reduction_technique_summary' + metric_suffix + f'_{central_tendency}.png',
    )
    summary_ax = fig.add_subplot(grid[1, :])
    summary_ax.imshow(plt.imread(summary_path))
    summary_ax.axis('off')

    plt.tight_layout()
    combined_path = os.path.join(
        output_dir,
        'combined_visualization' + metric_suffix + f'_{central_tendency}.png',
    )
    plt.savefig(combined_path, dpi=300, bbox_inches='tight')
    plt.close()

    
    
def _min_max_normalize(series):
    """Rescale *series* linearly to [0, 1]; a constant series maps to 0.0."""
    low = series.min()
    span = series.max() - low
    if span == 0:
        # Avoid 0/0, which would fill the column with NaN.
        return series * 0.0
    return (series - low) / span


def run():
    """Parse the clustering logs, write the summary CSV and render all plots.

    Each log file is parsed in a worker process. The parsed records are
    averaged per (Reduction Technique, Method, Metric, Norm) combination,
    written to ``clustering_metrics_summary.csv`` and passed to the
    analysis and plotting functions.

    Raises:
        ValueError: If no metric record could be parsed from any log file.
    """
    log_files = [
        'Analysis/MDS/dimensionality_reduction_log.txt',
        'Analysis/PCA/dimensionality_reduction_log.txt',
        'Analysis/SVD/dimensionality_reduction_log.txt',
        'Analysis/t-sne/dimensionality_reduction_log.txt',
        'Analysis/UMAP/dimensionality_reduction_log.txt'
    ]

    output_dir = './Analysis/reduction-techniques'
    os.makedirs(output_dir, exist_ok=True)

    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(process_log_file, log_file) for log_file in log_files]
        records = [record for future in as_completed(futures) for record in future.result()]

    if not records:
        # An empty frame would otherwise fail in groupby with a KeyError
        # that does not say what went wrong.
        raise ValueError("No clustering metrics could be parsed from the log files")

    df = pd.DataFrame(records)

    # Collapse duplicate configurations by averaging their scores.
    df = df.groupby(['Reduction Technique', 'Method', 'Metric', 'Norm']).mean().reset_index()

    df.to_csv(os.path.join(output_dir, 'clustering_metrics_summary.csv'), index=False)

    analyze_metrics(df)

    # Accepted values: 'Silhouette Score', 'Calinski-Harabasz Index',
    # 'Davies-Bouldin Index' or 'all'.
    metric_choice = 'Silhouette Score'
    plot_combined_visualization(df, output_dir, metrics_to_plot=metric_choice, central_tendency='median')

    # Min-max normalise every score. Davies-Bouldin is deliberately NOT
    # inverted: plot_normalized_scores labels it "Lower is better" and picks
    # its best row with idxmin, so inverting it here would annotate the
    # worst configuration as the best.
    df_normalized = df.copy()
    for col in ['Silhouette Score', 'Calinski-Harabasz Index', 'Davies-Bouldin Index']:
        df_normalized[col] = _min_max_normalize(df[col])

    plot_normalized_scores(df_normalized, output_dir)

# Guard the entry point: worker processes started with the "spawn" method
# re-import this module, and an unguarded call would make every worker
# execute run() again.
if __name__ == "__main__":
    run()