In [None]:
"""
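This cell loads every run found in the experiments directory (e.g. a hyperparameter sweep), derives per-run improvement metrics relative to the base model, and plots how many runs exist per hyperparameter value.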
"""
import os
import json
import pandas as pd
import seaborn as sns
from torch.utils.data import DataLoader, TensorDataset
import torch
from torch import nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from lib.models import TestModel
from lib.train_utils import compute_loss_and_f1, plot_hyperparameter_counts, load_experiments_from_dir

# Root directory of the sweep; later cells build per-run artifact paths from it.
experiments_dir = './experiments'

# Prefer the GPU but fall back to the CPU so the notebook still runs on a
# machine without CUDA.
# NOTE(review): assumes load_experiments_from_dir accepts 'cpu' as a device — confirm.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One DataFrame holding every run; all following cells read from `df`.
df = load_experiments_from_dir(experiments_dir, device=device)

def get_base_test_f1(row):
    """Return the test F1 of the base model that ``row`` should be compared to.

    A row whose ``mode`` is ``'base'`` is its own reference. For any other row
    the base run is looked up in the module-level ``df`` by matching
    ``experiment`` against the row's ``base_experiment_prefix``; the first
    match is used and ``None`` is returned when nothing matches.
    """
    if row['mode'] == 'base':
        return row['test_f1']

    matches = df.loc[df['experiment'] == row['base_experiment_prefix'], 'test_f1']
    if len(matches) == 0:
        return None
    return matches.iloc[0]

def get_base_val_f1(row):
    """Return the target-validation F1 of the base model that ``row`` builds on.

    A row whose ``mode`` is ``'base'`` is its own reference. For any other row
    the base run is looked up in the module-level ``df`` by matching
    ``experiment`` against the row's ``base_experiment_prefix``; the first
    match is used and ``None`` is returned when nothing matches.
    """
    if row['mode'] == 'base':
        return row['target_val_f1']

    matches = df.loc[df['experiment'] == row['base_experiment_prefix'], 'target_val_f1']
    if len(matches) == 0:
        return None
    return matches.iloc[0]
    
def _lookup_base_metric(row, column):
    """Return ``column`` taken from the base run that ``row`` builds on.

    A 'base' row is its own reference. Otherwise the first row of ``df`` whose
    ``experiment`` equals the row's ``base_experiment_prefix`` is used, or
    None when no such row exists.
    """
    if row['mode'] == 'base':
        return row[column]
    # Filter once and reuse the selection for both the emptiness check and the value.
    base_values = df.loc[df['experiment'] == row['base_experiment_prefix'], column]
    return base_values.iloc[0] if len(base_values) > 0 else None


# Reference (base-model) F1 scores for every run.
df['base_test_f1'] = df.apply(get_base_test_f1, axis=1)
df['base_target_val_f1'] = df.apply(get_base_val_f1, axis=1)

# Test-set improvement over the base model: absolute, relative to the base
# score, and relative to the headroom left below a perfect score of 1.0.
# NOTE: the two ratios are undefined when their denominator is 0
# (a base F1 of 0, or a base F1 of 1 for the headroom metric).
df['absolute_improvement'] = df['test_f1'] - df['base_test_f1']
df['relative_improvement'] = df['absolute_improvement'] / df['base_test_f1']
df['room_for_improvement'] = 1.0 - df['base_test_f1']
df['relative_to_room_for_improvement'] = df['absolute_improvement'] / df['room_for_improvement']

# Same three improvement measures on the validation split.
df['absolute_improvement_val'] = df['best_val_f1'] - df['base_target_val_f1']
df['relative_improvement_val'] = df['absolute_improvement_val'] / df['base_target_val_f1']
df['room_for_improvement_val'] = 1.0 - df['base_target_val_f1']
df['relative_to_room_for_improvement_val'] = df['absolute_improvement_val'] / df['room_for_improvement_val']

# Precision / recall of the base model and the change achieved by each run.
df['base_test_precision'] = df.apply(lambda row: _lookup_base_metric(row, 'test_precision'), axis=1)
df['base_test_recall'] = df.apply(lambda row: _lookup_base_metric(row, 'test_recall'), axis=1)
df['precision_improvement'] = df['test_precision'] - df['base_test_precision']
df['recall_improvement'] = df['test_recall'] - df['base_test_recall']

# Sanity check: total number of runs, then how many runs exist per value of
# each hyperparameter of interest.
hyperparameters_to_plot = ['target_data_pct','base_channels','mode']

print(len(df))
for column_name in hyperparameters_to_plot:
    value_counts = df[column_name].value_counts()
    print(value_counts)

plot_hyperparameter_counts(df, hyperparameters_to_plot)

In [None]:
# Work on a copy so exploratory filtering never touches the shared frame.
df_plot = df.copy()
# Optional: uncomment to look at fine-tuned runs only.
# df_plot = df_plot[df_plot['mode'] == 'target_only_fine_tuning']

# Test F1 per model width, one box per target-data fraction.
sns.boxplot(x='base_channels', y='test_f1', hue='target_data_pct', data=df_plot)

In [None]:
# Test F1 per model width, one box per training mode (reuses df_plot from the previous cell).
sns.boxplot(x='base_channels', y='test_f1', hue='mode', data=df_plot)

In [None]:
# Figure 1: test F1 of the base models, overall (A) and per fold (B).

# Global matplotlib defaults for print-quality figures.
publication_rc = {
    'font.size': 12,
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'figure.dpi': 300,
    'axes.linewidth': 1.5,
    'axes.spines.top': False,
    'axes.spines.right': False,
}
plt.rcParams.update(publication_rc)


def _mono_box_style():
    """Return fresh keyword arguments for a black-and-white seaborn boxplot."""
    outline = dict(color='black', linewidth=1.5)
    return dict(
        color='white',
        linewidth=1.5,
        boxprops=dict(edgecolor='black', linewidth=1.5),
        whiskerprops=dict(outline),
        capprops=dict(outline),
        medianprops=dict(color='black', linewidth=2),
        flierprops=dict(marker='o', markerfacecolor='white',
                        markeredgecolor='black', markersize=4),
    )


fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5), dpi=300)

# Panel A pools every base run; panel B splits the same runs by fold.
sns.boxplot(data=df[df['mode'] == 'base'], y='test_f1', ax=ax[0], **_mono_box_style())
sns.boxplot(data=df[df['mode'] == 'base'], x='fold', y='test_f1', ax=ax[1], **_mono_box_style())

# NOTE(review): the plotted column is 'test_f1' while the y-axis label reads
# 'Validation F1 Score' — confirm the wording matches the manuscript.
panel_text = [
    ('(A) Base Model Performance', 'Base Model'),
    ('(B) Cross-Validation Fold Performance', 'Cross-Validation Fold'),
]
for axis, (panel_title, panel_xlabel) in zip(ax, panel_text):
    axis.set_ylim(0, 1)
    axis.set_title(panel_title, fontsize=16, fontweight='bold')
    axis.set_ylabel('Validation F1 Score', fontsize=14, fontweight='bold')
    axis.set_xlabel(panel_xlabel, fontsize=14, fontweight='bold')
    # Light dashed grid drawn underneath the boxes.
    axis.grid(True, alpha=0.3, linestyle='--')
    axis.set_axisbelow(True)

plt.tight_layout()

# Vector and raster copies for the manuscript.
save_options = dict(dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
plt.savefig('figure1.pdf', **save_options)
plt.savefig('figure1.png', **save_options)
plt.show()

In [None]:
from matplotlib.patches import Patch

# Figure 2: base vs. fine-tuned models (all target data, 64 base channels).
# Panels: (A) raw F1, (B) absolute, (C) relative and (D) headroom-relative improvement.
# .copy() yields an independent frame, so adding 'mode_clean' below does not
# write into a slice of df.
df_plot = df[
    (df['mode'] != 'target_only')
    & (df['target_data_pct'] == 1.0)
    & (df['base_channels'] == 64)
].copy()

# Clean up mode names for publication
df_plot['mode_clean'] = df_plot['mode'].map({
    'base': 'Base Model',
    'target_only_fine_tuning': 'Fine-Tuned Model'
})

fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(16, 12), dpi=300)
ax = axes.flatten()

# Distinct black-and-white styles for each method.
base_props = {
    'boxprops': dict(facecolor='white', edgecolor='black', linewidth=1.5),
    'whiskerprops': dict(color='black', linewidth=1.5),
    'capprops': dict(color='black', linewidth=1.5),
    'medianprops': dict(color='black', linewidth=2),
    'flierprops': dict(marker='o', markerfacecolor='white', markeredgecolor='black', markersize=5)
}

finetuned_props = {
    'boxprops': dict(facecolor='lightgray', edgecolor='black', linewidth=1.5, hatch='///'),
    'whiskerprops': dict(color='black', linewidth=1.5),
    'capprops': dict(color='black', linewidth=1.5),
    'medianprops': dict(color='black', linewidth=2),
    'flierprops': dict(marker='s', markerfacecolor='lightgray', markeredgecolor='black', markersize=5)
}


def _annotate_summary(axis, text):
    """Draw a boxed summary string centred near the top of ``axis``."""
    axis.text(0.5, 0.95, text,
              transform=axis.transAxes, fontsize=11, fontweight='bold',
              ha='center', va='top',
              bbox=dict(boxstyle='round,pad=0.3', facecolor='white', edgecolor='black'))


def _plot_improvement_panel(axis, values, ylabel, title, mean_template, box_style):
    """Draw one single-box improvement panel and return the mean of ``values``.

    ``mean_template`` is a ``str.format`` template with one field for the mean.
    Missing values (runs without a matching base run) are dropped so they
    cannot blank out the box.
    """
    values = values.dropna()
    axis.boxplot([values], positions=[1], widths=0.6, patch_artist=True, **box_style)
    axis.set_xticks([1])
    axis.set_xticklabels(['Fine-Tuned vs Base'], fontsize=12, fontweight='bold')
    axis.set_ylabel(ylabel, fontsize=14, fontweight='bold')
    axis.set_xlabel('Comparison', fontsize=14, fontweight='bold')
    axis.set_title(title, fontsize=14, fontweight='bold', pad=15)
    # Zero line: anything below it means fine-tuning hurt.
    axis.axhline(y=0, color='red', linestyle='--', alpha=0.7, linewidth=1)
    mean_value = values.mean()
    _annotate_summary(axis, mean_template.format(mean_value))
    return mean_value


# Plot 1: F1 Score Comparison (top-left)
base_data = df_plot[df_plot['mode'] == 'base']['test_f1']
finetuned_data = df_plot[df_plot['mode'] == 'target_only_fine_tuning']['test_f1']

positions = [1, 2]
width = 0.6

box1 = ax[0].boxplot([base_data], positions=[positions[0]], widths=width,
                     patch_artist=True, **base_props)
box2 = ax[0].boxplot([finetuned_data], positions=[positions[1]], widths=width,
                     patch_artist=True, **finetuned_props)

ax[0].set_xticks(positions)
ax[0].set_xticklabels(['Base Model', 'Fine-Tuned Model'], fontsize=12, fontweight='bold')
ax[0].set_ylabel('Test F1 Score', fontsize=14, fontweight='bold')
ax[0].set_xlabel('Model Type', fontsize=14, fontweight='bold')
ax[0].set_ylim(0, 1)
ax[0].set_title('(A) F1 Score Comparison', fontsize=14, fontweight='bold', pad=15)

# Difference of the group means; the '+' format flag prints the correct sign
# even when fine-tuning made things worse.
base_mean = base_data.mean()
finetuned_mean = finetuned_data.mean()
improvement = finetuned_mean - base_mean
_annotate_summary(ax[0], f'Mean Improvement: {improvement:+.3f} F1')

# Plots 2-4 use the per-run improvement columns of the fine-tuned runs only.
finetune_only_data = df_plot[df_plot['mode'] == 'target_only_fine_tuning']

# Plot 2: Absolute Improvement Distribution (top-right)
abs_improvement_data = finetune_only_data['absolute_improvement']
abs_mean = _plot_improvement_panel(
    ax[1], abs_improvement_data,
    ylabel='Absolute F1 Improvement',
    title='(B) Absolute Improvement Distribution',
    mean_template='Mean: {:+.3f} F1',
    box_style=finetuned_props)

# Plot 3: Relative Improvement Distribution (bottom-left), in percent
rel_improvement_data = finetune_only_data['relative_improvement'] * 100
rel_mean = _plot_improvement_panel(
    ax[2], rel_improvement_data,
    ylabel='Relative Improvement (%)',
    title='(C) Relative Improvement Distribution',
    mean_template='Mean: {:+.1f}%',
    box_style=finetuned_props)

# Plot 4: Room for Improvement Distribution (bottom-right), in percent
room_improvement_data = finetune_only_data['relative_to_room_for_improvement'] * 100
room_mean = _plot_improvement_panel(
    ax[3], room_improvement_data,
    ylabel='% of Room for Improvement',
    title='(D) Relative to Room for Improvement',
    mean_template='Mean: {:+.1f}%',
    box_style=finetuned_props)

# Light dashed grid underneath the boxes on every panel.
for panel in ax:
    panel.grid(True, alpha=0.3, linestyle='--')
    panel.set_axisbelow(True)

# Legend explaining the two box styles (first panel only).
legend_elements = [
    Patch(facecolor='white', edgecolor='black', linewidth=1.5, label='Base Model (Population-trained)'),
    Patch(facecolor='lightgray', edgecolor='black', linewidth=1.5, hatch='///', label='Fine-Tuned Model (Individual-customized)')
]
ax[0].legend(handles=legend_elements, loc='lower right', fontsize=10, frameon=True,
             fancybox=False, edgecolor='black')

plt.tight_layout()

# Vector and raster copies for the manuscript.
save_options = dict(dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
plt.savefig('figure2.pdf', **save_options)
plt.savefig('figure2.png', **save_options)

In [None]:
from matplotlib.patches import Patch

# Figure 3: test F1 of the three training approaches (all target data, 64 base channels).
# .copy() yields an independent frame, so adding 'mode_clean' below does not
# write into a slice of df.
df_plot = df[(df['target_data_pct'] == 1.0) & (df['base_channels'] == 64)].copy()

# Clean up mode names for publication
df_plot['mode_clean'] = df_plot['mode'].map({
    'base': 'Base Model\n(Population)',
    'target_only': 'Target-Only\n(From Scratch)',
    'target_only_fine_tuning': 'Fine-Tuned\n(Our Method)'
})

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 6), dpi=300)

# Distinct black-and-white styles for each method.
base_props = {
    'boxprops': dict(facecolor='white', edgecolor='black', linewidth=1.5),
    'whiskerprops': dict(color='black', linewidth=1.5),
    'capprops': dict(color='black', linewidth=1.5),
    'medianprops': dict(color='black', linewidth=2),
    'flierprops': dict(marker='o', markerfacecolor='white', markeredgecolor='black', markersize=5)
}

target_only_props = {
    'boxprops': dict(facecolor='lightgray', edgecolor='black', linewidth=1.5, hatch='...'),
    'whiskerprops': dict(color='black', linewidth=1.5),
    'capprops': dict(color='black', linewidth=1.5),
    'medianprops': dict(color='black', linewidth=2),
    'flierprops': dict(marker='^', markerfacecolor='lightgray', markeredgecolor='black', markersize=5)
}

finetuned_props = {
    'boxprops': dict(facecolor='darkgray', edgecolor='black', linewidth=1.5, hatch='///'),
    'whiskerprops': dict(color='black', linewidth=1.5),
    'capprops': dict(color='black', linewidth=1.5),
    'medianprops': dict(color='black', linewidth=2),
    'flierprops': dict(marker='s', markerfacecolor='darkgray', markeredgecolor='black', markersize=5)
}

# Test F1 of every run, per method.
base_data = df_plot[df_plot['mode'] == 'base']['test_f1']
target_only_data = df_plot[df_plot['mode'] == 'target_only']['test_f1']
finetuned_data = df_plot[df_plot['mode'] == 'target_only_fine_tuning']['test_f1']

positions = [1, 2, 3]
width = 0.6

# One boxplot call per method so each box gets its own style.
box1 = ax.boxplot([base_data], positions=[positions[0]], widths=width,
                  patch_artist=True, **base_props)
box2 = ax.boxplot([target_only_data], positions=[positions[1]], widths=width,
                  patch_artist=True, **target_only_props)
box3 = ax.boxplot([finetuned_data], positions=[positions[2]], widths=width,
                  patch_artist=True, **finetuned_props)

ax.set_xticks(positions)
ax.set_xticklabels(['Base Model\n(Population)', 'Target-Only\n(From Scratch)', 'Fine-Tuned\n(Our Method)'],
                   fontsize=12, fontweight='bold')
ax.set_ylabel('Test F1 Score', fontsize=14, fontweight='bold')
ax.set_xlabel('Training Approach', fontsize=14, fontweight='bold')
ax.set_ylim(0, 1)

# Light dashed grid underneath the boxes.
ax.grid(True, alpha=0.3, linestyle='--')
ax.set_axisbelow(True)

# Group means and the gain of fine-tuning over each alternative
# (computed for reference; not drawn on the figure).
base_mean = base_data.mean()
target_only_mean = target_only_data.mean()
finetuned_mean = finetuned_data.mean()

improvement_vs_base = finetuned_mean - base_mean
improvement_vs_target_only = finetuned_mean - target_only_mean

# Legend explaining the three box styles. The arrow is written as an escape
# (U+2192, rightwards arrow) so it survives re-encoding of the notebook file.
legend_elements = [
    Patch(facecolor='white', edgecolor='black', linewidth=1.5,
          label='Base Model (Population-trained)'),
    Patch(facecolor='lightgray', edgecolor='black', linewidth=1.5, hatch='...',
          label='Target-Only (Individual from scratch)'),
    Patch(facecolor='darkgray', edgecolor='black', linewidth=1.5, hatch='///',
          label='Fine-Tuned (Population \u2192 Individual)')
]

ax.legend(handles=legend_elements, loc='center right', fontsize=11, frameon=True,
          fancybox=False, edgecolor='black')

plt.tight_layout()

# Vector and raster copies for the manuscript.
save_options = dict(dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
plt.savefig('figure3.pdf', **save_options)
plt.savefig('figure3.png', **save_options)

In [None]:
from matplotlib.patches import Patch

# Figure 4: test F1 versus amount of target data, for the two approaches that
# train on target data (64 base channels only).
df_plot = df.copy()
keep_rows = (df_plot['mode'] != 'base') & (df_plot['base_channels'] == 64)
df_plot = df_plot[keep_rows]

# Publication-friendly names for the two remaining training modes.
mode_display_names = {
    'target_only': 'Target-Only\n(From Scratch)',
    'target_only_fine_tuning': 'Fine-Tuned\n(Our Method)',
}
df_plot['mode_clean'] = df_plot['mode'].map(mode_display_names)

# Fraction -> integer percentage string; astype(int) truncates (12.5 becomes '12%').
pct_as_int = (df_plot['target_data_pct'] * 100).astype(int)
df_plot['target_data_pct_label'] = pct_as_int.astype(str) + '%'

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 6), dpi=300)

# Grayscale fill / hatch per data percentage, light (little data) to dark (all data).
data_pct_patterns = {
    '5%': dict(facecolor='white', hatch=None, edgecolor='black', alpha=1.0),
    '12%': dict(facecolor='lightgray', hatch='///', edgecolor='black', alpha=0.9),
    '25%': dict(facecolor='gray', hatch='...', edgecolor='black', alpha=0.8),
    '50%': dict(facecolor='darkgray', hatch='xxx', edgecolor='black', alpha=0.7),
    '100%': dict(facecolor='black', hatch=None, edgecolor='black', alpha=0.6),
}

# Drawing order: percentages within a group, and groups along the x-axis.
data_pct_order = ['5%', '12%', '25%', '50%', '100%']
mode_order = ['Target-Only\n(From Scratch)', 'Fine-Tuned\n(Our Method)']

# Layout: boxes of one approach sit pct_spacing apart; consecutive approaches
# start group_stride apart.
mode_spacing = 1.0
pct_spacing = 0.15
group_stride = mode_spacing + len(data_pct_order) * pct_spacing

# Parallel lists describing every box that has data.
positions = []
box_data = []
colors = []
hatches = []
alphas = []

for mode_idx, mode in enumerate(mode_order):
    in_mode = df_plot['mode_clean'] == mode
    for pct_idx, pct in enumerate(data_pct_order):
        data_subset = df_plot[in_mode & (df_plot['target_data_pct_label'] == pct)]['test_f1']

        # Combinations without any run get no box (and no slot in the lists).
        if len(data_subset) == 0:
            continue

        positions.append(mode_idx * group_stride + pct_idx * pct_spacing)
        box_data.append(data_subset.values)

        pattern_info = data_pct_patterns[pct]
        colors.append(pattern_info['facecolor'])
        hatches.append(pattern_info['hatch'])
        alphas.append(pattern_info['alpha'])

# Draw all boxes in one call; per-box fills are applied afterwards.
box_plot = ax.boxplot(
    box_data,
    positions=positions,
    widths=0.12,
    patch_artist=True,
    showfliers=True,
    boxprops=dict(linewidth=1.5),
    medianprops=dict(linewidth=2, color='red'),
    whiskerprops=dict(linewidth=1.5),
    capprops=dict(linewidth=1.5),
    flierprops=dict(marker='o', markersize=4, alpha=0.7),
)

for box_patch, face, hatch_style, opacity in zip(box_plot['boxes'], colors, hatches, alphas):
    box_patch.set_facecolor(face)
    box_patch.set_hatch(hatch_style)
    box_patch.set_alpha(opacity)
    box_patch.set_edgecolor('black')
    box_patch.set_linewidth(1.5)

# One tick per approach, centred under its group of boxes.
mode_positions = [
    group_index * group_stride + (len(data_pct_order) - 1) * pct_spacing / 2
    for group_index in range(len(mode_order))
]
ax.set_xticks(mode_positions)
ax.set_xticklabels(mode_order, fontsize=12, fontweight='bold')

ax.set_ylabel('Test F1 Score', fontsize=14, fontweight='bold')
ax.set_xlabel('Training Approach', fontsize=14, fontweight='bold')
ax.set_ylim(0, 1)

# Legend keyed by data percentage, using the same fills as the boxes.
legend_elements = [
    Patch(facecolor=data_pct_patterns[pct_label]['facecolor'],
          edgecolor='black',
          linewidth=1.5,
          hatch=data_pct_patterns[pct_label]['hatch'],
          alpha=data_pct_patterns[pct_label]['alpha'],
          label=f'{pct_label} Target Data')
    for pct_label in data_pct_order
]
ax.legend(handles=legend_elements, loc='center right', fontsize=10,
          frameon=True, fancybox=False, edgecolor='black',
          title='Available Target Data', title_fontsize=11)

# Light dashed grid underneath the boxes.
ax.grid(True, alpha=0.3, linestyle='--')
ax.set_axisbelow(True)

plt.tight_layout()

# Vector and raster copies for the manuscript.
save_options = dict(dpi=300, bbox_inches='tight', facecolor='white', edgecolor='none')
plt.savefig('figure4.pdf', **save_options)
plt.savefig('figure4.png', **save_options)

In [None]:
import pandas as pd
from scipy import stats
from scipy.stats import shapiro

# Paired one-sided test: is fine-tuning better than training on target data
# from scratch, when both use all of the target data?
df_plot = df.copy()
df_plot = df_plot[df_plot['target_data_pct'] == 1.0]

# Filter out 'base' mode
df_plot = df_plot[df_plot['mode'] != 'base']

# One row per (fold, seed), one column per mode.
# NOTE(review): base_channels is not filtered here, unlike the figure cells, so
# pivot_table averages runs that share (fold, seed_finetune) — confirm intended.
df_pivot = df_plot.pivot_table(index=['fold', 'seed_finetune'], columns='mode', values='test_f1')

# Keep only rows where BOTH modes have a score. Dropping NaNs from each series
# separately would misalign the pairs (or leave the two samples with
# different lengths).
paired_scores = df_pivot[['target_only', 'target_only_fine_tuning']].dropna()
f1_target_only = paired_scores['target_only']
f1_fine_tuning = paired_scores['target_only_fine_tuning']

# Per-pair differences (positive = fine-tuning is better).
differences = f1_fine_tuning - f1_target_only
n_pairs = len(differences)
if n_pairs == 0:
    raise ValueError("No (fold, seed_finetune) pair has a test F1 for both modes.")

mean_diff = differences.mean()
print(f"Mean difference in test F1: {mean_diff:.4f}")

# Shapiro-Wilk needs at least 3 observations; with fewer pairs, skip the
# normality check and fall through to the non-parametric test.
if n_pairs >= 3:
    shapiro_stat, shapiro_p = shapiro(differences)
    print(f"Shapiro-Wilk p-value: {shapiro_p:.4f} (if >0.05, assume normal)")
else:
    shapiro_stat, shapiro_p = float('nan'), float('nan')
    print(f"Only {n_pairs} complete pair(s); skipping Shapiro-Wilk and using the Wilcoxon signed-rank test.")

# Paired t-test when the differences look normal, Wilcoxon signed-rank otherwise.
if n_pairs >= 3 and shapiro_p > 0.05:
    t_stat, p_value = stats.ttest_rel(f1_fine_tuning, f1_target_only, alternative='greater')
    test_name = "Paired t-test"
else:
    stat, p_value = stats.wilcoxon(differences, alternative='greater')
    test_name = "Wilcoxon signed-rank test"

print(f"{test_name} p-value: {p_value:.8f}")

# Interpret at the conventional 5% level.
alpha = 0.05
if p_value < alpha:
    print("target_only_fine_tuning is significantly better than target_only.")
else:
    print("No significant difference detected.")

In [None]:
# LaTeX macros with the headline numbers (all target data, 64 base channels).
#
# Only fine-tuned runs are aggregated: every such row already carries the score
# of its base model in 'base_test_f1'. Keeping the 'base' rows as well would
# average base scores into the "fine-tuned" columns and pull every improvement
# towards zero (a base row's improvement over itself is 0).
df_plot = df.copy()
df_plot = df_plot[df_plot['mode'] == 'target_only_fine_tuning']
df_plot = df_plot[df_plot['target_data_pct'] == 1.0]
df_plot = df_plot[df_plot['base_channels'] == 64]

# Average the runs of each fold first, so every fold counts once in the statistics below.
df_grouped = df_plot.groupby('fold')[['base_test_f1','test_f1','absolute_improvement','relative_improvement','relative_to_room_for_improvement','test_precision','test_recall','precision_improvement','recall_improvement']].mean()

# Counted on the full frame, i.e. independent of the filters above.
n_participants = df['fold'].nunique()

# Across-fold mean / std / min / max of the per-fold averages.
base_test_f1_mean = df_grouped['base_test_f1'].mean()
base_test_f1_std = df_grouped['base_test_f1'].std()
base_test_f1_min = df_grouped['base_test_f1'].min()
base_test_f1_max = df_grouped['base_test_f1'].max()
finetune_test_f1_mean = df_grouped['test_f1'].mean()
finetune_test_f1_std = df_grouped['test_f1'].std()
finetune_test_f1_min = df_grouped['test_f1'].min()
finetune_test_f1_max = df_grouped['test_f1'].max()
absolute_improvement_mean = df_grouped['absolute_improvement'].mean()
absolute_improvement_std = df_grouped['absolute_improvement'].std()
absolute_improvement_min = df_grouped['absolute_improvement'].min()
absolute_improvement_max = df_grouped['absolute_improvement'].max()
# Ratios are converted to percentages.
relative_improvement_mean = df_grouped['relative_improvement'].mean()*100
relative_improvement_std = df_grouped['relative_improvement'].std()*100
relative_improvement_min = df_grouped['relative_improvement'].min()*100
relative_improvement_max = df_grouped['relative_improvement'].max()*100
relative_room_for_improvement_mean = df_grouped['relative_to_room_for_improvement'].mean()*100
relative_room_for_improvement_std = df_grouped['relative_to_room_for_improvement'].std()*100
relative_room_for_improvement_min = df_grouped['relative_to_room_for_improvement'].min()*100
relative_room_for_improvement_max = df_grouped['relative_to_room_for_improvement'].max()*100
precision_mean = df_grouped['test_precision'].mean()
precision_std = df_grouped['test_precision'].std()
precision_improvement_mean = df_grouped['precision_improvement'].mean()
recall_mean = df_grouped['test_recall'].mean()
recall_std = df_grouped['test_recall'].std()
recall_improvement_mean = df_grouped['recall_improvement'].mean()

# Raw f-string: doubled braces are literal LaTeX braces, single braces interpolate.
latex_code = rf"""
\newcommand{{\nParticipants}}{{{n_participants}}}
\newcommand{{\baseFoneMean}}{{{base_test_f1_mean:.3f}}}
\newcommand{{\baseFoneStd}}{{{base_test_f1_std:.3f}}}
\newcommand{{\baseFoneMax}}{{{base_test_f1_max:.3f}}}
\newcommand{{\baseFoneMin}}{{{base_test_f1_min:.3f}}}
\newcommand{{\finetuneFoneMean}}{{{finetune_test_f1_mean:.3f}}}
\newcommand{{\finetuneFoneStd}}{{{finetune_test_f1_std:.3f}}}
\newcommand{{\finetuneFoneMax}}{{{finetune_test_f1_max:.3f}}}
\newcommand{{\finetuneFoneMin}}{{{finetune_test_f1_min:.3f}}}
\newcommand{{\absoluteImprovementMean}}{{{absolute_improvement_mean:.3f}}}
\newcommand{{\absoluteImprovementStd}}{{{absolute_improvement_std:.3f}}}
\newcommand{{\absoluteImprovementMax}}{{{absolute_improvement_max:.3f}}}
\newcommand{{\absoluteImprovementMin}}{{{absolute_improvement_min:.3f}}}
\newcommand{{\relativeImprovementMean}}{{{relative_improvement_mean:.1f}}}
\newcommand{{\relativeImprovementStd}}{{{relative_improvement_std:.1f}}}
\newcommand{{\relativeImprovementMax}}{{{relative_improvement_max:.1f}}}
\newcommand{{\relativeImprovementMin}}{{{relative_improvement_min:.1f}}}
\newcommand{{\relativeRoomForImprovementMean}}{{{relative_room_for_improvement_mean:.1f}}}
\newcommand{{\relativeRoomForImprovementStd}}{{{relative_room_for_improvement_std:.1f}}}
\newcommand{{\relativeRoomForImprovementMax}}{{{relative_room_for_improvement_max:.1f}}}
\newcommand{{\relativeRoomForImprovementMin}}{{{relative_room_for_improvement_min:.1f}}}
\newcommand{{\precisionMean}}{{{precision_mean:.3f}}}
\newcommand{{\precisionStd}}{{{precision_std:.3f}}}
\newcommand{{\precisionImprovementMean}}{{{precision_improvement_mean:.3f}}}
\newcommand{{\recallMean}}{{{recall_mean:.3f}}}
\newcommand{{\recallStd}}{{{recall_std:.3f}}}
\newcommand{{\recallImprovementMean}}{{{recall_improvement_mean:.3f}}}
"""
print(latex_code)

In [None]:
# Per-fold LaTeX results table (all target data, 64 base channels).
#
# Only fine-tuned runs are aggregated: every such row already carries the score
# of its base model in 'base_test_f1'. Keeping the 'base' rows as well would
# average base scores into the "Fine-tuned F1" column and pull every
# improvement towards zero (a base row's improvement over itself is 0).
df_plot = df.copy()
df_plot = df_plot[df_plot['mode'] == 'target_only_fine_tuning']
df_plot = df_plot[df_plot['target_data_pct'] == 1.0]
df_plot = df_plot[df_plot['base_channels'] == 64]

# One row per fold: the mean over that fold's runs.
df_grouped = df_plot.groupby('fold')[['base_test_f1','test_f1','absolute_improvement','relative_improvement','relative_to_room_for_improvement','test_precision','test_recall','precision_improvement','recall_improvement']].mean()

# Columns of the summary rows, in table order.
summary_columns = ['base_test_f1', 'test_f1', 'absolute_improvement', 'relative_improvement',
                   'test_precision', 'test_recall', 'precision_improvement', 'recall_improvement']


def _summary_row(label, values):
    """Format one bold summary row (e.g. Mean or Std) from per-column ``values``."""
    cells = []
    for column in summary_columns:
        if column == 'relative_improvement':
            # Ratio shown as a percentage with one decimal.
            cells.append(f"{values[column] * 100:.1f}")
        else:
            cells.append(f"{values[column]:.3f}")
    bold_cells = " & ".join(rf"\textbf{{{cell}}}" for cell in cells)
    return rf"\textbf{{{label}}} & " + bold_cells + " \\\\\n"


# Table preamble and header (raw f-string: doubled braces are literal LaTeX braces).
latex_code = rf"""\begin{{table}}[h]
\centering
\caption{{Our table}}
\label{{tab:performance}}
\small
\begin{{tabular}}{{lcccccccc}}
\hline
\textbf{{Fold}} & \textbf{{Base F1}} & \textbf{{Fine-tuned F1}} & \textbf{{$\Delta$F1}} & \textbf{{Rel. Imp. (\%)}} & \textbf{{Precision}} & \textbf{{Recall}} & \textbf{{$\Delta$Prec}} & \textbf{{$\Delta$Rec}} \\
\hline
"""

# One table line per fold.
for row in df_grouped.itertuples():
    fold = row.Index
    base_f1 = row.base_test_f1
    finetuned_f1 = row.test_f1
    delta_f1 = finetuned_f1 - base_f1
    rel_imp = row.relative_improvement * 100
    precision = row.test_precision
    recall = row.test_recall
    precision_imp = row.precision_improvement
    recall_imp = row.recall_improvement
    latex_code += f"""{fold} & {base_f1:.2f} & {finetuned_f1:.2f} & {delta_f1:+.2f} & {rel_imp:.1f} & {precision:.2f} & {recall:.2f} & {precision_imp:+.2f} & {recall_imp:+.2f} \\\\\n"""

# Across-fold mean and standard deviation, then the table footer.
latex_code += _summary_row('Mean', df_grouped[summary_columns].mean())
latex_code += _summary_row('Std', df_grouped[summary_columns].std())
latex_code += rf"""\hline
\end{{tabular}}
\end{{table}}"""

print(latex_code)


In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from lib.models import TestModel

# Pick the fine-tuned run with the largest absolute F1 improvement, extract its
# features on every participant's test set, and embed them in 2-D with t-SNE.
df_plot = df.copy()
df_plot = df_plot[df_plot['mode'] == 'target_only_fine_tuning']
# Non-inplace sort: avoids mutating a filtered view of the copied frame.
df_plot = df_plot.sort_values(by='absolute_improvement', ascending=False)
experiment = df_plot.iloc[0]['experiment']
run = df_plot.iloc[0]['run']

run_dir = f'{experiments_dir}/{experiment}/{run}'

# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(f'{run_dir}/hyperparameters.json') as hyperparameters_file:
    hyperparameters = json.load(hyperparameters_file)

best_model_path = f'{run_dir}/best_model.pt'
target_participant = hyperparameters['target_participant']
data_path = hyperparameters['data_path']
batch_size = hyperparameters['batch_size']
criterion = nn.BCEWithLogitsLoss()

# Rebuild the architecture from the stored hyperparameters and load the best
# weights onto the CPU.
# NOTE(review): return_features=True presumably makes the forward pass return
# features rather than predictions — confirm against TestModel.
model = TestModel(dropout=hyperparameters['dropout'],
                  use_dilation=hyperparameters['use_dilation'],
                  base_channels=hyperparameters['base_channels'],
                  num_blocks=hyperparameters['num_blocks'],
                  use_residual=hyperparameters['use_residual'],
                  return_features=True)
model.load_state_dict(torch.load(best_model_path, map_location='cpu'))
model.eval()

# One entry per distinct (participant, fold) combination.
df_participants = df[['target_participant','fold']].drop_duplicates()

features = []
labels = []
smoking_labels = []

with torch.no_grad():
    for idx, row in tqdm(df_participants.iterrows(), total=len(df_participants)):
        participant = row['target_participant']

        # The model lives on the CPU, so force the data there as well.
        X, y = torch.load(f'{data_path}/{participant}_test.pt', map_location='cpu')

        features.append(model(X))
        # NOTE(review): idx is the row index left over from drop_duplicates, so
        # participant labels are not consecutive — confirm this is intended.
        labels.append(torch.tensor([idx] * X.size(0)))
        smoking_labels.append(y)

# Stack the per-participant pieces and hand them to scikit-learn as numpy arrays.
features = torch.cat(features, dim=0).numpy()
labels = torch.cat(labels, dim=0).numpy()
smoking_labels = torch.cat(smoking_labels, dim=0).numpy()

# Fixed random_state keeps the embedding reproducible between runs.
tsne = TSNE(n_components=2, random_state=42, perplexity=30, max_iter=500)
features_2d = tsne.fit_transform(features)
df_features = pd.DataFrame(features_2d, columns=['TSNE1', 'TSNE2'])

In [None]:
# Fold 1 only, smallest model (8 base channels), base runs excluded:
# test F1 per training mode, split by target-data fraction.
selection = (df['mode'] != 'base') & (df['base_channels'] == 8) & (df['fold'] == 1)
df_plot = df.copy()[selection]

plt.figure(figsize=(7.2, 4.48), dpi=300)
sns.boxplot(x='mode', y='test_f1', hue='target_data_pct', data=df_plot)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.lines import Line2D

# Boxplot showing both the median (box line) and the mean (diamond marker).
# NOTE(review): the 'Category'/'Value' placeholder columns are not created by
# any visible cell, so the plot uses 'mode' and 'test_f1' — adjust if another
# pair of columns was intended.
category_column = 'mode'
value_column = 'test_f1'
mean_marker = {'marker': 'D', 'markeredgecolor': 'red', 'markersize': 10}

fig, ax = plt.subplots()
sns.boxplot(x=category_column, y=value_column, data=df, showmeans=True,
            meanprops=mean_marker, ax=ax)

# Explicit proxy handle so the legend entry really shows the mean marker;
# passing labels alone would attach 'Mean' to whichever artist comes first.
mean_handle = Line2D([], [], linestyle='none', label='Mean',
                     markerfacecolor=plt.rcParams['boxplot.meanprops.markerfacecolor'],
                     **mean_marker)
ax.legend(handles=[mean_handle], loc='upper right')

# Show plot
plt.show()

In [None]:
# Figure 5: t-SNE embedding of the test-set features, coloured by class (A)
# and by participant (B). Uses df_features, labels and smoking_labels produced
# by the feature-extraction cell.

# Set publication-ready style
plt.rcParams.update({
    'font.size': 12,
    'font.weight': 'bold',
    'axes.labelweight': 'bold',
    'axes.titleweight': 'bold',
    'figure.dpi': 300,
    'axes.linewidth': 1.5,
    'axes.spines.top': False,
    'axes.spines.right': False
})

# Participant ids become strings so seaborn treats them as categories.
df_features['Participant'] = labels
df_features['Participant'] = df_features['Participant'].astype(str)

# Numeric class labels -> readable names.
df_features['smoking_label'] = smoking_labels
df_features['smoking_label'] = df_features['smoking_label'].map({0: 'Non-Smoking', 1: 'Smoking'})

fig, ax = plt.subplots(ncols=2, nrows=1, figsize=(14, 8), dpi=300)

# Define publication-ready colors
smoking_colors = {'Non-Smoking': '#2E86AB', 'Smoking': '#A23B72'}  # Blue and maroon
participant_colors = plt.cm.tab20  # Use tab20 colormap for participants

# Plot A: Colored by smoking label with custom colors
sns.scatterplot(data=df_features, x='TSNE1', y='TSNE2', hue='smoking_label',
                palette=smoking_colors, s=15, alpha=0.8, ax=ax[0],
                edgecolor='black', linewidth=0.1)
ax[0].set_title('(A) t-SNE Colored by Smoking Label', fontsize=16, fontweight='bold', pad=20)

# Plot B: Colored by participant
sns.scatterplot(data=df_features, x='TSNE1', y='TSNE2', hue='Participant',
                palette='tab20', s=15, alpha=0.8, ax=ax[1],
                edgecolor='black', linewidth=0.1)
ax[1].set_title('(B) t-SNE Colored by Participant', fontsize=16, fontweight='bold', pad=20)

# Customize legend for smoking labels
handles_0, labels_0 = ax[0].get_legend_handles_labels()
ax[0].legend(handles_0, labels_0, title='Class', title_fontsize=12,
             fontsize=11, frameon=True, fancybox=False, edgecolor='black',
             loc='upper right')

# Compact participant legend placed outside the axes, with a 'P' prefix on each id.
handles_1, labels_1 = ax[1].get_legend_handles_labels()
ax[1].legend(handles_1, [f'P{i}' for i in labels_1], title='Participant',
             title_fontsize=12, fontsize=10, frameon=True, fancybox=False,
             edgecolor='black', loc='center left', bbox_to_anchor=(1, 0.5),
             ncol=1)

for axis in ax:
    axis.set_xlabel('t-SNE Dimension 1', fontsize=14, fontweight='bold')
    axis.set_ylabel('t-SNE Dimension 2', fontsize=14, fontweight='bold')
    # Add grid for better readability
    axis.grid(True, alpha=0.3, linestyle='--')
    axis.set_axisbelow(True)
    # Remove top and right spines for cleaner look
    axis.spines['top'].set_visible(False)
    axis.spines['right'].set_visible(False)

# Add main title
plt.suptitle('Feature Space Visualization of Test Set Data',
             fontsize=18, fontweight='bold', y=0.95)

# Adjust layout for publication
plt.tight_layout()

# Save as high-quality figure for publication
plt.savefig('figure5.pdf', dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
plt.savefig('figure5.png', dpi=300, bbox_inches='tight',
            facecolor='white', edgecolor='none')
plt.show()

# Summary statistics for the figure caption.
n_participants = len(df_features['Participant'].unique())
n_smoking_samples = len(df_features[df_features['smoking_label'] == 'Smoking'])
n_nonsmoking_samples = len(df_features[df_features['smoking_label'] == 'Non-Smoking'])
total_samples = len(df_features)

print(f'Participants: {n_participants}')
print(f'Smoking samples: {n_smoking_samples}')
print(f'Non-smoking samples: {n_nonsmoking_samples}')
print(f'Total samples: {total_samples}')