In [7]:
import pickle
import numpy as np
import os
import plotly.graph_objects as go

def read_and_average_test_f1(root_folder, run_folders, filename_template, file_indices):
    """
    Load one results pickle per run and average the test micro-F1 scores across runs.

    The i-th entry of ``run_folders`` is paired with the i-th entry of
    ``file_indices``. For each pair the file
    ``root_folder/<run_folder>/<filename_template.format(file_index)>`` is
    unpickled and its ``"test_f1_scores_micro"`` entry is collected.

    Args:
        root_folder: Directory containing the per-run sub-folders.
        run_folders: Names of the per-run sub-folders, one per run.
        filename_template: ``str.format`` template that receives the file index.
        file_indices: Values substituted into the template, one per run.

    Returns:
        The mean over runs (``axis=0``) of the collected score entries, as
        computed by ``numpy.mean``.

    Raises:
        ValueError: If no runs are given, if ``run_folders`` and
            ``file_indices`` differ in length, or if the collected score
            entries do not all have the same shape.
        OSError: If a pickle file cannot be opened.
        KeyError: If a pickle lacks the ``"test_f1_scores_micro"`` key.
    """
    run_folders = list(run_folders)
    file_indices = list(file_indices)

    # zip() would silently drop the unmatched tail, averaging over fewer runs
    # than the caller asked for, so reject mismatched inputs up front.
    if len(run_folders) != len(file_indices):
        raise ValueError(
            f"run_folders has {len(run_folders)} entries but "
            f"file_indices has {len(file_indices)}"
        )
    if not run_folders:
        raise ValueError("at least one run is required to compute an average")

    performance_test_data_all = []

    for run_folder, file_index in zip(run_folders, file_indices):
        pickle_filename = filename_template.format(file_index)
        pickle_path = os.path.join(root_folder, run_folder, pickle_filename)

        print(f"Loading pickle file: {pickle_path}")
        # NOTE: unpickling can execute arbitrary code; only load result files
        # that come from a trusted source.
        with open(pickle_path, 'rb') as pickle_file:
            results = pickle.load(pickle_file)
        performance_test_data_all.append(results["test_f1_scores_micro"])

    # Averaging only makes sense when every run produced scores of the same
    # shape; report a mismatch clearly instead of failing inside NumPy.
    shapes = {np.shape(scores) for scores in performance_test_data_all}
    if len(shapes) > 1:
        raise ValueError(
            f"runs have differing test F1 score shapes: {sorted(shapes)}"
        )

    return np.mean(performance_test_data_all, axis=0)

def plot_all_test_performance(no_of_samples, performances, strategies, title, x_axes, y_axes):
    """
    Plot the test F1 curves of several strategies on one figure and show it.

    Each strategy's curve is drawn against ``no_of_samples`` on a logarithmic
    x axis. A dashed vertical guide line with a small text label is drawn at
    every sample count. The legend entry of each curve includes its last
    value.

    Args:
        no_of_samples: x coordinates shared by all curves.
        performances: One sequence of scores per strategy, in the same order
            as ``strategies``.
        strategies: Strategy names used for the legend. The name
            ``'random_sampling'`` is always drawn in red; every other
            strategy gets a shade of green.
        title: Figure title (converted with ``str``).
        x_axes: x-axis title (converted with ``str``).
        y_axes: y-axis title (converted with ``str``).

    Returns:
        None. The figure is displayed via ``fig.show()``.
    """
    fig = go.Figure()
    fig.update_layout(plot_bgcolor='rgb(209, 217, 222)')

    # Dashed guide line at every sample count, with the count printed just
    # above the top of the line.
    for sample_count in no_of_samples:
        fig.add_trace(go.Scatter(
            x=[sample_count, sample_count],
            y=[0, 1.12],
            mode='lines',
            line=dict(dash='dash', color='rgb(149, 162, 171)', width=1),
            showlegend=False
        ))

        fig.add_trace(go.Scatter(
            x=[sample_count],
            y=[1.13],
            mode='text',
            marker=dict(size=0),
            text=[str(sample_count)],
            textposition='top center',
            showlegend=False,
            textfont=dict(family='Arial', color='black', size=8.5)
        ))

    # Random sampling is the baseline and is always red. The colour is chosen
    # by strategy name rather than by list position, so it stays correct for
    # any ordering or number of strategies.
    random_sampling_color = 'rgb(255, 0, 0)'
    green_shades = [
        'rgb(97, 192, 134)',
        'rgb(45, 140, 58)',
        'rgb(151, 193, 57)',
    ]

    greens_used = 0
    for performance, strategy in zip(performances, strategies):
        if strategy == 'random_sampling':
            color = random_sampling_color
        else:
            # Cycle through the greens if there are more strategies than shades.
            color = green_shades[greens_used % len(green_shades)]
            greens_used += 1

        # An empty curve has no final value to report in the legend.
        if len(performance):
            trace_name = f"{strategy} (final: {performance[-1]:.3f})"
        else:
            trace_name = f"{strategy}"

        fig.add_trace(go.Scatter(
            x=no_of_samples,
            y=performance,
            name=trace_name,
            marker=dict(color=color),
            textfont=dict(family="Arial", size=11),
            line=dict(width=2)
        ))

    fig.update_layout(
        title_text=str(title),
        legend=dict(font=dict(color='black')),
        title_font_color='black'
    )

    # Sample counts are plotted on a logarithmic x axis.
    fig.update_xaxes(
        title_text=str(x_axes),
        title_font=dict(color="black", family="Arial"),
        tickfont_color='black',
        type="log"
    )

    fig.update_yaxes(
        title_text=str(y_axes),
        title_font=dict(color="black", family="Arial"),
        tickfont_color='black'
    )

    fig.show()

# Strategy identifiers; they key both lookup tables below and label the plot.
strategies = ['entropy_sampling', 'margin_sampling', 'uncertainty_sampling', 'random_sampling']

# Result directory of each strategy (the per-run sub-folders live inside).
root_folders = dict(
    entropy_sampling="/home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/ES",
    margin_sampling="/home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/MS",
    uncertainty_sampling="/home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/US",
    random_sampling="/home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/RS",
)

# Pickle filename pattern of each strategy; "{}" receives the file index.
filename_templates = dict(
    entropy_sampling="AL_entropy_sampling_results_for_multiclass_classification_s{}.pickle",
    margin_sampling="AL_margin_sampling_results_for_multiclass_classification_s{}.pickle",
    uncertainty_sampling="AL_uncertainty_sampling_results_for_multiclass_classification_s{}.pickle",
    random_sampling="random_sampling_results_for_multiclass_classification_s{}.pickle",
)

# Ten runs (run1 .. run10), listed alongside file indices 42 .. 51.
run_folders = ["run{}".format(run_number) for run_number in range(1, 11)]
file_indices = [*range(42, 52)]

# x coordinates of the learning curves.
no_of_samples = [8, 24, 56, 112, 212, 388, 704, 1264, 2264, 4048, 7200, 12824, 22824, 26880]

# One run-averaged test F1 curve per strategy, in the order of `strategies`.
performances = [
    read_and_average_test_f1(
        root_folders[name],
        run_folders,
        filename_templates[name],
        file_indices,
    )
    for name in strategies
]

# Draw every averaged curve on a single figure.
plot_all_test_performance(
    no_of_samples,
    performances,
    strategies,
    title="Average Test F1 Scores for Different Strategies",
    x_axes="Amount of Training Samples",
    y_axes="Test F1 Score",
)


Loading pickle file: /home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/ES/run1/AL_entropy_sampling_results_for_multiclass_classification_s42.pickle
Loading pickle file: /home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/ES/run2/AL_entropy_sampling_results_for_multiclass_classification_s43.pickle
Loading pickle file: /home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/ES/run3/AL_entropy_sampling_results_for_multiclass_classification_s44.pickle


Loading pickle file: /home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/ES/run4/AL_entropy_sampling_results_for_multiclass_classification_s45.pickle
Loading pickle file: /home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/ES/run5/AL_entropy_sampling_results_for_multiclass_classification_s46.pickle
Loading pickle file: /home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/ES/run6/AL_entropy_sampling_results_for_multiclass_classification_s47.pickle
Loading pickle file: /home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/ES/run7/AL_entropy_sampling_results_for_multiclass_classification_s48.pickle
Loading pickle file: /home/woody/iwfa/iwfa044h/CleanLab_Test/ActiveLearningApproaches/EOD/DInoS/multiclass/AL/ES/run8/AL_entropy_sampling_results_for_multiclass_classification_s49.pickle
Loading pickle file: /home/woody/iwfa/iwfa044h/CleanLab_Test/Acti

In [4]:
from scipy import stats

def perform_t_tests(performances, strategies):
    """
    Run Welch's t-tests between strategies and print the outcome of each.

    First, the scores of ``'random_sampling'`` are compared against every
    other strategy; then every pair of non-random strategies is compared.
    For each comparison the t-statistic, the p-value and a verdict at the
    0.05 level are printed. Nothing is returned.

    ``performances[i]`` must hold the scores belonging to ``strategies[i]``,
    and ``strategies`` must contain ``'random_sampling'``.

    NOTE(review): every score sequence is treated as a set of independent
    samples. If the sequences are learning curves measured at matching
    sample counts, a paired test may be more appropriate -- confirm.
    """
    baseline_name = 'random_sampling'
    baseline_scores = performances[strategies.index(baseline_name)]

    outcomes = []

    # Baseline versus each remaining strategy.
    for name, scores in zip(strategies, performances):
        if name != baseline_name:
            statistic, p = stats.ttest_ind(baseline_scores, scores, equal_var=False)
            outcomes.append((f"Random Sampling vs {name}", statistic, p))

    # Every unordered pair of non-baseline strategies, in list order.
    active_positions = [
        position for position, name in enumerate(strategies) if name != baseline_name
    ]
    for offset, first in enumerate(active_positions):
        for second in active_positions[offset + 1:]:
            first_scores, second_scores = performances[first], performances[second]
            statistic, p = stats.ttest_ind(first_scores, second_scores, equal_var=False)
            outcomes.append((f"{strategies[first]} vs {strategies[second]}", statistic, p))

    # Report all comparisons once every test has been computed.
    for label, statistic, p in outcomes:
        verdict = (
            "Statistically significant difference (p < 0.05)\n"
            if p < 0.05
            else "No significant difference (p >= 0.05)\n"
        )
        print(f"T-test between {label}:")
        print(f"T-statistic: {statistic:.3f}, P-value: {p:.3f}")
        print(verdict)

# Print Welch's t-test results for the averaged curves of all strategies.
perform_t_tests(performances=performances, strategies=strategies)


T-test between Random Sampling vs entropy_sampling:
T-statistic: 1.029, P-value: 0.313
No significant difference (p >= 0.05)

T-test between Random Sampling vs margin_sampling:
T-statistic: -0.048, P-value: 0.962
No significant difference (p >= 0.05)

T-test between Random Sampling vs uncertainty_sampling:
T-statistic: 1.015, P-value: 0.320
No significant difference (p >= 0.05)

T-test between entropy_sampling vs margin_sampling:
T-statistic: -1.053, P-value: 0.302
No significant difference (p >= 0.05)

T-test between entropy_sampling vs uncertainty_sampling:
T-statistic: 0.018, P-value: 0.986
No significant difference (p >= 0.05)

T-test between margin_sampling vs uncertainty_sampling:
T-statistic: 1.039, P-value: 0.309
No significant difference (p >= 0.05)

