In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from plotly.subplots import make_subplots
import plotly.subplots as sp
import math
from scipy.stats import norm
import os
import pingouin as pg

In [2]:
# !pip install pingouin

In [3]:
### Helper functions ###

def list_files_in_directory(directory_path, extension='.csv'):
    """
    Returns a list of full paths for all files in the given directory path 
    with a specific extension and forward slashes.

    Args:
        directory_path (str): The path of the directory to list files from.
        extension (str): The file extension to filter for (e.g., '.txt').

    Returns:
        list: A list of full file paths in the directory with the given extension and forward slashes.
    """
    # Ensure the directory path itself has forward slashes
    directory_path = directory_path.replace('\\', '/')
    
    try:
        # List all files with the given extension
        files = [
            os.path.join(directory_path, f).replace('\\', '/') 
            for f in os.listdir(directory_path) 
            if os.path.isfile(os.path.join(directory_path, f)) and f.endswith(extension)
        ]
        return files
    except FileNotFoundError:
        print("The directory does not exist.")
        return []
    

def create_file_dict(file_paths):
    """
    Creates a dictionary where keys are derived from characters before the second underscore in the file name.
    
    Args:
        file_paths (list): List of file paths.

    Returns:
        dict: Dictionary with keys based on file names and paths as values.
    """
    file_dict = {}
    for file_path in file_paths:
        # Extract the file name without the directory path
        file_name = os.path.basename(file_path)
        
        # Split by underscores and join the first two parts as the key
        key = '_'.join(file_name.split('_')[:2])
        
        # Add to dictionary
        file_dict[key] = file_path
        
    return file_dict


def concatenate_and_save_datasets(df1, df2, save_path): 
    """
    This function concatenates two datasets and saves the result as a CSV file.

    Parameters:
    - df1, df2: The datasets to concatenate.
    - save_path: The file path to save the resulting CSV.
    """
    # Concatenating the datasets
    concatenated_df = pd.concat([df1, df2], ignore_index=True)

    # Saving the concatenated DataFrame as a CSV file
    concatenated_df.to_csv(save_path, index=False)

    return concatenated_df


def find_best_lr_for_datasets(datasets_dict):
    """
    Given a dictionary of datasets, where keys are dataset names and values are DataFrames,
    this function returns a dictionary where each key is a dataset name, and the value is
    the best learning rate (lr) based on the lowest mean test loss.

    Parameters:
    - datasets_dict: dictionary of DataFrames with dataset names as keys.

    Returns:
    - Dictionary with dataset names as keys and best lr values as values.
    """
    best_lrs = {}

    for ds_name, dataset in datasets_dict.items():
        # Group by 'lr' and calculate the mean and standard deviation of 'test_loss'
        lr_summary = dataset.groupby('lr')['test_loss'].agg(['mean', 'std']).reset_index()
        lr_summary = lr_summary.rename(columns={'mean': 'loss_mean', 'std': 'loss_std'})

        # Identify the learning rate with the minimum mean test loss
        best_lr = lr_summary.loc[lr_summary['loss_mean'].idxmin(), 'lr']

        # Store in the dictionary
        best_lrs[ds_name] = best_lr

    return best_lrs

In [5]:
# Example usage
# data_folder = r'C:\Users\canel\Downloads\results'
data_folder = 'C:\\Users\\Amir Hossein\\Downloads\\results for a long run'
csv_list = list_files_in_directory(data_folder)
file_dict = create_file_dict(csv_list)
print(file_dict)

{'dna_dropout': 'C:/Users/Amir Hossein/Downloads/results for a long run/dna_dropout_results.csv', 'dna_l2': 'C:/Users/Amir Hossein/Downloads/results for a long run/dna_l2_results.csv'}


In [5]:
file_dict = {
    'dna_dropout': 'C:/Users/canel/Downloads/with learning rate results in csv/dna_dropout_results.csv', 
    # 'dna_l2': 'C:/Users/canel/Downloads/with learning rate results in csv/dna_l2_results.csv', 
    # 'protein_dropout': 'C:/Users/canel/Downloads/with learning rate results in csv/protein_dropout_results.csv', 
    # 'protein_l2': 'C:/Users/canel/Downloads/with learning rate results in csv/protein_l2_results.csv', 
    # 'splice_dropout': 'C:/Users/canel/Downloads/with learning rate results in csv/splice_dropout_results.csv', 
    # 'splice_l2': 'C:/Users/canel/Downloads/with learning rate results in csv/splice_l2_results.csv', 
    # 'twomoons_dropout': 'C:/Users/canel/Downloads/with learning rate results in csv/twomoons_dropout_results.csv', 
    # 'twomoons_l2': 'C:/Users/canel/Downloads/with learning rate results in csv/twomoons_l2_results.csv'
    }

In [6]:
datasets_dict = {}

for ds_name, ds_path in file_dict.items():
    datasets_dict[ds_name] = pd.read_csv(ds_path)

    if 'loss' in datasets_dict[ds_name].columns:
        datasets_dict[ds_name].rename(columns={'loss': 'test_loss'}, inplace=True)

    print(f'Dataset in "{ds_path}" read.')

Dataset in "C:/Users/Amir Hossein/Downloads/results for a long run/dna_dropout_results.csv" read.
Dataset in "C:/Users/Amir Hossein/Downloads/results for a long run/dna_l2_results.csv" read.


In [7]:
def plot_best_reg_by_data_size(df, x_range=None):
    """
    Plots the regularization value that produced the best loss for each data size
    at each specified learning rate.

    Parameters:
    df (pd.DataFrame): The dataset containing the experiment results.
    learning_rates (list of float): A list of learning rates to filter the data.
    x_range (tuple, optional): A tuple specifying the range for the x-axis, in the form (min, max).

    Returns:
    None: Displays a subplot of scatter plots.
    """
    
    df_name = df['dataset_name'].unique()[0]
    reg_type = df['reg_type'].unique()[0]
    learning_rates = list(df['lr'].unique())
    
    # Determine number of rows required for a 2-column layout
    num_plots = len(learning_rates)
    num_rows = (num_plots + 1) // 2
    
    # Create subplot figure
    fig = sp.make_subplots(rows=num_rows, cols=2, subplot_titles=[f'Learning Rate = {lr}' for lr in learning_rates])
    
    # Generate a scatter plot for each learning rate
    for i, learning_rate in enumerate(learning_rates):
        row = (i // 2) + 1
        col = (i % 2) + 1

        # Filter dataset by the current learning rate
        filtered_df = df[df['lr'] == learning_rate]
        
        # Find the regularization value that produced the best loss for each data_size_pct
        best_reg_values = filtered_df.loc[filtered_df.groupby('data_size_pct')['test_loss'].idxmin()]

        # Create scatter plot
        scatter = px.scatter(
            best_reg_values,
            x='data_size_pct',
            y='reg_val',
            labels={'data_size_pct': 'Data Size (%)', 'reg_val': 'Best Regularization Value'},
            template='plotly_white'
        )

        # Add the scatter plot to the subplot
        for trace in scatter['data']:
            fig.add_trace(trace, row=row, col=col)

        # Add axis titles for each subplot and apply x-axis range if provided
        fig.update_xaxes(title_text="Data Size (%)", range=x_range, row=row, col=col)
        fig.update_yaxes(title_text="Best Regularization Value", row=row, col=col)

    # Update layout
    fig.update_layout(height=num_rows * 300, width=1000, title_text=f"{df_name.upper()} {reg_type.upper()}. Best Regularization Value by Data Size")
    
    fig.show()

    return fig

# Example usage
# Assuming 'df' is your DataFrame and 'lr_list' is your list of learning rates
# plot_best_reg_by_data_size(df, lr_list, x_range=(0, 100))


In [8]:
def plot_3d_surface_with_avg(dataset, x_col, y_col, z_col, log_x=False, log_y=False, log_z=False, z_min=None, z_max=None):
    """
    Plots a 3D surface with average values of z_col grouped by x_col and y_col.
    Filters data by z_col range if specified.

    Parameters:
    dataset (pd.DataFrame): The dataset containing the data.
    x_col (str): The column name for the x-axis.
    y_col (str): The column name for the y-axis.
    z_col (str): The column name for the z-axis.
    log_x (bool): Set x-axis to logarithmic scale if True.
    log_y (bool): Set y-axis to logarithmic scale if True.
    log_z (bool): Set z-axis to logarithmic scale if True.
    z_min (float, optional): Minimum value for filtering the z-axis data.
    z_max (float, optional): Maximum value for filtering the z-axis data.

    Returns:
    fig: Plotly Figure object.
    """
    df_name = dataset['dataset_name'].unique()[0]
    reg_type = dataset['reg_type'].unique()[0]

    # Filter data based on z_min and z_max if provided
    if z_min is not None:
        dataset = dataset[dataset[z_col] >= z_min]
    if z_max is not None:
        dataset = dataset[dataset[z_col] <= z_max]

    # Group by x and y and compute the mean of the z column
    df_grouped = dataset.groupby([x_col, y_col])[z_col].mean().reset_index()

    # Pivot the grouped data to get the correct format for the surface plot
    df_pivot = df_grouped.pivot(index=y_col, columns=x_col, values=z_col)

    # Create x, y, z values for the surface plot
    x_vals = df_pivot.columns
    y_vals = df_pivot.index
    z_vals = df_pivot.values

    # Create a 3D surface plot with opacity set to 0.6 and contours on the z-axis
    surface = go.Surface(
        z=z_vals, 
        x=x_vals, 
        y=y_vals, 
        opacity=0.9, 
        colorscale='Jet',
        contours={
            "z": {
                "show": True,
                "color": "black",
                "width": 2
            }
        }
    )

    # Add a 3D scatter plot with the same data points
    scatter_data = go.Scatter3d(
        x=df_grouped[x_col], 
        y=df_grouped[y_col], 
        z=df_grouped[z_col],
        mode='markers',
        marker=dict(size=5, color=df_grouped[z_col], colorscale='Jet', opacity=0.9)
    )

    # Identify the row with the minimum z_col value
    min_row = df_grouped.loc[df_grouped[z_col].idxmin()]

    # Create a separate scatter trace for the minimum loss point with a red marker
    min_scatter = go.Scatter3d(
        x=[min_row[x_col]],
        y=[min_row[y_col]],
        z=[min_row[z_col]],
        mode='markers',
        marker=dict(size=8, color='red')
    )

    # Create the figure and add the surface, scatter plot, and min scatter plot
    fig = go.Figure(data=[surface, scatter_data, min_scatter])

    # Update axis types to logarithmic if requested
    axis_settings = dict(
        xaxis=dict(title=x_col, type='log' if log_x else 'linear'),
        yaxis=dict(title=y_col, type='log' if log_y else 'linear'),
        zaxis=dict(title=z_col, type='log' if log_z else 'linear')
    )

    # Update the layout for better visualization
    fig.update_layout(
        width=800,
        height=600,
        scene=dict(
            aspectratio=dict(x=1.25, y=1, z=1),
            xaxis=axis_settings['xaxis'],
            yaxis=axis_settings['yaxis'],
            zaxis=axis_settings['zaxis']
        ),
        margin=dict(l=0.5, r=0.5, b=50, t=50),
        title=f'{df_name.upper()}. {reg_type.upper()}. {z_col}' 
    )

    # Show the plot
    fig.show()
    return fig

# Example usage with filtering on z-axis values
# plot_3d_surface_with_avg(dataset, 'x_col_name', 'y_col_name', 'z_col_name', z_min=0, z_max=100)

In [10]:
def plot_histograms_with_normal_fit(
    dataset,
    reg_val='all',
    data_size_pct='all',
    opacity=0.6,
    line_color='blue'
):
    """
    Plots histograms of 'loss' values and fits them to normal distribution curves.
    
    Parameters:
    - dataset (pd.DataFrame): DataFrame containing at least ['reg_val', 'data_size_pct', 'loss'] columns.
    - reg_val (float, str, or list, optional): Specific regularization value(s) to include or 'all'. Default is 'all'.
    - data_size_pct (float, str, or list, optional): Specific data size percentage(s) to include or 'all'. Default is 'all'.
    - opacity (float, optional): Opacity level for the histogram bars (0 to 1). Default is 0.6.
    - line_color (str, optional): Color for the histogram bars and normal distribution lines. Default is 'blue'.
    
    The function generates subplots using Plotly, with each subplot representing a histogram
    of 'loss' values for the specified (reg_val, data_size_pct) combinations. Each histogram
    is overlaid with a fitted normal distribution curve and annotated with the mean and standard deviation.
    """
    
    # Validate input DataFrame
    if not isinstance(dataset, pd.DataFrame):
        raise TypeError("dataset must be a pandas DataFrame.")
    
    required_columns = {'reg_val', 'data_size_pct', 'loss'}
    if not required_columns.issubset(dataset.columns):
        raise ValueError(f"dataset must contain columns: {required_columns}")
    
    # Handle 'all' or specific reg_val
    if reg_val == 'all':
        reg_vals = dataset['reg_val'].unique()
    elif isinstance(reg_val, list):
        reg_vals = reg_val
    else:
        reg_vals = [reg_val]
    
    # Handle 'all' or specific data_size_pct
    if data_size_pct == 'all':
        data_size_pcts = dataset['data_size_pct'].unique()
    elif isinstance(data_size_pct, list):
        data_size_pcts = data_size_pct
    else:
        data_size_pcts = [data_size_pct]
    
    # Generate all valid combinations
    combinations = []
    for rv in reg_vals:
        for dscp in data_size_pcts:
            if not dataset[(dataset['reg_val'] == rv) & (dataset['data_size_pct'] == dscp)].empty:
                combinations.append((rv, dscp))
    
    if not combinations:
        raise ValueError("No valid (reg_val, data_size_pct) combinations found for plotting.")
    
    # Determine subplot layout (n rows x 2 columns)
    n = len(combinations)
    n_cols = 2
    n_rows = (n + n_cols - 1) // n_cols  # Ceiling division
    
    # Create subplot titles
    subplot_titles = [f"reg_val={rv}, data_size_pct={dscp}" for rv, dscp in combinations]
    
    # Initialize subplots
    fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=subplot_titles)
    
    # Compute global x-axis range based on all data
    global_x_min = dataset['loss'].min()
    global_x_max = dataset['loss'].max()
    
    # Increase the x range by 10% of the range
    x_range = global_x_max - global_x_min
    x_padding = 0.10 * x_range
    global_x_min -= x_padding
    global_x_max += x_padding
    
    # Iterate over each combination and plot
    for idx, (rv, dscp) in enumerate(combinations):
        row = (idx) // n_cols + 1
        col = (idx) % n_cols + 1
        
        # Extract 'loss' data for the current combination
        subset = dataset[(dataset['reg_val'] == rv) & (dataset['data_size_pct'] == dscp)]['loss']
        
        if subset.empty:
            continue  # Skip if no data
        
        # Calculate statistics
        mean = subset.mean()
        stdv = subset.std()
        
        # Determine optimal number of bins using Freedman-Diaconis rule
        q25, q75 = np.percentile(subset, [25, 75])
        iqr = q75 - q25
        bin_width = 2 * iqr * len(subset) ** (-1/3)
        if bin_width > 0:
            bins = int(np.ceil((global_x_max - global_x_min) / bin_width))
            bins = min(100, bins)
        else:
            bins = 30  # Fallback to default if IQR is zero
            
        
        # Generate histogram
        hist = go.Histogram(
            x=subset,
            nbinsx=bins,
            name='Loss Histogram',
            marker_color=line_color,
            opacity=opacity,
            showlegend=False
        )
        
        fig.add_trace(hist, row=row, col=col)
        
        # Generate normal distribution line
        x = np.linspace(global_x_min, global_x_max, 500)
        y = norm.pdf(x, mean, stdv) * len(subset) * (global_x_max - global_x_min) / bins  # Scale to histogram
        norm_line = go.Scatter(
            x=x,
            y=y,
            mode='lines',
            line=dict(color='red', width=2),
            name='Normal Fit',
            showlegend=False
        )
        
        fig.add_trace(norm_line, row=row, col=col)
        
        # Calculate maximum count in histogram to position annotation
        hist_counts = subset.value_counts(bins=bins)
        y_max_hist = hist_counts.max()
        
        # Compute annotation position near the top right corner
        annotation_x = global_x_max - 0.05 * x_range
        annotation_y = y_max_hist * 1.05
        
        # Add annotation for mean and standard deviation in the top right corner
        annotation_text = f"Mean: {mean:.3f}<br>Std Dev: {stdv:.3f}"
        
        fig.add_annotation(
            x=annotation_x,
            y=annotation_y,
            xref=f"x{idx+1}",
            yref=f"y{idx+1}",
            text=annotation_text,
            showarrow=False,
            xanchor='right',
            yanchor='bottom',
            font=dict(color='black', size=12)
        )
        
        # Update y-axis for the current subplot
        fig.update_yaxes(title_text="Count", row=row, col=col)
    
    # Update x-axis range for all subplots to be the same
    for i in range(1, n + 1):
        row = (i - 1) // n_cols + 1
        col = (i - 1) % n_cols + 1
        fig.update_xaxes(range=[global_x_min, global_x_max], row=row, col=col)
    
    # Update layout for better appearance
    fig.update_layout(
        height=400 * n_rows,  # Adjust height based on number of rows
        width=900,             # Set a reasonable width
        showlegend=False,     # Hide legend
        title_text=f"{dataset_name.upper()}.Histograms of Loss with Normal Distribution Fits",
        margin=dict(t=80, b=50, l=50, r=50)  # Adjust margins
    )
    
    fig.show()
    return fig

In [12]:
def save_plots_to_html(plot_list, file_name):
    """
    Saves multiple Plotly plots into a single HTML file.

    Parameters:
    - plot_list: A list of Plotly Figure objects.
    - file_name: The name of the HTML file to save the plots.
    """
    import plotly.io as pio

    # Initialize an empty list to store the HTML content of each plot
    html_content = []

    # Loop through the list of plot objects and convert each to HTML
    for plot in plot_list:
        html_content.append(pio.to_html(plot, full_html=False))

    # Concatenate all plots into a single HTML string
    full_html = "<html><head></head><body>" + "".join(html_content) + "</body></html>"

    # Save the HTML content to the file
    with open(file_name, "w", encoding="utf8") as f:
        f.write(full_html)

    print(f"Plots saved to {file_name}")


In [13]:
def box_plot_metric(df, x_axis, y_axis, group_by, log_x=False, auto_scale=False, message=''):
    """
    Plots the distribution of loss as a box plot for each unique group_by value using Plotly subplots.
    Each subplot shows the box plot distribution for values of the x_axis.

    Parameters:
    - df: pandas DataFrame containing the data.
    - x_axis: column name to use for the x-axis ('data_size_pct' or 'reg_val').
    - group_by: column name to group the data by ('reg_val' or 'data_size_pct').
    - log_x: boolean indicating whether to use a log scale for the x-axis.
    - auto_scale: boolean indicating whether to automatically adjust plot scales.
    - same_y: boolean indicating whether all subplots should share the same y-axis range.
    - message: additional message to include in the plot title.

    Returns:
    - Plotly Figure object.
    """
    if auto_scale == False:
        same_y = True
    else:
        same_y = False

    df_name = df['dataset_name'].unique()[0]
    reg_type = df['reg_type'].unique()[0]

    # Validate inputs
    if x_axis not in df.columns:
        raise ValueError(f"x_axis '{x_axis}' not found in DataFrame columns.")
    if group_by not in df.columns:
        raise ValueError(f"group_by '{group_by}' not found in DataFrame columns.")
    if x_axis == group_by:
        raise ValueError("x_axis and group_by must be different columns.")

    # Make a copy of the DataFrame to avoid modifying the original data
    df_plot = df.copy()

    # Adjust x_axis to be positive by shifting if log_x is True
    shift_value = 0
    if log_x:
        min_x = df_plot[x_axis].min()
        if min_x <= 0:
            shift_value = abs(min_x) + 1e-6  # Small constant to ensure positivity
            df_plot[x_axis] = df_plot[x_axis] + shift_value
            print(f"Shifted '{x_axis}' by {shift_value} to make all values positive for log scale.")

    # Get unique groups
    unique_groups = sorted(df_plot[group_by].unique())

    # Determine number of subplots (n rows x 2 columns)
    n = len(unique_groups)
    cols = 2
    rows = math.ceil(n / cols)

    # Create subplot titles
    subplot_titles = [f"{group_by} = {g}" for g in unique_groups]

    # Create subplots
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=subplot_titles)

    # If same_y is True, determine the global y-axis range
    if same_y:
        y_min = df_plot[y_axis].min()
        y_max = df_plot[y_axis].max()
        # Optionally, add some padding
        y_padding = (y_max - y_min) * 0.05
        y_range = [y_min - y_padding, y_max + y_padding]

    for i, group in enumerate(unique_groups):
        row = i // cols + 1
        col = i % cols + 1
        df_group = df_plot[df_plot[group_by] == group]

        fig.add_trace(
            go.Box(
                x=df_group[x_axis],
                y=df_group[y_axis],
                name=str(group),
                boxpoints='all',  # Show all points
                jitter=0,       # Jitter for visibility
                pointpos=0     # Offset for points
            ),
            row=row,
            col=col
        )

        # Set x-axis to log if needed
        fig.update_xaxes(type='log' if log_x else 'linear', row=row, col=col)

        # Set y-axis range if same_y is True
        if same_y:
            fig.update_yaxes(range=y_range, title_text=f"{y_axis}", row=row, col=col)
        else:
            fig.update_yaxes(title_text=f"{y_axis}", row=row, col=col)

        # Set x-axis title
        fig.update_xaxes(title_text=x_axis, row=row, col=col)

    # If there are empty subplots, remove their annotations
    total_subplots = rows * cols
    existing_annotations = len(fig.layout.annotations)
    if n < total_subplots:
        for i in range(n, total_subplots):
            if i < existing_annotations:
                fig.layout.annotations[i].text = ''

    fig.update_layout(
        height=400 * rows,
        width=1200,
        title_text=f"{message} {df_name.upper()} {reg_type.upper()}. {y_axis} Distribution vs {x_axis} grouped by {group_by}",
        showlegend=False
    )
    fig.show()
    return fig


In [14]:
def plot_mean_metric(df, x_axis, y_axis, group_by, log_x=False, auto_scale=False, show_error_bars=True, message=''):
    """
    Plots the mean loss as a function of x_axis for each unique group_by value using a single Plotly plot.
    Each marker in the scatter plot can be accompanied by an error bar representing the spread of the data.

    Parameters:
    - df: pandas DataFrame containing the data.
    - x_axis: column name to use for the x-axis ('data_size_pct' or 'reg_val').
    - group_by: column name to group the data by ('reg_val' or 'data_size_pct').
    - log_x: boolean indicating whether to use a log scale for the x-axis.
    - auto_scale: boolean indicating whether to automatically adjust plot scales.
    - show_error_bars: boolean indicating whether to display error bars representing the standard deviation.
    - message: additional message to include in the plot title.

    Returns:
    - Plotly Figure object.
    """
    # Extract dataset and regression type names
    df_name = df['dataset_name'].unique()[0]
    reg_type = df['reg_type'].unique()[0]

    # Validate inputs
    if x_axis not in df.columns:
        raise ValueError(f"x_axis '{x_axis}' not found in DataFrame columns.")
    if group_by not in df.columns:
        raise ValueError(f"group_by '{group_by}' not found in DataFrame columns.")
    if x_axis == group_by:
        raise ValueError("x_axis and group_by must be different columns.")

    # Make a copy of the DataFrame to avoid modifying the original data
    df_plot = df.copy()

    # Adjust x_axis to be positive by shifting if log_x is True
    shift_value = 0  # Initialize shift value
    if log_x:
        min_x = df_plot[x_axis].min()
        if min_x <= 0:
            shift_value = abs(min_x) + 1e-6  # Small constant to ensure positivity
            df_plot[x_axis] = df_plot[x_axis] + shift_value
            print(f"Shifted '{x_axis}' by {shift_value} to make all values positive for log scale.")

    # Compute mean and standard deviation of loss grouped by 'group_by' and 'x_axis'
    grouped = df_plot.groupby([group_by, x_axis])[y_axis].agg(['mean', 'std']).reset_index()
    grouped = grouped.rename(columns={'mean': f'{y_axis}_mean', 'std': f'{y_axis}_std'})

    # Get unique groups
    unique_groups = sorted(grouped[group_by].unique())

    # Create a single plot
    fig = go.Figure()

    for group in unique_groups:
        df_group = grouped[grouped[group_by] == group].sort_values(by=x_axis)
        
        # Prepare trace arguments
        trace_args = {
            'x': df_group[x_axis],
            'y': df_group[f'{y_axis}_mean'],
            'mode': 'lines+markers',
            'name': f"{group_by} = {group}",
        }

        # Conditionally add error bars
        if show_error_bars:
            trace_args['error_y'] = dict(
                type='data',
                array=df_group[f'{y_axis}_std'],
                visible=True
            )

        # Add the trace to the figure
        fig.add_trace(go.Scatter(**trace_args))

    # Set x-axis to log if needed
    fig.update_xaxes(type='log' if log_x else 'linear', title_text=x_axis)

    # Set y-axis title
    fig.update_yaxes(title_text=F"Mean {y_axis}")

    # Update layout
    fig.update_layout(
        height=600,
        width=1000,
        title_text=f"{message} {df_name.upper()} {reg_type.upper()}. Mean {y_axis} vs {x_axis} grouped by {group_by}",
        legend_title=group_by
    )

    fig.show()
    return fig


In [15]:
def subplot_mean_metric(df, x_axis, y_axis,  group_by, log_x=False, auto_scale=True, message=''):
    """
    Plots the mean loss as a function of x_axis for each unique group_by value using Plotly subplots.
    Each marker in the scatter plot is accompanied by an error bar representing the spread of the data.

    Parameters:
    - df: pandas DataFrame containing the data.
    - x_axis: column name to use for the x-axis ('data_size_pct' or 'reg_val').
    - group_by: column name to group the data by ('reg_val' or 'data_size_pct').
    - log_x: boolean indicating whether to use a log scale for the x-axis.
    - auto_scale: boolean indicating whether to automatically adjust plot scales.
    - same_y: boolean indicating whether all subplots should share the same y-axis range.
    - message: additional message to include in the plot title.

    Returns:
    - Plotly Figure object.
    """
    if auto_scale == True:
        same_y = False
    else:
        same_y = True

    # Extract dataset and regression type names
    df_name = df['dataset_name'].unique()[0]
    reg_type = df['reg_type'].unique()[0]

    # Validate inputs
    if x_axis not in df.columns:
        raise ValueError(f"x_axis '{x_axis}' not found in DataFrame columns.")
    if group_by not in df.columns:
        raise ValueError(f"group_by '{group_by}' not found in DataFrame columns.")
    if x_axis == group_by:
        raise ValueError("x_axis and group_by must be different columns.")

    # Make a copy of the DataFrame to avoid modifying the original data
    df_plot = df.copy()

    # Adjust x_axis to be positive by shifting if log_x is True
    shift_value = 0  # Initialize shift value
    if log_x:
        min_x = df_plot[x_axis].min()
        if min_x <= 0:
            shift_value = abs(min_x) + 1e-6  # Small constant to ensure positivity
            df_plot[x_axis] = df_plot[x_axis] + shift_value
            print(f"Shifted '{x_axis}' by {shift_value} to make all values positive for log scale.")

    # Compute mean and standard deviation of loss grouped by 'group_by' and 'x_axis'
    grouped = df_plot.groupby([group_by, x_axis])[y_axis].agg(['mean', 'std']).reset_index()
    grouped = grouped.rename(columns={'mean': f'{y_axis}_mean', 'std': f'{y_axis}_std'})

    # Get unique groups
    unique_groups = sorted(grouped[group_by].unique())

    # Determine number of subplots (n rows x 2 columns)
    n = len(unique_groups)
    cols = 2
    rows = math.ceil(n / cols)

    # Create subplot titles
    subplot_titles = [f"{group_by} = {g}" for g in unique_groups]

    # Create subplots
    fig = make_subplots(rows=rows, cols=cols, subplot_titles=subplot_titles)

    # If same_y is True, determine the global y-axis range
    if same_y:
        y_min = grouped[f'{y_axis}_mean'].min()
        y_max = grouped[f'{y_axis}_mean'].max()
        # Optionally, add some padding
        y_padding = (y_max - y_min) * 0.05 if y_max != y_min else 1
        y_range = [y_min - y_padding, y_max + y_padding]

    for i, group in enumerate(unique_groups):
        row = i // cols + 1
        col = i % cols + 1
        df_group = grouped[grouped[group_by] == group].sort_values(by=x_axis)

        fig.add_trace(
            go.Scatter(
                x=df_group[x_axis],
                y=df_group[f'{y_axis}_mean'],
                mode='lines+markers',
                name=str(group),
                error_y=dict(
                    type='data',
                    array=df_group[f'{y_axis}_std'],
                    visible=True
                )
            ),
            row=row,
            col=col
        )

        # Set x-axis to log if needed
        fig.update_xaxes(type='log' if log_x else 'linear', autorange=True, row=row, col=col)

        # Set y-axis range based on same_y parameter
        if same_y:
            fig.update_yaxes(range=y_range, title_text=f"Mean {y_axis}", row=row, col=col)
        else:
            fig.update_yaxes(autorange=True, title_text=f"Mean {y_axis}", row=row, col=col)

        # Adjust x-axis title if shifted
        if log_x and shift_value != 0:
            fig.update_xaxes(title_text=f"{x_axis}", row=row, col=col)
        else:
            fig.update_xaxes(title_text=x_axis, row=row, col=col)

    # If there are empty subplots, remove their annotations
    total_subplots = rows * cols
    existing_annotations = len(fig.layout.annotations)
    if n < total_subplots:
        for i in range(n, total_subplots):
            if i < existing_annotations:
                fig.layout.annotations[i].text = ''

    fig.update_layout(
        height=320 * rows,
        width=1000,
        title_text=f"{message} {df_name.upper()} {reg_type.upper()}. Mean {y_axis} vs {x_axis} grouped by {group_by}",
        showlegend=False
    )
    fig.show()
    return fig


In [16]:
best_lrs = find_best_lr_for_datasets(datasets_dict)
best_lrs


for dataset_name, dataset in datasets_dict.items():
    plots_list = []

    data_sizes = datasets_dict[dataset_name]['data_size_pct'].unique()

    best_lr = best_lrs[dataset_name]
    dataset2 = dataset[dataset['lr'] == best_lr]
    reg_vals = dataset['reg_val'].unique()[::2]
    dataset2 = dataset[dataset['reg_val'].isin(reg_vals)]




    if dataset['reg_type'].unique().item() == 'l2':
        reg_type = 'l2'
        log_scale = True


    else:
        reg_type = 'dropout'
        log_scale = False
        dataset2 = dataset

    plot1 = subplot_mean_metric(dataset2, x_axis='reg_val', y_axis='val_acc', group_by='data_size_pct', log_x=log_scale, auto_scale=True, message='')
    plots_list.append(plot1)


    for datasize in data_sizes:

        dataset3 =  dataset[dataset['data_size_pct'] == datasize]

        # dataset3 = dataset[
        #         (dataset['data_size_pct'] == datasize) &
        #         (dataset['lr'] != 0.1)]

        if dataset['reg_type'].unique().item() == 'l2':
            reg_type = 'l2'
            log_scale = True

            reg_vals = dataset['reg_val'].unique()[::2]
            dataset3 = dataset3[dataset3['reg_val'].isin(reg_vals)]

        else:
            reg_type = 'dropout'
            log_scale = False
            
        if not dataset3.empty:
            plot2 = box_plot_metric(dataset3, x_axis='lr', y_axis='val_acc', group_by='reg_val', log_x=True, auto_scale=False, message=str(datasize)+'%')
            plots_list.append(plot2)
        else:
            print(f"No data available for data_size_pct = {datasize} in {dataset_name}")


    for datasize in data_sizes:

        dataset3 =  dataset[dataset['data_size_pct'] == datasize]
        # dataset3 = dataset[
        #         (dataset['data_size_pct'] == datasize) &
        #         (dataset['lr'] != 0.1)]

        if dataset['reg_type'].unique().item() == 'l2':
            reg_type = 'l2'
            log_scale = True

            reg_vals = dataset['reg_val'].unique()[::2]
            dataset3 = dataset3[dataset3['reg_val'].isin(reg_vals)]

        else:
            reg_type = 'dropout'
            log_scale = False


        if not dataset3.empty:
            plot3 = plot_mean_metric(dataset3, x_axis='reg_val', y_axis='val_acc', group_by='lr', log_x=log_scale, auto_scale=False, show_error_bars=True, message=str(datasize)+'%')
            plots_list.append(plot3)
        else:
            print(f"No data available for data_size_pct = {datasize} in {dataset_name}")

    
    html_filename = os.path.join(data_folder, f'{dataset_name.upper()}_plots_updated_18112024.html')
    print(html_filename)

    save_plots_to_html(plots_list, html_filename)

C:\Users\Amir Hossein\Downloads\results for a long run\DNA_DROPOUT_plots_updated_18112024.html
Plots saved to C:\Users\Amir Hossein\Downloads\results for a long run\DNA_DROPOUT_plots_updated_18112024.html
Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


Shifted 'reg_val' by 1e-06 to make all values positive for log scale.


C:\Users\Amir Hossein\Downloads\results for a long run\DNA_L2_plots_updated_18112024.html
Plots saved to C:\Users\Amir Hossein\Downloads\results for a long run\DNA_L2_plots_updated_18112024.html


In [15]:
sbnlasdkhvb

NameError: name 'sbnlasdkhvb' is not defined

In [None]:
first_key = next(iter(datasets_dict))
print(first_key)  # Output: 'key1'
unique_values = list(datasets_dict[first_key]['data_size_pct'].unique())
unique_values

dna_l2


[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

In [None]:
def largest_non_significant_intervals(non_significant_pairs, n=1):
    # Create an undirected graph
    G = nx.Graph()
    
    # Add edges to the graph based on non-significant pairs
    for _, row in non_significant_pairs.iterrows():
        G.add_edge(row['A'], row['B'])
    
    # Find all connected components and sort them by size (descending order)
    sorted_components = sorted(nx.connected_components(G), key=len, reverse=True)
    
    # Get the n largest components
    largest_intervals = [sorted(component) for component in sorted_components[:n]]
    
    return largest_intervals

values = []
values_dict = {}

for i in unique_values:

    df = datasets_dict[first_key][datasets_dict[first_key]['data_size_pct'] == i]

    welch_anova = pg.welch_anova(dv='test_loss', between='reg_val', data=df)
    # print("Welch’s ANOVA results:")
    # print(welch_anova)

    # Check if there's an overall significant difference
    if welch_anova['p-unc'][0] < 0.05:
        # print(f"Significant differences detected by Welch’s ANOVA. P-value: {welch_anova['p-unc'][0]} < 0.05.")
        
        # Step 2: Perform Games-Howell Test for pairwise comparisons
        games_howell = pg.pairwise_gameshowell(dv='test_loss', between='reg_val', data=df)
        # print("\nGames-Howell pairwise comparison results:")
        # print(games_howell)
        
        # Step 3: Filter out non-significant pairs (p-value >= 0.05)
        non_significant_pairs = games_howell[games_howell['pval'] >= 0.05]
        # print("\nNon-significant pairs (test_loss is not statistically different):")
        # print(non_significant_pairs[['A', 'B', 'pval']])

        # Sample data based on the given example
        data = non_significant_pairs[['A', 'B', 'pval']]
        non_significant_pairs = pd.DataFrame(data)

        # Find the largest interval
        largest_interval = largest_non_significant_intervals(non_significant_pairs)
        print(f'Size: {i}%. Largest interval of values that are not statistically different: {largest_interval}')#. Max non-significant value: {max(largest_interval)}')
        values_dict[i] = max(largest_interval)
        # values.append(max(largest_interval))
        # print(f'Size: {i}%. {max(largest_interval)}')

    else:
        print(f"Size: {i}%. No overall significant difference found by Welch’s ANOVA. No need for pairwise comparisons.")

NameError: name 'nx' is not defined

In [None]:

fig = px.scatter(x=values_dict.keys(), y=values_dict.values(), labels={'x': 'X Values', 'y': 'Y Values'})
fig.update_layout(title='Scatter Plot of X vs. Y',
                  height=400,
                  width=600)

# Show plot
fig.show()

In [None]:
import pingouin as pg

# Assuming `df` is your DataFrame with 'reg_val' and 'loss' columns
# Step 1: Perform Welch’s ANOVA to check for overall differences

df = datasets_dict['dna_l2'][datasets_dict['dna_l2']['data_size_pct'] == 4]

welch_anova = pg.welch_anova(dv='test_loss', between='reg_val', data=df)
print("Welch’s ANOVA results:")
print(welch_anova)

# Check if there's an overall significant difference
if welch_anova['p-unc'][0] < 0.05:
    print(f"Significant differences detected by Welch’s ANOVA. P-value: {welch_anova['p-unc'][0]} < 0.05.")
    
    # Step 2: Perform Games-Howell Test for pairwise comparisons
    games_howell = pg.pairwise_gameshowell(dv='test_loss', between='reg_val', data=df)
    # print("\nGames-Howell pairwise comparison results:")
    # print(games_howell)
    
    # Step 3: Filter out non-significant pairs (p-value >= 0.05)
    non_significant_pairs = games_howell[games_howell['pval'] >= 0.05]
    print("\nNon-significant pairs (test_loss is not statistically different):")
    print(non_significant_pairs[['A', 'B', 'pval']])
else:
    print("No overall significant difference found by Welch’s ANOVA. No need for pairwise comparisons.")


def largest_non_significant_interval(non_significant_pairs):
    # Create an undirected graph
    G = nx.Graph()
    
    # Add edges to the graph based on non-significant pairs
    for _, row in non_significant_pairs.iterrows():
        G.add_edge(row['A'], row['B'])
    
    # Find the largest connected component
    largest_component = max(nx.connected_components(G), key=len)
    
    # Sort the values in the largest component to get the interval
    largest_interval = sorted(largest_component)
    
    return largest_interval

# Sample data based on the given example
data = non_significant_pairs[['A', 'B', 'pval']]
non_significant_pairs = pd.DataFrame(data)

# Find the largest interval
largest_interval = largest_non_significant_interval(non_significant_pairs)
print("Largest interval of values that are not statistically different:", largest_interval)



KeyError: 'dna_l2'

In [None]:
import networkx as nx

def largest_non_significant_interval(non_significant_pairs):
    # Create an undirected graph
    G = nx.Graph()
    
    # Add edges to the graph based on non-significant pairs
    for _, row in non_significant_pairs.iterrows():
        G.add_edge(row['A'], row['B'])
    
    # Find the largest connected component
    largest_component = max(nx.connected_components(G), key=len)
    
    # Sort the values in the largest component to get the interval
    largest_interval = sorted(largest_component)
    
    return largest_interval

# Sample data based on the given example
data = non_significant_pairs[['A', 'B', 'pval']]
non_significant_pairs = pd.DataFrame(data)

# Find the largest interval
largest_interval = largest_non_significant_interval(non_significant_pairs)
print("Largest interval of values that are not statistically different:", largest_interval)


Largest interval of values that are not statistically different: [1e-06, 1e-05, 0.0001, 0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1]
