In [None]:
!pip install -q scipy

In [6]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Configuration: input/output locations and plot settings used by this cell.
# Directory that contains the retrained-results CSV (see RESULTS_CSV_NAME).
EXPERIMENT_DIR = 'experiments/memory_rw_4'
# Directory the figures are saved into; main() creates it if it is missing.
ANALYSIS_DIR = os.path.join(EXPERIMENT_DIR, 'analysis')
# Root under which each folder's baseline 'new_inference_results.csv' is looked up.
ORIGINAL_INFERENCE_BASE_PATH = '/workspace/slice-monorepo/sub_validations/memory_signal_dnd'
# File name of the retrained inference results inside EXPERIMENT_DIR.
RESULTS_CSV_NAME = 'retrained_inference_results.csv'
# (y_min, y_max) passed to plt.ylim for every figure. The plotted value is the
# difference of two series that are each scaled to [-10, 30], so it can reach
# +/-40; anything outside these limits is simply not visible.
Y_AXIS_LIMITS = (-20, 30)  # y-range chosen for the normalized perplexity difference

def load_csv(file_path):
    """Read a CSV file into a DataFrame, announcing the path on stdout first.

    Args:
        file_path: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    print(f"Loading CSV file from: {file_path}")
    frame = pd.read_csv(file_path)
    return frame

def normalize_data(data, new_min, new_max):
    """Linearly rescale ``data`` so its minimum maps to ``new_min`` and its maximum to ``new_max``.

    Args:
        data: Array-like exposing ``.min()``/``.max()`` and element-wise
            arithmetic (e.g. a pandas Series or NumPy array).
        new_min: Value the smallest element is mapped to.
        new_max: Value the largest element is mapped to.

    Returns:
        An object of the same kind as ``data`` holding the rescaled values.
        When every element of ``data`` is identical there is no range to
        stretch, so every element is mapped to ``new_min`` instead of the
        NaN that a 0/0 division would produce.
    """
    data_min = data.min()
    data_max = data.max()
    span = data_max - data_min
    if span == 0:
        # Constant input: (data - data_min) is all zeros; multiplying by 0.0
        # keeps the result floating point like the regular branch.
        return (data - data_min) * 0.0 + new_min
    return new_min + ((data - data_min) * (new_max - new_min)) / span

def plot_difference_inference_perplexities(experiment_dir, analysis_dir):
    """Plot how retraining changed normalized inference perplexity, one figure per hyperparameter pair.

    Reads ``RESULTS_CSV_NAME`` from ``experiment_dir`` and draws one figure for
    every distinct (learning_rate, retrain_percentage) pair. For each folder
    and retrain window, the retrained rows are inner-joined on ``window`` with
    that folder's baseline ``new_inference_results.csv``. Both losses are
    exponentiated into perplexities, each perplexity series is scaled to
    [-10, 30] independently via ``normalize_data``, and ``retrained - original``
    is plotted together with its rolling mean (window = 5% of the joined rows,
    at least 1).

    Args:
        experiment_dir: Directory containing the retrained results CSV.
        analysis_dir: Directory the PNG figures are written to; it must
            already exist.

    Returns:
        None. One PNG per combination is saved and its path is printed.
    """
    results_csv_path = os.path.join(experiment_dir, RESULTS_CSV_NAME)
    results_df = load_csv(results_csv_path)

    # The baseline of a folder does not depend on the hyperparameters, so load
    # each one once instead of once per (learning_rate, retrain_percentage).
    folders = results_df['folder'].unique()
    original_by_folder = {
        folder: load_csv(os.path.join(ORIGINAL_INFERENCE_BASE_PATH, folder, 'new_inference_results.csv'))
        for folder in folders
    }

    unique_combinations = results_df[['learning_rate', 'retrain_percentage']].drop_duplicates()

    for _, combo in unique_combinations.iterrows():
        lr = combo['learning_rate']
        retrain_percentage = combo['retrain_percentage']
        fig, ax = plt.subplots(figsize=(14, 7))

        for folder in folders:
            original_inference_df = original_by_folder[folder]

            # Restrict to this folder as well: a folder's baseline must only be
            # compared against retrained results that came from the same folder.
            combo_results = results_df[
                (results_df['folder'] == folder)
                & (results_df['learning_rate'] == lr)
                & (results_df['retrain_percentage'] == retrain_percentage)
            ]
            for retrain_window in combo_results['retrain_window'].unique():
                retrained_loss_df = combo_results[combo_results['retrain_window'] == retrain_window].copy()

                # Inner join keeps only the windows present in both frames;
                # columns shared by both sides get the suffixes below.
                merged_df = pd.merge(original_inference_df, retrained_loss_df, on='window', suffixes=('_original', '_retrained'))

                # Perplexity is the exponential of the loss.
                merged_df['perplexity_original'] = np.exp(merged_df['inference_loss_original'])
                merged_df['perplexity_retrained'] = np.exp(merged_df['inference_loss_retrained'])

                # Each series is scaled on its own min/max, so the difference
                # compares relative positions, not absolute perplexities.
                merged_df['normalized_perplexity_original'] = normalize_data(merged_df['perplexity_original'], -10, 30)
                merged_df['normalized_perplexity_retrained'] = normalize_data(merged_df['perplexity_retrained'], -10, 30)

                merged_df['difference'] = merged_df['normalized_perplexity_retrained'] - merged_df['normalized_perplexity_original']

                # Smooth over 5% of the points (never fewer than one).
                rolling_window_size = max(1, int(0.05 * len(merged_df)))
                merged_df['rolling_difference'] = merged_df['difference'].rolling(window=rolling_window_size).mean()

                ax.plot(merged_df['window'], merged_df['difference'], label=f'Difference (Window: {retrain_window})', alpha=0.7)
                # The rolling average is drawn without a label so it stays out of the legend.
                ax.plot(merged_df['window'], merged_df['rolling_difference'], linestyle='--', color='red', alpha=0.7)

                # retrain_window is formatted as '<start>-<end>'; mark both bounds.
                retrain_start, retrain_end = map(int, retrain_window.split('-'))
                ax.axvline(x=retrain_start, color='red', linestyle='--')
                ax.axvline(x=retrain_end, color='red', linestyle='--')

        ax.set_xlabel('Window')
        ax.set_ylabel('Difference in Normalized Inference Perplexity')
        ax.set_ylim(*Y_AXIS_LIMITS)
        ax.set_title(f'Difference in Normalized Inference Perplexity (Retrained - Original) for Learning Rate: {lr}, Retrain Percentage: {retrain_percentage}')
        # Only build a legend when something labelled was actually drawn.
        handles, _labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend()
        save_path = os.path.join(analysis_dir, f'difference_inference_perplexity_lr_{lr}_rp_{retrain_percentage}.png')
        fig.savefig(save_path)
        # Close explicitly so figures do not accumulate across iterations.
        plt.close(fig)
        print(f"Plot saved to: {save_path}")

def main():
    """Ensure the analysis directory exists, then generate the perplexity-difference plots."""
    output_dir = ANALYSIS_DIR
    os.makedirs(output_dir, exist_ok=True)
    plot_difference_inference_perplexities(
        experiment_dir=EXPERIMENT_DIR,
        analysis_dir=output_dir,
    )


# Run the analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Loading CSV file from: experiments/memory_rw_4/retrained_inference_results.csv
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/difference_inference_perplexity_lr_0.0001_rp_5.0.png
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/difference_inference_perplexity_lr_1e-05_rp_5.0.png
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/difference_inference_perplexity_lr_1e-06_rp_5.0.png
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/difference_inference_perplexity_lr_0.0001_rp_10.0.png


In [4]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Configuration: input/output locations and plot settings used by this cell.
# Directory that contains the retrained-results CSV (see RESULTS_CSV_NAME).
EXPERIMENT_DIR = 'experiments/memory_rw_4'
# Directory the figures are saved into; main() creates it if it is missing.
ANALYSIS_DIR = os.path.join(EXPERIMENT_DIR, 'analysis')
# Root under which each folder's baseline 'new_inference_results.csv' is looked up.
ORIGINAL_INFERENCE_BASE_PATH = '/workspace/slice-monorepo/sub_validations/memory_signal_dnd'
# File name of the retrained inference results inside EXPERIMENT_DIR.
RESULTS_CSV_NAME = 'retrained_inference_results.csv'
# (y_min, y_max) passed to plt.ylim for every loss-difference figure.
Y_AXIS_LIMITS = (-1.5, 1.5)

def load_csv(file_path):
    """Log the path being read and return the CSV contents as a DataFrame.

    Args:
        file_path: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    print(f"Loading CSV file from: {file_path}")
    table = pd.read_csv(file_path)
    return table

def plot_difference_inference_losses(experiment_dir, analysis_dir):
    """Plot how retraining changed inference loss, one figure per hyperparameter pair.

    Reads ``RESULTS_CSV_NAME`` from ``experiment_dir`` and draws one figure for
    every distinct (learning_rate, retrain_percentage) pair. For each folder
    and retrain window, the retrained rows are inner-joined on ``window`` with
    that folder's baseline ``new_inference_results.csv``, and
    ``retrained - original`` inference loss is plotted together with its
    rolling mean (window = 5% of the joined rows, at least 1).

    Args:
        experiment_dir: Directory containing the retrained results CSV.
        analysis_dir: Directory the PNG figures are written to; it must
            already exist.

    Returns:
        None. One PNG per combination is saved and its path is printed.
    """
    results_csv_path = os.path.join(experiment_dir, RESULTS_CSV_NAME)
    results_df = load_csv(results_csv_path)

    # The baseline of a folder does not depend on the hyperparameters, so load
    # each one once instead of once per (learning_rate, retrain_percentage).
    folders = results_df['folder'].unique()
    original_by_folder = {
        folder: load_csv(os.path.join(ORIGINAL_INFERENCE_BASE_PATH, folder, 'new_inference_results.csv'))
        for folder in folders
    }

    unique_combinations = results_df[['learning_rate', 'retrain_percentage']].drop_duplicates()

    for _, combo in unique_combinations.iterrows():
        lr = combo['learning_rate']
        retrain_percentage = combo['retrain_percentage']
        fig, ax = plt.subplots(figsize=(14, 7))

        for folder in folders:
            original_inference_df = original_by_folder[folder]

            # Restrict to this folder as well: a folder's baseline must only be
            # compared against retrained results that came from the same folder.
            combo_results = results_df[
                (results_df['folder'] == folder)
                & (results_df['learning_rate'] == lr)
                & (results_df['retrain_percentage'] == retrain_percentage)
            ]
            for retrain_window in combo_results['retrain_window'].unique():
                retrained_loss_df = combo_results[combo_results['retrain_window'] == retrain_window].copy()

                # Inner join keeps only the windows present in both frames;
                # columns shared by both sides get the suffixes below.
                merged_df = pd.merge(original_inference_df, retrained_loss_df, on='window', suffixes=('_original', '_retrained'))

                merged_df['difference'] = merged_df['inference_loss_retrained'] - merged_df['inference_loss_original']

                # Smooth over 5% of the points (never fewer than one).
                rolling_window_size = max(1, int(0.05 * len(merged_df)))
                merged_df['rolling_difference'] = merged_df['difference'].rolling(window=rolling_window_size).mean()

                ax.plot(merged_df['window'], merged_df['difference'], label=f'Difference (Window: {retrain_window})', alpha=0.7)
                # The rolling average is drawn without a label so it stays out of the legend.
                ax.plot(merged_df['window'], merged_df['rolling_difference'], linestyle='--', color='red', alpha=0.7)

                # retrain_window is formatted as '<start>-<end>'; mark both bounds.
                retrain_start, retrain_end = map(int, retrain_window.split('-'))
                ax.axvline(x=retrain_start, color='red', linestyle='--')
                ax.axvline(x=retrain_end, color='red', linestyle='--')

        ax.set_xlabel('Window')
        ax.set_ylabel('Difference in Inference Loss')
        ax.set_ylim(*Y_AXIS_LIMITS)
        ax.set_title(f'Difference in Inference Loss (Retrained - Original) for Learning Rate: {lr}, Retrain Percentage: {retrain_percentage}')
        # Only build a legend when something labelled was actually drawn.
        handles, _labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend()
        save_path = os.path.join(analysis_dir, f'difference_inference_loss_lr_{lr}_rp_{retrain_percentage}.png')
        fig.savefig(save_path)
        # Close explicitly so figures do not accumulate across iterations.
        plt.close(fig)
        print(f"Plot saved to: {save_path}")

def main():
    """Ensure the analysis directory exists, then generate the loss-difference plots."""
    output_dir = ANALYSIS_DIR
    os.makedirs(output_dir, exist_ok=True)
    plot_difference_inference_losses(
        experiment_dir=EXPERIMENT_DIR,
        analysis_dir=output_dir,
    )


# Run the analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Loading CSV file from: experiments/memory_rw_4/retrained_inference_results.csv
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/normalized_difference_inference_perplexity_lr_0.0001_rp_5.0.png
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/normalized_difference_inference_perplexity_lr_1e-05_rp_5.0.png
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/normalized_difference_inference_perplexity_lr_1e-06_rp_5.0.png
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/normalized_difference_inference_perplexity_lr_0.0001_rp_10.0.png


In [36]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Configuration: input/output locations and plot settings used by this cell.
# Directory that contains the retrained-results CSV (see RESULTS_CSV_NAME).
EXPERIMENT_DIR = 'experiments/memory_rw_4'
# Directory the figures are saved into; main() creates it if it is missing.
ANALYSIS_DIR = os.path.join(EXPERIMENT_DIR, 'analysis')
# Root under which each folder's baseline 'new_inference_results.csv' is looked up.
ORIGINAL_INFERENCE_BASE_PATH = '/workspace/slice-monorepo/sub_validations/memory_signal_dnd'
# File name of the retrained inference results inside EXPERIMENT_DIR.
RESULTS_CSV_NAME = 'retrained_inference_results.csv'
# (y_min, y_max) passed to plt.ylim for every loss-difference figure.
Y_AXIS_LIMITS = (-1.5, 1.0)

def load_csv(file_path):
    """Print which CSV is being loaded and hand back its contents as a DataFrame.

    Args:
        file_path: Location of the CSV file, forwarded to ``pd.read_csv``.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    print(f"Loading CSV file from: {file_path}")
    contents = pd.read_csv(file_path)
    return contents

def plot_difference_inference_losses(experiment_dir, analysis_dir):
    """Plot how retraining changed inference loss, one figure per learning rate.

    Reads ``RESULTS_CSV_NAME`` from ``experiment_dir`` and draws one figure for
    every distinct ``learning_rate``. For each folder and retrain window, the
    retrained rows are inner-joined on ``window`` with that folder's baseline
    ``new_inference_results.csv``, and ``retrained - original`` inference loss
    is plotted together with its rolling mean (window = 5% of the joined rows,
    at least 1).

    Args:
        experiment_dir: Directory containing the retrained results CSV.
        analysis_dir: Directory the PNG figures are written to; it must
            already exist.

    Returns:
        None. One PNG per learning rate is saved and its path is printed.
    """
    results_csv_path = os.path.join(experiment_dir, RESULTS_CSV_NAME)
    results_df = load_csv(results_csv_path)

    # The baseline of a folder does not depend on the learning rate, so load
    # each one once instead of once per learning rate.
    folders = results_df['folder'].unique()
    original_by_folder = {
        folder: load_csv(os.path.join(ORIGINAL_INFERENCE_BASE_PATH, folder, 'new_inference_results.csv'))
        for folder in folders
    }

    unique_learning_rates = results_df['learning_rate'].unique()

    for lr in unique_learning_rates:
        fig, ax = plt.subplots(figsize=(14, 7))

        for folder in folders:
            original_inference_df = original_by_folder[folder]

            # Restrict to this folder as well: a folder's baseline must only be
            # compared against retrained results that came from the same folder.
            lr_results = results_df[
                (results_df['folder'] == folder)
                & (results_df['learning_rate'] == lr)
            ]
            # NOTE(review): rows are grouped by retrain_window only, so runs that
            # share a learning rate and window but differ in another
            # hyperparameter end up in the same plotted series - confirm intended.
            for retrain_window in lr_results['retrain_window'].unique():
                retrained_loss_df = lr_results[lr_results['retrain_window'] == retrain_window].copy()

                # Inner join keeps only the windows present in both frames;
                # columns shared by both sides get the suffixes below.
                merged_df = pd.merge(original_inference_df, retrained_loss_df, on='window', suffixes=('_original', '_retrained'))

                merged_df['difference'] = merged_df['inference_loss_retrained'] - merged_df['inference_loss_original']

                # Smooth over 5% of the points (never fewer than one).
                rolling_window_size = max(1, int(0.05 * len(merged_df)))
                merged_df['rolling_difference'] = merged_df['difference'].rolling(window=rolling_window_size).mean()

                ax.plot(merged_df['window'], merged_df['difference'], label=f'Difference (Window: {retrain_window})', alpha=0.7)
                # The rolling average is drawn without a label so it stays out of the legend.
                ax.plot(merged_df['window'], merged_df['rolling_difference'], linestyle='--', color='red', alpha=0.7)

                # retrain_window is formatted as '<start>-<end>'; mark both bounds.
                retrain_start, retrain_end = map(int, retrain_window.split('-'))
                ax.axvline(x=retrain_start, color='red', linestyle='--')
                ax.axvline(x=retrain_end, color='red', linestyle='--')

        ax.set_xlabel('Window')
        ax.set_ylabel('Difference in Inference Loss')
        ax.set_ylim(*Y_AXIS_LIMITS)
        ax.set_title(f'Difference in Inference Loss (Retrained - Original) for Learning Rate: {lr}')
        # Only build a legend when something labelled was actually drawn.
        handles, _labels = ax.get_legend_handles_labels()
        if handles:
            ax.legend()
        save_path = os.path.join(analysis_dir, f'difference_inference_loss_lr_{lr}.png')
        fig.savefig(save_path)
        # Close explicitly so figures do not accumulate across iterations.
        plt.close(fig)
        print(f"Plot saved to: {save_path}")

def main():
    """Ensure the analysis directory exists, then generate the per-learning-rate loss-difference plots."""
    output_dir = ANALYSIS_DIR
    os.makedirs(output_dir, exist_ok=True)
    plot_difference_inference_losses(
        experiment_dir=EXPERIMENT_DIR,
        analysis_dir=output_dir,
    )


# Run the analysis when this cell/script is executed directly.
if __name__ == "__main__":
    main()


Loading CSV file from: experiments/memory_rw_4/retrained_inference_results.csv
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/difference_inference_loss_lr_0.0001.png
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/difference_inference_loss_lr_1e-05.png
Loading CSV file from: /workspace/slice-monorepo/sub_validations/memory_signal_dnd/rw_4/new_inference_results.csv
Plot saved to: experiments/memory_rw_4/analysis/difference_inference_loss_lr_1e-06.png
