In [2]:
!pip install -q matplotlib pandas

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Read the training metrics table from disk.
csv_file_path = '/workspace/slice-monorepo/sub_validations/cl_scaling/pile_dnd/experiments/combined_training_7/training_results.csv'
data = pd.read_csv(csv_file_path)

# Distinct values of the 'epoch' column, in order of first appearance.
epochs = data['epoch'].unique()

# Plots are written to an 'analysis' folder that sits next to the CSV;
# exist_ok=True makes re-running this cell harmless when it already exists.
results_dir = os.path.dirname(csv_file_path)
analysis_dir = os.path.join(results_dir, 'analysis')
os.makedirs(analysis_dir, exist_ok=True)

# Trailing moving-average helper used for the smoothed loss curves.
def running_average(values, window_size):
    """Return the rolling mean of ``values`` over ``window_size`` observations.

    Args:
        values: Object exposing pandas' ``rolling`` API (e.g. a ``pd.Series``).
        window_size: Window passed straight through to ``rolling(window=...)``.

    Returns:
        The result of ``values.rolling(window=window_size).mean()``. With an
        integer window and pandas' default ``min_periods``, positions that do
        not yet have a full window of observations are NaN.
    """
    rolling_window = values.rolling(window=window_size)
    return rolling_window.mean()

# Number of consecutive rows averaged for the combined "running average" plot.
RUNNING_AVERAGE_WINDOW = 50

# One entry per loss column found in the CSV:
#   (column name, display label, colour on its own plot, colour on the combined plot)
# A colour of None means "let matplotlib pick its default colour".
LOSS_SERIES = [
    ('train_loss', 'Train Loss', None, 'blue'),
    ('rw_inference_loss', 'RW Inference Loss', 'green', 'green'),
    ('pile_inference_loss', 'Pile Inference Loss', 'red', 'red'),
]


def _save_line_plot(steps, lines, title, ylabel, plot_path, with_legend=False):
    """Draw one or more curves against ``steps`` and save the figure as an image.

    Args:
        steps: X values shared by every curve.
        lines: Iterable of ``(values, label, color)`` tuples; ``color`` may be
            None to use matplotlib's default colour.
        title: Figure title.
        ylabel: Y-axis label (the x-axis is always labelled 'Step').
        plot_path: Destination file path handed to ``Figure.savefig``.
        with_legend: When True, draw a legend built from the curve labels.

    The figure is always closed, even if plotting or saving raises, so a
    failure on one plot does not leave an open figure behind in the kernel.
    """
    fig, ax = plt.subplots(figsize=(12, 6))
    try:
        for values, label, color in lines:
            # Only pass `color` when one was requested, so the default colour
            # is chosen exactly as it would be for a bare plot() call.
            style = {} if color is None else {'color': color}
            ax.plot(steps, values, label=label, **style)
        ax.set_title(title)
        ax.set_xlabel('Step')
        ax.set_ylabel(ylabel)
        if with_legend:
            ax.legend()
        ax.grid(True)
        fig.savefig(plot_path)
    finally:
        plt.close(fig)


# Generate and save the plots for each epoch: one figure per loss column,
# plus one combined figure of the running averages of all three losses.
for epoch in epochs:
    epoch_data = data[data['epoch'] == epoch]
    steps = epoch_data['step']

    # Individual raw-loss plots, saved as epoch_<epoch>_<column>.png.
    for column, label, single_color, _ in LOSS_SERIES:
        _save_line_plot(
            steps,
            [(epoch_data[column], label, single_color)],
            title=f'Epoch {epoch} - {label} over Steps',
            ylabel=label,
            plot_path=os.path.join(analysis_dir, f'epoch_{epoch}_{column}.png'),
        )

    # Combined plot: running average of every loss on shared axes, with a legend.
    smoothed_lines = [
        (
            running_average(epoch_data[column], RUNNING_AVERAGE_WINDOW),
            f'{label} (Running Average)',
            combined_color,
        )
        for column, label, _, combined_color in LOSS_SERIES
    ]
    _save_line_plot(
        steps,
        smoothed_lines,
        title=f'Epoch {epoch} - Running Average of Losses over Steps',
        ylabel='Running Average Loss',
        plot_path=os.path.join(analysis_dir, f'epoch_{epoch}_combined_running_average.png'),
        with_legend=True,
    )
