In [1]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPTNeoXForCausalLM
from sklearn.decomposition import PCA
from matplotlib.cm import get_cmap
from sklearn.metrics.pairwise import cosine_similarity

# --- Experiment configuration ------------------------------------------------
# `experiment` is the common prefix of every folder constant derived below.
experiment = "fixed_1e5"
# NOTE(review): presumably the model checkpoint identifier; it is not used by
# the code visible in this cell.
MODEL_NAME = "EleutherAI/pythia-70m"

# Folders derived from the experiment prefix ('/'-separated, as before).
MODEL_FOLDER = "{}/models".format(experiment)
GRADIENT_FOLDER = "{}/gradients".format(experiment)
OUTPUT_FOLDER = "{}/gradient_analysis_output".format(experiment)

# The PCA sub-folder is created eagerly; this also creates OUTPUT_FOLDER.
PCA_FOLDER = os.path.join(OUTPUT_FOLDER, "pca")
os.makedirs(PCA_FOLDER, exist_ok=True)

# Function to calculate MAD
def calculate_mad(values1, values2):
    """Return the mean absolute difference between two arrays.

    The element-wise difference ``values1 - values2`` is computed first, so
    the two inputs must support subtraction with each other.

    Args:
        values1: First array of values.
        values2: Second array of values.

    Returns:
        The mean of ``|values1 - values2|`` over all elements.
    """
    absolute_difference = np.absolute(values1 - values2)
    return np.mean(absolute_difference)

# Load gradients
def load_gradients(gradient_dir, epoch):
    """Load every array stored in the gradient archive of one epoch.

    Args:
        gradient_dir: Directory that contains the gradient archives.
        epoch: Epoch number; the file read is
            ``<gradient_dir>/gradients_epoch_<epoch>.npz``.

    Returns:
        A list with one ``numpy`` array per entry of the archive, in the
        order the entries are stored in the file.

    Raises:
        FileNotFoundError: If the archive for ``epoch`` does not exist.
    """
    grad_path = os.path.join(gradient_dir, f"gradients_epoch_{epoch}.npz")
    # np.load returns a lazily-read NpzFile that keeps the file open; the
    # context manager closes it once every array has been materialised.
    with np.load(grad_path) as archive:
        return [archive[key] for key in archive.files]

# Function to perform gradient analysis
def analyze_gradients(gradient_folder, output_folder):
    """Analyse saved per-epoch gradients and write a CSV plus four plots.

    Every ``gradients_epoch_<N>.npz`` file in ``gradient_folder`` is loaded in
    ascending epoch order through ``load_gradients``.  For each epoch the
    function records:

    * the norm (``np.linalg.norm``) of every gradient array,
    * the mean absolute difference (``calculate_mad``) between every array
      and the array at the same position in the first epoch,
    * from the second epoch on, the cosine similarity to the previous epoch
      and to the first epoch, averaged over all arrays.

    Files written to ``output_folder`` (created if missing):
    ``gradient_mad_values.csv``, ``gradient_mad_plot.png``,
    ``gradient_norms_over_epochs.png``,
    ``cosine_similarity_epoch_to_epoch.png`` and
    ``cosine_similarity_to_first_epoch.png``.

    Args:
        gradient_folder: Directory containing the ``.npz`` gradient archives.
        output_folder: Directory that receives the CSV and PNG files.

    Raises:
        ValueError: If ``gradient_folder`` holds no ``gradients_epoch_<N>.npz``
            file.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Accept only names that load_gradients can rebuild exactly from the
    # parsed epoch number; unrelated .npz files are ignored instead of
    # crashing int() or pointing at a file that does not exist.
    prefix, suffix = "gradients_epoch_", ".npz"
    epoch_tags = (
        name[len(prefix):-len(suffix)]
        for name in os.listdir(gradient_folder)
        if name.startswith(prefix) and name.endswith(suffix)
    )
    epochs = sorted(
        int(tag) for tag in epoch_tags
        if tag.isdecimal() and str(int(tag)) == tag
    )
    if not epochs:
        raise ValueError(
            f"No gradients_epoch_<N>.npz files found in {gradient_folder!r}"
        )

    # plt.get_cmap replaces matplotlib.cm.get_cmap, which is deprecated and
    # no longer available in recent Matplotlib releases.
    cmap = plt.get_cmap('viridis')
    norm = plt.Normalize(min(epochs), max(epochs))

    mad_values = []
    norm_values = []
    cosine_sim_epoch_to_epoch = []
    cosine_sim_to_first_epoch = []

    first_epoch_gradients = None
    prev_gradients = None  # kept in memory so no epoch is read from disk twice

    for epoch in epochs:
        gradients = load_gradients(gradient_folder, epoch)
        if first_epoch_gradients is None:
            first_epoch_gradients = gradients

        norm_values.append(
            [np.linalg.norm(grad) for grad in gradients if grad is not None]
        )

        # Similarities are only defined from the second epoch onwards.
        if prev_gradients is not None:
            cosine_sim_epoch_to_epoch.append(
                _mean_layer_cosine(gradients, prev_gradients)
            )
            cosine_sim_to_first_epoch.append(
                _mean_layer_cosine(gradients, first_epoch_gradients)
            )

        for j, grad in enumerate(gradients):
            mad = calculate_mad(first_epoch_gradients[j], grad)
            mad_values.append({'Layer': f'Layer_{j}', 'MAD': mad, 'Epoch': epoch})

        prev_gradients = gradients

    # Save MAD values to CSV (explicit columns keep the frame well-formed
    # even when the archives contain no arrays).
    df_mad = pd.DataFrame(mad_values, columns=['Layer', 'MAD', 'Epoch'])
    csv_mad_path = os.path.join(output_folder, 'gradient_mad_values.csv')
    df_mad.to_csv(csv_mad_path, index=False)
    print(f'Gradient MAD values saved to {csv_mad_path}')

    # Plot MAD values: one scatter series per epoch, coloured by epoch.
    num_layers = len(first_epoch_gradients)
    fig, ax = plt.subplots(figsize=(18, 6))
    for epoch, epoch_df in df_mad.groupby('Epoch'):
        ax.scatter(range(len(epoch_df)), epoch_df['MAD'], color=cmap(norm(epoch)),
                   alpha=0.6, label=f'Epoch {epoch}')
    ax.set_xlabel('Layer Index')
    ax.set_ylabel('MAD')
    ax.set_title('Gradient MAD for Each Layer')
    ax.legend()
    ax.set_xticks(range(0, num_layers, 5))
    fig.tight_layout()
    plot_mad_path = os.path.join(output_folder, 'gradient_mad_plot.png')
    fig.savefig(plot_mad_path)
    plt.close(fig)
    print(f'Gradient MAD plot saved to {plot_mad_path}')

    # Plot Gradient Norms Over Epochs: zip(*...) turns the per-epoch lists
    # into one series per array position.
    fig, ax = plt.subplots(figsize=(10, 5))
    for i, layer_norms in enumerate(zip(*norm_values)):
        ax.plot(epochs, layer_norms, label=f'Layer {i+1}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Gradient Norm')
    ax.set_title('Gradient Norms Over Epochs')
    ax.legend()
    norms_path = os.path.join(output_folder, 'gradient_norms_over_epochs.png')
    fig.savefig(norms_path)
    plt.close(fig)
    print(f'Gradient norms plot saved to {norms_path}')

    # Plot Cosine Similarity Between Consecutive Epochs
    _save_cosine_plot(
        epochs[1:], cosine_sim_epoch_to_epoch,
        'Cosine Similarity of Gradients Between Consecutive Epochs',
        os.path.join(output_folder, 'cosine_similarity_epoch_to_epoch.png'),
        'Cosine similarity (epoch-to-epoch)',
    )

    # Plot Cosine Similarity to First Epoch
    _save_cosine_plot(
        epochs[1:], cosine_sim_to_first_epoch,
        'Cosine Similarity of Gradients to First Epoch',
        os.path.join(output_folder, 'cosine_similarity_to_first_epoch.png'),
        'Cosine similarity to first epoch',
    )


def _mean_layer_cosine(gradients, reference_gradients):
    """Return the cosine similarity of paired gradient arrays, averaged over pairs."""
    similarities = [
        cosine_similarity([grad.ravel()], [ref.ravel()])[0, 0]
        for grad, ref in zip(gradients, reference_gradients)
        if grad is not None and ref is not None
    ]
    return np.mean(similarities)


def _save_cosine_plot(plot_epochs, similarities, title, path, description):
    """Save a cosine-similarity-vs-epoch line plot to ``path`` and report it."""
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(plot_epochs, similarities, marker='o')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Cosine Similarity')
    ax.set_title(title)
    fig.savefig(path)
    plt.close(fig)
    print(f'{description} plot saved to {path}')

def main():
    """Run the gradient analysis on the configured gradient and output folders."""
    analyze_gradients(
        gradient_folder=GRADIENT_FOLDER,
        output_folder=OUTPUT_FOLDER,
    )


# Entry point: execute the analysis when this code runs as the main module.
if __name__ == "__main__":
    main()


  cmap = get_cmap('viridis')


Gradient MAD values saved to fixed_1e5/gradient_analysis_output/gradient_mad_values.csv
Gradient MAD plot saved to fixed_1e5/gradient_analysis_output/gradient_mad_plot.png
Gradient norms plot saved to fixed_1e5/gradient_analysis_output/gradient_norms_over_epochs.png
Cosine similarity (epoch-to-epoch) plot saved to fixed_1e5/gradient_analysis_output/cosine_similarity_epoch_to_epoch.png
Cosine similarity to first epoch plot saved to fixed_1e5/gradient_analysis_output/cosine_similarity_to_first_epoch.png
