In [2]:
!pip install -q scikit-learn matplotlib pandas transformers

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Define constants
# Name of the experiment directory; every path below is built relative to it.
experiment = "fixed_1e5"
# CSV inputs read by get_y_axis_range / analyze_and_plot.
EVAL_CSV_FILE_PATH = f"{experiment}/lr_dependency_results_scaled_eval.csv"
MAIN_CSV_FILE_PATH = f"{experiment}/lr_dependency_results_scaled.csv"
# Destination folder for the generated plots; created up front if missing.
OUTPUT_FOLDER = f"{experiment}/analysis_output"
os.makedirs(OUTPUT_FOLDER, exist_ok=True)

# Function to get the shared y-axis range (min and max inference loss) across CSV files
def get_y_axis_range(csv_file_paths):
    """Return the smallest and largest 'Inference Loss' found across several CSV files.

    Only rows whose 'LR' value parses as a number contribute, so rows such as
    LR == 'fixed' are left out. Missing (NaN) losses are ignored so that they
    cannot leak into the returned range.

    Args:
        csv_file_paths: Iterable of paths to CSV files that have 'LR' and
            'Inference Loss' columns.

    Returns:
        Tuple ``(y_min, y_max)`` of floats.

    Raises:
        ValueError: If no usable loss value is found in any of the files.
    """
    all_losses = []

    for csv_file_path in csv_file_paths:
        df = pd.read_csv(csv_file_path)
        # Boolean mask instead of assigning into a filtered slice (avoids SettingWithCopyWarning).
        numeric_lr_mask = pd.to_numeric(df['LR'], errors='coerce').notnull()
        losses = pd.to_numeric(df.loc[numeric_lr_mask, 'Inference Loss'], errors='coerce').dropna()
        all_losses.extend(losses.tolist())

    if not all_losses:
        raise ValueError("No numeric 'Inference Loss' values found in the given CSV files")

    return min(all_losses), max(all_losses)

# Function to analyze CSV data and plot learning rate vs. loss
def analyze_and_plot(csv_file_path, model_name, output_folder, show_plot=True, add_fixed_lr=True, x_range=(1e-7, 1e-2), y_values=None):
    """Save one 'Inference Loss vs Learning Rate' figure per epoch found in a CSV file.

    Rows whose 'LR' is the literal string 'fixed' are treated separately: when
    ``add_fixed_lr`` is true, the first such row of an epoch is inserted at
    LR = 1e-5 (replacing any numeric row at exactly 1e-5) and highlighted with a
    purple marker.

    Args:
        csv_file_path: CSV with 'LR', 'Epoch', 'Inference Loss' and 'Train Loss' columns.
        model_name: Label used in the plot title and in the output file name.
        output_folder: Directory the PNG files are written to.
        show_plot: Accepted for backward compatibility; it is not used, figures
            are only saved and closed.
        add_fixed_lr: Whether to merge and highlight the 'fixed' LR row.
        x_range: ``(min, max)`` limits for the x axis.
        y_values: Optional ascending sequence of y tick positions; its first and
            last entries are used as y limits. When ``None`` or empty, the y axis
            is left to matplotlib's automatic scaling.
    """
    df = pd.read_csv(csv_file_path)

    # Separate fixed LR rows
    fixed_lr_df = df[df['LR'] == 'fixed']

    # Ensure Learning Rate is a float, ignoring non-numeric rows.
    # .copy() makes the column assignment below act on an independent frame.
    df = df[pd.to_numeric(df['LR'], errors='coerce').notnull()].copy()
    df['LR'] = df['LR'].astype(float)

    for epoch in df['Epoch'].unique():
        epoch_df = df[df['Epoch'] == epoch]

        # Reset per epoch so a fixed-LR row from a previous epoch can never be reused
        # and the highlight step below never touches an undefined name.
        epoch_fixed_lr = None
        if add_fixed_lr and not fixed_lr_df.empty:
            epoch_fixed_lr_df = fixed_lr_df[fixed_lr_df['Epoch'] == epoch]
            if not epoch_fixed_lr_df.empty:
                epoch_fixed_lr = epoch_fixed_lr_df.iloc[0]
                epoch_fixed_lr_point = pd.DataFrame([{
                    'LR': 1e-5,
                    'Inference Loss': epoch_fixed_lr['Inference Loss'],
                    'Train Loss': epoch_fixed_lr['Train Loss'],
                    'Epoch': epoch,
                }])
                epoch_df = epoch_df[epoch_df['LR'] != 1e-5]  # Remove any existing 1e-5 points
                epoch_df = pd.concat([epoch_df, epoch_fixed_lr_point], ignore_index=True)

        epoch_df = epoch_df.sort_values(by='LR')

        best_lr = epoch_df.loc[epoch_df['Inference Loss'].idxmin(), 'LR']
        best_loss = epoch_df['Inference Loss'].min()
        train_loss = epoch_df['Train Loss'].iloc[0]  # Assuming train loss is constant for the entire epoch

        fig, ax = plt.subplots(figsize=(12, 6))
        try:
            ax.plot(epoch_df['LR'], epoch_df['Inference Loss'], 'bo-', label='Inference Loss')

            ax.set_xscale('log')
            ax.set_yscale('log')
            ax.set_xlabel('Learning Rate (log scale)')
            ax.set_ylabel('Loss (log scale)')
            ax.set_title(f'{model_name} - Epoch {epoch}: Inference Loss vs Learning Rate')
            ax.axhline(y=train_loss, color='r', linestyle='-', label=f'Train Loss = {train_loss:.2f}')
            ax.scatter([best_lr], [best_loss], color='g', s=100, label=f'Best LR: {best_lr:.2e}\nLoss: {best_loss:.2f}')

            # Add the fixed LR point as a purple dot
            if epoch_fixed_lr is not None:
                ax.scatter([1e-5], [epoch_fixed_lr['Inference Loss']], color='purple', s=100, label=f'Fixed LR: 1e-5\nLoss: {epoch_fixed_lr["Inference Loss"]:.2f}')

            ax.legend()

            # Set the same x and y axis range for all plots
            ax.set_xlim(x_range)
            if y_values is not None and len(y_values) > 0:
                ax.set_ylim(y_values[0], y_values[-1])
                ax.set_yticks(y_values)
                ax.set_yticklabels([f'{y:.2f}' for y in y_values])

            # Lay out after limits and tick labels are final so nothing gets clipped.
            fig.tight_layout()

            plot_path = os.path.join(output_folder, f'{model_name.lower().replace(" ", "_")}_epoch_{epoch}_loss_vs_lr.png')
            fig.savefig(plot_path)
            print(f'{model_name} - Epoch {epoch} plot saved to {plot_path}')
        finally:
            # Always release the figure, even if plotting or saving fails.
            plt.close(fig)

def main(plot_eval=True, add_fixed_lr=True):
    """Plot loss-vs-LR curves for the main CSV and, optionally, the evaluation CSV.

    A common set of 15 log-spaced y ticks is derived from the loss range of all
    files being plotted, so every figure shares the same y axis.

    Args:
        plot_eval: Also include the evaluation CSV (plotted before the main one).
        add_fixed_lr: Forwarded to analyze_and_plot.
    """
    csv_file_paths = [MAIN_CSV_FILE_PATH, EVAL_CSV_FILE_PATH] if plot_eval else [MAIN_CSV_FILE_PATH]

    loss_min, loss_max = get_y_axis_range(csv_file_paths)
    y_values = np.logspace(np.log10(loss_min), np.log10(loss_max), num=15)

    # (csv path, display name) pairs in the order they are plotted.
    plot_targets = []
    if plot_eval:
        plot_targets.append((EVAL_CSV_FILE_PATH, "Evaluation Model"))
    plot_targets.append((MAIN_CSV_FILE_PATH, "Main Model"))

    for csv_path, display_name in plot_targets:
        analyze_and_plot(csv_path, display_name, OUTPUT_FOLDER, add_fixed_lr=add_fixed_lr, y_values=y_values)

# Entry point: plot only the main CSV, without the fixed-LR overlay.
if __name__ == "__main__":
    main(plot_eval=False, add_fixed_lr=False)


Main Model - Epoch 1 plot saved to fixed_1e5/analysis_output/main_model_epoch_1_loss_vs_lr.png
Main Model - Epoch 2 plot saved to fixed_1e5/analysis_output/main_model_epoch_2_loss_vs_lr.png
Main Model - Epoch 3 plot saved to fixed_1e5/analysis_output/main_model_epoch_3_loss_vs_lr.png
Main Model - Epoch 4 plot saved to fixed_1e5/analysis_output/main_model_epoch_4_loss_vs_lr.png
Main Model - Epoch 5 plot saved to fixed_1e5/analysis_output/main_model_epoch_5_loss_vs_lr.png
Main Model - Epoch 6 plot saved to fixed_1e5/analysis_output/main_model_epoch_6_loss_vs_lr.png
Main Model - Epoch 7 plot saved to fixed_1e5/analysis_output/main_model_epoch_7_loss_vs_lr.png
Main Model - Epoch 8 plot saved to fixed_1e5/analysis_output/main_model_epoch_8_loss_vs_lr.png
Main Model - Epoch 9 plot saved to fixed_1e5/analysis_output/main_model_epoch_9_loss_vs_lr.png
Main Model - Epoch 10 plot saved to fixed_1e5/analysis_output/main_model_epoch_10_loss_vs_lr.png
Main Model - Epoch 11 plot saved to fixed_1e5/an

In [None]:
import os
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import GPTNeoXForCausalLM
from sklearn.decomposition import PCA
from matplotlib.cm import get_cmap

# Define constants
# Name of the experiment directory; model and output paths are built relative to it.
experiment = "test"
# Hugging Face model identifier passed to GPTNeoXForCausalLM.from_pretrained.
MODEL_NAME = "EleutherAI/pythia-70m"
#MODEL_NAME = "EleutherAI/pythia-410m"
# Folder scanned for fine-tuned checkpoint files (*.pt).
MODEL_FOLDER = f"{experiment}/models"
# Output locations; creating PCA_FOLDER also creates OUTPUT_FOLDER because it is nested inside it.
OUTPUT_FOLDER = f"{experiment}/mad_analysis_output"
PCA_FOLDER = os.path.join(OUTPUT_FOLDER, 'pca')
os.makedirs(PCA_FOLDER, exist_ok=True)

# Flag for filtering fixed learning rate models
# (passed to analyze_models, which then keeps only files starting with "fixed_lr").
FIXED_LR = False

# Function to calculate MAD
def calculate_mad(weights1, weights2):
    """Return the mean of the element-wise absolute difference between two arrays."""
    absolute_difference = np.abs(weights1 - weights2)
    return np.mean(absolute_difference)

# Function to perform PCA and save the plot
def plot_pca(weights1, weights2, layer_name, epoch):
    """Project two weight arrays onto two shared PCA components and save a scatter plot.

    Both arrays are stacked along axis 0 and a single 2-component PCA is fitted on
    the stack, so the two groups are shown in the same coordinate system. The plot
    is written to PCA_FOLDER; any failure is reported and the layer is skipped.

    Args:
        weights1: Array labelled 'Pre-trained' in the plot.
        weights2: Array labelled 'Fine-tuned' in the plot.
        layer_name: Used in the title and (dots replaced by underscores) in the file name.
        epoch: Used in the title and in the file name.
    """
    fig = None
    try:
        if weights1.ndim < 2:
            # NOTE(review): a single column has fewer features than n_components=2, so
            # PCA is expected to reject it and the layer ends up skipped below - confirm.
            weights1 = weights1.reshape(-1, 1)
            weights2 = weights2.reshape(-1, 1)
        pca = PCA(n_components=2)
        weights = np.concatenate([weights1, weights2], axis=0)
        pca_result = pca.fit_transform(weights)
        n_pretrained = len(weights1)

        fig = plt.figure(figsize=(10, 5))
        plt.scatter(pca_result[:n_pretrained, 0], pca_result[:n_pretrained, 1], alpha=0.5, label='Pre-trained')
        plt.scatter(pca_result[n_pretrained:, 0], pca_result[n_pretrained:, 1], alpha=0.5, label='Fine-tuned')
        plt.title(f'PCA of Weights: {layer_name} - Epoch {epoch}')
        plt.xlabel('PCA Component 1')
        plt.ylabel('PCA Component 2')
        # Without this call the labels given to scatter() are never displayed.
        plt.legend()
        plot_file = os.path.join(PCA_FOLDER, f'{layer_name.replace(".", "_")}_epoch_{epoch}_pca.png')
        plt.savefig(plot_file)
    except Exception as e:
        # Deliberate best-effort: one problematic layer must not abort the whole analysis.
        print(f"Skipping PCA for {layer_name} due to error: {e}")
    finally:
        # Close the figure on every path so failures do not accumulate open figures.
        if fig is not None:
            plt.close(fig)

# Load models
def load_models(saved_model_path):
    """Load the pre-trained model and a copy carrying fine-tuned weights.

    Both models are instantiated from MODEL_NAME; the second one then has the
    state dict stored at ``saved_model_path`` loaded into it.

    Args:
        saved_model_path: Path to a checkpoint file readable by ``torch.load``
            that contains a state dict for the model.

    Returns:
        Tuple ``(model_pretrained, model_fine_tuned)``, both on CUDA when it is
        available and on CPU otherwise.
    """
    # Fall back to CPU instead of crashing on machines without a GPU.
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model_pretrained = GPTNeoXForCausalLM.from_pretrained(MODEL_NAME).to(device)
    model_fine_tuned = GPTNeoXForCausalLM.from_pretrained(MODEL_NAME).to(device)
    # map_location makes the checkpoint loadable regardless of the device it was saved from.
    # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
    state_dict = torch.load(saved_model_path, map_location=device)
    model_fine_tuned.load_state_dict(state_dict)
    return model_pretrained, model_fine_tuned

# Function to analyze models
def analyze_models(model_folder, fixed_lr=False):
    """Compute per-layer MAD between pre-trained and fine-tuned weights and plot it.

    If ``mad_values.csv`` already exists in OUTPUT_FOLDER it is reused as-is (no
    model is loaded, and ``fixed_lr`` has no effect on its contents). Otherwise
    every ``*.pt`` checkpoint in ``model_folder`` is compared against the
    pre-trained model for all parameters whose name contains "weight", and the
    results are written to that CSV. In both cases a scatter plot of MAD per
    layer, coloured by epoch, is saved to ``mad_plot.png`` in OUTPUT_FOLDER.

    Args:
        model_folder: Directory containing checkpoint files whose name ends in
            ``_<epoch>.pt``.
        fixed_lr: When true, only files starting with "fixed_lr" are analysed.

    Raises:
        ValueError: If there are no matching checkpoints, or no MAD values to plot.
    """
    csv_mad_path = os.path.join(OUTPUT_FOLDER, 'mad_values.csv')
    # pyplot.get_cmap is used because matplotlib.cm.get_cmap is deprecated.
    cmap = plt.get_cmap('viridis')

    if os.path.exists(csv_mad_path):
        df_mad = pd.read_csv(csv_mad_path)
        print(f'Loaded existing MAD values from {csv_mad_path}')
    else:
        if fixed_lr:
            model_files = [f for f in os.listdir(model_folder) if f.startswith("fixed_lr") and f.endswith(".pt")]
        else:
            model_files = [f for f in os.listdir(model_folder) if f.endswith(".pt")]
        if not model_files:
            raise ValueError(f'No matching .pt model files found in {model_folder}')
        # os.listdir order is arbitrary; sort so the CSV rows are reproducible.
        model_files.sort()

        # Fall back to CPU instead of crashing on machines without a GPU.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        model_pretrained = GPTNeoXForCausalLM.from_pretrained(MODEL_NAME).to(device)
        layer_names = [name for name, _ in model_pretrained.named_parameters() if "weight" in name]

        # Build the reference weights once instead of rebuilding the state dict per layer.
        pretrained_state = model_pretrained.state_dict()
        pretrained_weights = {
            layer_name: pretrained_state[layer_name].cpu().numpy().flatten()
            for layer_name in layer_names
        }

        mad_values = []
        for file_name in model_files:
            model_path = os.path.join(model_folder, file_name)
            epoch = int(file_name.split('_')[-1].split('.')[0])
            model_fine_tuned = GPTNeoXForCausalLM.from_pretrained(MODEL_NAME).to(device)
            # map_location makes the checkpoint loadable regardless of the device it was saved from.
            # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
            model_fine_tuned.load_state_dict(torch.load(model_path, map_location=device))
            fine_tuned_state = model_fine_tuned.state_dict()

            # Calculate MAD for all layers
            for layer_name in layer_names:
                weights_fine_tuned = fine_tuned_state[layer_name].cpu().numpy()
                mad = calculate_mad(pretrained_weights[layer_name], weights_fine_tuned.flatten())
                mad_values.append({'Layer': layer_name, 'MAD': mad, 'Model': file_name, 'Epoch': epoch})

        # Save MAD values to CSV
        df_mad = pd.DataFrame(mad_values)
        df_mad.to_csv(csv_mad_path, index=False)
        print(f'MAD values saved to {csv_mad_path}')

    if df_mad.empty:
        raise ValueError('No MAD values available to plot')

    # Plot and save the MAD scatter plot
    unique_epochs = sorted(df_mad['Epoch'].unique())
    norm = plt.Normalize(unique_epochs[0], unique_epochs[-1])
    # Taken from the data itself so the cached-CSV path does not need to load a model.
    n_layers = df_mad['Layer'].nunique()
    # Only a few evenly spread epochs get a legend entry to keep the legend readable.
    selected_epochs = [
        unique_epochs[0],
        unique_epochs[len(unique_epochs) // 3],
        unique_epochs[2 * len(unique_epochs) // 3],
        unique_epochs[-1],
    ]

    fig = plt.figure(figsize=(18, 6))
    try:
        handles = []
        labels = []
        for epoch in unique_epochs:
            epoch_df = df_mad[df_mad['Epoch'] == epoch]
            color = cmap(norm(epoch))
            # x is the row position within the epoch; with several models per epoch
            # it runs past the number of layers.
            scatter = plt.scatter(range(len(epoch_df)), epoch_df['MAD'], color=color, alpha=0.6)
            if epoch in selected_epochs:
                handles.append(scatter)
                labels.append(f'Epoch {epoch}')
        plt.xlabel('Layer Index')
        plt.ylabel('MAD')
        plt.title('MAD for Each Layer')
        plt.legend(handles, labels)
        plt.xticks(range(0, n_layers, 20))  # Adjusting x-ticks to show every 20th layer index
        plt.tight_layout()
        plot_mad_path = os.path.join(OUTPUT_FOLDER, 'mad_plot.png')
        plt.savefig(plot_mad_path)
    finally:
        plt.close(fig)
    print(f'MAD plot saved to {plot_mad_path}')

def main():
    """Run the MAD analysis on MODEL_FOLDER using the module-level FIXED_LR flag."""
    analyze_models(model_folder=MODEL_FOLDER, fixed_lr=FIXED_LR)

# Entry point: run the MAD analysis with the constants defined above.
if __name__ == "__main__":
    main()


In [None]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt

# Define the root directory containing evaluation results
# All paths are built relative to the experiment directory named here.
experiment = "fixed_1e5"
evaluation_results_dir = f'{experiment}/evaluation_results'
# Destination for the aggregated CSV and the per-metric plots.
analysis_output_dir = f'{experiment}/analysis_llmeval'

# Ensure the analysis output directory exists
os.makedirs(analysis_output_dir, exist_ok=True)

# Function to extract evaluation metrics from JSON files
def extract_metrics(json_file):
    """Flatten the 'results' section of a JSON file into a single dict.

    Each entry ``results[task][metric]`` becomes the key ``"<task>_<metric>"``.
    A file without a 'results' key yields an empty dict.

    Args:
        json_file: Path to the JSON file to read.

    Returns:
        Dict mapping ``"<task>_<metric>"`` to the stored value.
    """
    with open(json_file, 'r') as handle:
        payload = json.load(handle)
    return {
        f"{task}_{metric}": value
        for task, task_results in payload.get('results', {}).items()
        for metric, value in task_results.items()
    }

# Function to gather all metrics from the evaluation results directory
def gather_metrics(evaluation_results_dir):
    """Collect metrics from every JSON file located under an 'epoch_<n>' directory.

    The tree below ``evaluation_results_dir`` is walked; a directory is used only
    if its path contains 'epoch_' and the text following the last 'epoch_' (up to
    the next path separator or dot) is an integer. Each JSON file in such a
    directory contributes one row holding its flattened metrics plus 'epoch'.

    Args:
        evaluation_results_dir: Root directory to search.

    Returns:
        DataFrame with one row per JSON file and an 'epoch' column. Rows whose
        metric columns are all missing are dropped. When nothing is found, an
        empty DataFrame that still has the 'epoch' column is returned.
    """
    all_metrics = []
    for root, _, files in os.walk(evaluation_results_dir):
        # Extract epoch number if the directory name contains 'epoch_'
        if 'epoch_' not in root:
            continue
        # Normalise separators so the parsing also works where os.sep is not '/'.
        normalized_root = root.replace(os.sep, '/')
        epoch_str = normalized_root.split('epoch_')[-1].split('/')[0].split('.')[0]
        try:
            epoch = int(epoch_str)
        except ValueError:
            continue
        # The epoch is a property of the directory, so it is parsed once per directory
        # and files in unrelated directories are never opened.
        for file in files:
            if file.endswith(".json"):
                metrics = extract_metrics(os.path.join(root, file))
                metrics['epoch'] = epoch
                all_metrics.append(metrics)

    if not all_metrics:
        # Keep the 'epoch' column so callers can still sort or select on it.
        return pd.DataFrame(columns=['epoch'])

    df = pd.DataFrame(all_metrics)
    metric_columns = [col for col in df.columns if col != 'epoch']
    return df.dropna(how='all', subset=metric_columns)

# Function to plot metrics
def plot_metrics(df, output_dir):
    """Save one line plot per metric column, with 'epoch' on the x axis.

    Columns that cannot be converted to float are skipped. Figures are written to
    ``output_dir`` as ``<column>_over_epochs.png`` and closed without being shown.

    Args:
        df: DataFrame with an 'epoch' column and one column per metric.
        output_dir: Directory the PNG files are written to.
    """
    for column in df.columns:
        if column == 'epoch':
            continue
        # Convert before creating the figure so a skipped column never leaves a figure open.
        try:
            values = df[column].astype(float)
        except (ValueError, TypeError):
            continue  # Skip plotting if conversion to float fails

        fig, ax = plt.subplots(figsize=(10, 6))
        try:
            ax.plot(df['epoch'], values, marker='o', linestyle='-')
            ax.set_xlabel('Epoch')
            ax.set_ylabel(column)
            ax.set_title(f'{column} over Epochs')
            ax.grid(True)
            # Path separators in a metric name would otherwise point into a non-existent subfolder.
            safe_name = str(column).replace('/', '_').replace(os.sep, '_')
            fig.savefig(os.path.join(output_dir, f'{safe_name}_over_epochs.png'))
        finally:
            plt.close(fig)

# Main function to run the analysis
def main(evaluation_results_dir, analysis_output_dir):
    """Gather evaluation metrics, save them sorted by epoch as CSV, and plot each metric.

    Args:
        evaluation_results_dir: Root directory searched for evaluation JSON files.
        analysis_output_dir: Directory receiving 'evaluation_metrics.csv' and the plots.
    """
    df_metrics = gather_metrics(evaluation_results_dir).sort_values(by='epoch')

    output_csv = os.path.join(analysis_output_dir, 'evaluation_metrics.csv')
    df_metrics.to_csv(output_csv, index=False)
    print(f"Metrics saved to {output_csv}")

    plot_metrics(df_metrics, analysis_output_dir)

# Entry point: aggregate and plot the evaluation results using the directories defined above.
if __name__ == "__main__":
    main(evaluation_results_dir, analysis_output_dir)
