# Weights Distances

In [1]:
######################## (70m) ############################

In [2]:
from tqdm import tqdm
from transformers import GPTNeoXForCausalLM, AutoTokenizer
import concurrent.futures
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns



# Tokenizer and reference checkpoint. "step143000" is also the last revision
# requested by the comparison loop further down (num_steps = 143).
tokenizer = AutoTokenizer.from_pretrained(
    "EleutherAI/pythia-70m-deduped",
    revision="step143000",
)
ref_model = GPTNeoXForCausalLM.from_pretrained(
    "EleutherAI/pythia-70m-deduped",
    revision="step143000",
)

# Name -> tensor mapping of the reference checkpoint; the comparison functions
# defined below read this notebook-level variable.
ref_model_params = ref_model.state_dict()

In [3]:
# Define the function to calculate layer distances
def calculate_layer_distances(step_idx, rev_id,
                              model_name="EleutherAI/pythia-70m-deduped",
                              ref_params=None, verbose=True):
    """Measure how far each tensor of one checkpoint is from the reference.

    Loads revision ``rev_id`` of ``model_name`` and, for every entry of its
    ``state_dict()``, computes ``(tensor - reference_tensor).norm()`` (the
    default ``Tensor.norm()``, taken over all elements of the difference).

    Args:
        step_idx: Index of the checkpoint in the caller's loop. It is not used
            in the computation; it is kept so existing calls keep working.
        rev_id: Revision string handed to ``from_pretrained`` (for example
            ``"step1000"``). It is also echoed in the printed lines.
        model_name: Hub id of the model to load. Defaults to the 70m model this
            notebook was written for.
        ref_params: Mapping of state-dict key -> reference tensor. When
            ``None`` (default) the notebook-level ``ref_model_params`` is used.
        verbose: When true (default) print one line per tensor. Pass ``False``
            to keep the cell output and the tqdm bar readable.

    Returns:
        dict mapping every state-dict key of the loaded checkpoint to its
        distance from the reference, as a Python float.

    Raises:
        KeyError: if a key of the loaded checkpoint is missing from the
            reference mapping.
    """
    if ref_params is None:
        # Backward-compatible default: the reference snapshot created near the
        # top of the notebook.
        ref_params = ref_model_params

    # Only the state dict is needed, so the model object itself is not bound
    # to a local name.
    model_params = GPTNeoXForCausalLM.from_pretrained(
        model_name, revision=rev_id
    ).state_dict()

    layer_dists = {}
    for layer, weights_model in model_params.items():
        euclidean_distance = (weights_model - ref_params[layer]).norm().item()
        layer_dists[layer] = euclidean_distance
        if verbose:
            print(f"Step {rev_id} Layer {layer}: Euclidean Distance = {euclidean_distance}")

    return layer_dists

In [8]:
# Define the function to calculate the rank of each layer's weight difference
def calculate_layer_diff_rank(step_idx, rev_id,
                              model_name="EleutherAI/pythia-70m-deduped",
                              ref_params=None, verbose=True):
    """Compute the rank of (reference - checkpoint) for every state-dict tensor.

    Loads revision ``rev_id`` of ``model_name`` and, for every entry of its
    ``state_dict()``, passes the difference to the same-named reference tensor
    to ``np.linalg.matrix_rank``.

    Note: NumPy documents that ``matrix_rank`` returns 1 for an input with
    fewer than two dimensions unless all of its elements are zero (then 0).
    For 1-D tensors the value therefore only says whether the tensor differs
    from the reference at all.

    Args:
        step_idx: Index of the checkpoint in the caller's loop. It is not used
            in the computation; it is kept so existing calls keep working.
        rev_id: Revision string handed to ``from_pretrained`` (for example
            ``"step1000"``). It is also echoed in the printed lines.
        model_name: Hub id of the model to load. Defaults to the 70m model this
            notebook was written for.
        ref_params: Mapping of state-dict key -> reference tensor. When
            ``None`` (default) the notebook-level ``ref_model_params`` is used.
        verbose: When true (default) print one line per tensor. Pass ``False``
            to keep the cell output and the tqdm bar readable.

    Returns:
        dict mapping every state-dict key of the loaded checkpoint to the rank
        reported by ``np.linalg.matrix_rank`` for its difference tensor.

    Raises:
        KeyError: if a key of the loaded checkpoint is missing from the
            reference mapping.
    """
    if ref_params is None:
        # Backward-compatible default: the reference snapshot created near the
        # top of the notebook.
        ref_params = ref_model_params

    # Only the state dict is needed, so the model object itself is not bound
    # to a local name.
    model_params = GPTNeoXForCausalLM.from_pretrained(
        model_name, revision=rev_id
    ).state_dict()

    rank_dict = {}
    for layer, weights_model in model_params.items():
        # Move both tensors to NumPy first, then subtract there.
        ref_array = ref_params[layer].cpu().detach().numpy()
        model_array = weights_model.cpu().detach().numpy()

        # Rank of the weight difference (not a distance).
        rank = np.linalg.matrix_rank(ref_array - model_array)

        rank_dict[layer] = rank
        if verbose:
            print(f"Step {rev_id} Layer {layer}: Rank = {rank}")

    return rank_dict

In [9]:
# Number of checkpoints to compare. The loop below requests one revision per
# step index: "step1000", "step2000", ..., up to step number 1000 * num_steps.
num_steps = 143  # Adjust this according to your actual number of steps

In [10]:
# Since the distance is mostly in the first and last layers, should we apply layer-wise learning rates?
# The overall "loss" is the sum of the individual layer losses,
# so the plots of layer-wise distances show which layers have a steep decrease in distance and which do not.
# Could this be used for layer-wise fine-tuning?

In [11]:

# step index -> {state-dict key -> rank of the weight difference}
weight_diff_ranks = {}

# Visit the checkpoints one after another (no ThreadPoolExecutor). Results are
# stored per iteration, so whatever was computed before an interruption stays
# available in `weight_diff_ranks`.
for step_idx in tqdm(range(num_steps)):
    revision_id = f"step{1000*(step_idx+1)}"
    weight_diff_ranks[step_idx] = calculate_layer_diff_rank(step_idx, revision_id)


  0%|                                                                                                                                                                                      | 0/143 [00:00<?, ?it/s]

Step step1000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step1000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step1000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step1000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step1000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step1000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step1000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step1000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 511
Step step1000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step1000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step1000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step1000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step1000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bias: Rank = 1
Step step1000 Layer gpt_neox.layers.1.input_layer

  1%|█▏                                                                                                                                                                            | 1/143 [00:04<09:39,  4.08s/it]

Step step1000 Layer embed_out.weight: Rank = 512
Step step2000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step2000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step2000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step2000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step2000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step2000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step2000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step2000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 511
Step step2000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step2000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step2000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step2000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step2000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bias: Rank = 1


  1%|██▍                                                                                                                                                                           | 2/143 [00:08<10:07,  4.31s/it]

Step step2000 Layer embed_out.weight: Rank = 512
Step step3000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step3000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step3000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step3000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step3000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step3000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step3000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step3000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step3000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step3000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step3000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step3000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step3000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bias: Rank = 1


  2%|███▋                                                                                                                                                                          | 3/143 [00:12<10:02,  4.30s/it]

Step step3000 Layer embed_out.weight: Rank = 512
Step step4000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step4000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step4000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step4000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step4000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step4000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step4000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step4000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step4000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step4000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step4000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step4000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step4000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bias: Rank = 1


  3%|████▊                                                                                                                                                                         | 4/143 [00:17<10:03,  4.34s/it]

Step step4000 Layer embed_out.weight: Rank = 512
Step step5000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step5000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step5000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step5000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step5000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step5000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step5000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step5000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step5000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step5000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step5000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step5000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step5000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bias: Rank = 1


  3%|██████                                                                                                                                                                        | 5/143 [00:21<10:00,  4.35s/it]

Step step5000 Layer embed_out.weight: Rank = 512
Step step6000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step6000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step6000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step6000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step6000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step6000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step6000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step6000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step6000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step6000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step6000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step6000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step6000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bias: Rank = 1


  4%|███████▎                                                                                                                                                                      | 6/143 [00:25<09:55,  4.35s/it]

Step step6000 Layer embed_out.weight: Rank = 512
Step step7000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step7000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step7000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step7000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step7000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step7000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step7000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step7000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step7000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step7000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step7000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step7000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step7000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bias: Rank = 1


  5%|████████▌                                                                                                                                                                     | 7/143 [00:30<10:00,  4.41s/it]

Step step7000 Layer embed_out.weight: Rank = 512
Step step8000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step8000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step8000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step8000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step8000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step8000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step8000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step8000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step8000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step8000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step8000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step8000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step8000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bias: Rank = 1


  6%|█████████▋                                                                                                                                                                    | 8/143 [00:35<10:18,  4.58s/it]

Step step8000 Layer embed_out.weight: Rank = 512
Step step9000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step9000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step9000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step9000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step9000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step9000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step9000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step9000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step9000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step9000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step9000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step9000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step9000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bias: Rank = 1


  6%|██████████▉                                                                                                                                                                   | 9/143 [00:39<10:09,  4.55s/it]

Step step9000 Layer embed_out.weight: Rank = 512
Step step10000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step10000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step10000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step10000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step10000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step10000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step10000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step10000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step10000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step10000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step10000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step10000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step10000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.bi

  7%|████████████                                                                                                                                                                 | 10/143 [00:44<09:48,  4.43s/it]

Step step10000 Layer embed_out.weight: Rank = 512
Step step11000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step11000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step11000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step11000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step11000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step11000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step11000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step11000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 511
Step step11000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step11000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step11000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step11000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step11000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

  8%|█████████████▎                                                                                                                                                               | 11/143 [00:48<09:38,  4.38s/it]

Step step11000 Layer embed_out.weight: Rank = 512
Step step12000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step12000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step12000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step12000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step12000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step12000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step12000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step12000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step12000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step12000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step12000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step12000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step12000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

  8%|██████████████▌                                                                                                                                                              | 12/143 [00:52<09:41,  4.44s/it]

Step step12000 Layer embed_out.weight: Rank = 512
Step step13000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step13000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step13000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step13000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step13000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step13000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step13000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step13000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step13000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step13000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step13000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step13000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step13000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

  9%|███████████████▋                                                                                                                                                             | 13/143 [00:57<09:32,  4.40s/it]

Step step13000 Layer embed_out.weight: Rank = 512
Step step14000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step14000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step14000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step14000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step14000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step14000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step14000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step14000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step14000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step14000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step14000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step14000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step14000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 10%|████████████████▉                                                                                                                                                            | 14/143 [01:03<10:28,  4.87s/it]

Step step14000 Layer embed_out.weight: Rank = 512
Step step15000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step15000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step15000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step15000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step15000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step15000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step15000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step15000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step15000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step15000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step15000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step15000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step15000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 10%|██████████████████▏                                                                                                                                                          | 15/143 [01:08<10:25,  4.89s/it]

Step step15000 Layer embed_out.weight: Rank = 512
Step step16000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step16000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step16000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step16000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step16000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step16000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step16000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step16000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step16000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step16000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step16000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step16000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step16000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 11%|███████████████████▎                                                                                                                                                         | 16/143 [01:12<10:02,  4.74s/it]

Step step16000 Layer embed_out.weight: Rank = 512
Step step17000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step17000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step17000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step17000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step17000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step17000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step17000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step17000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step17000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step17000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step17000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step17000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step17000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 12%|████████████████████▌                                                                                                                                                        | 17/143 [01:17<09:51,  4.69s/it]

Step step17000 Layer embed_out.weight: Rank = 512
Step step18000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step18000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step18000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step18000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step18000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step18000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step18000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step18000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 511
Step step18000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step18000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step18000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step18000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step18000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 13%|█████████████████████▊                                                                                                                                                       | 18/143 [01:21<09:47,  4.70s/it]

Step step18000 Layer embed_out.weight: Rank = 512
Step step19000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step19000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step19000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step19000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step19000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step19000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step19000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step19000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step19000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step19000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step19000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step19000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step19000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 13%|██████████████████████▉                                                                                                                                                      | 19/143 [01:26<09:27,  4.57s/it]

Step step19000 Layer embed_out.weight: Rank = 512
Step step20000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step20000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step20000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step20000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step20000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step20000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step20000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step20000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step20000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step20000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step20000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step20000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step20000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 14%|████████████████████████▏                                                                                                                                                    | 20/143 [01:30<09:25,  4.60s/it]

Step step20000 Layer embed_out.weight: Rank = 512
Step step21000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step21000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step21000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step21000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step21000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step21000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step21000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step21000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 511
Step step21000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step21000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step21000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step21000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step21000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 15%|█████████████████████████▍                                                                                                                                                   | 21/143 [01:35<09:24,  4.63s/it]

Step step21000 Layer embed_out.weight: Rank = 512
Step step22000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step22000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step22000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step22000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step22000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step22000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step22000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step22000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step22000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step22000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step22000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step22000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step22000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 15%|██████████████████████████▌                                                                                                                                                  | 22/143 [01:39<09:12,  4.56s/it]

Step step22000 Layer embed_out.weight: Rank = 512
Step step23000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step23000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step23000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step23000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step23000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step23000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step23000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step23000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 511
Step step23000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step23000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step23000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step23000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step23000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 16%|███████████████████████████▊                                                                                                                                                 | 23/143 [01:44<08:58,  4.48s/it]

Step step23000 Layer embed_out.weight: Rank = 512
Step step24000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step24000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step24000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step24000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step24000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step24000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step24000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step24000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step24000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step24000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step24000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step24000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step24000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 17%|█████████████████████████████                                                                                                                                                | 24/143 [01:48<08:50,  4.45s/it]

Step step24000 Layer embed_out.weight: Rank = 512
Step step25000 Layer gpt_neox.embed_in.weight: Rank = 512
Step step25000 Layer gpt_neox.layers.0.input_layernorm.weight: Rank = 1
Step step25000 Layer gpt_neox.layers.0.input_layernorm.bias: Rank = 1
Step step25000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Rank = 1
Step step25000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Rank = 1
Step step25000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Rank = 512
Step step25000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Rank = 1
Step step25000 Layer gpt_neox.layers.0.attention.dense.weight: Rank = 512
Step step25000 Layer gpt_neox.layers.0.attention.dense.bias: Rank = 1
Step step25000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Rank = 512
Step step25000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.bias: Rank = 1
Step step25000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.weight: Rank = 512
Step step25000 Layer gpt_neox.layers.0.mlp.dense_4h_to_h.b

 17%|██████████████████████████████▏                                                                                                                                              | 25/143 [01:53<08:48,  4.48s/it]

Step step25000 Layer embed_out.weight: Rank = 512


 17%|██████████████████████████████▏                                                                                                                                              | 25/143 [01:54<09:01,  4.59s/it]


KeyboardInterrupt: 

In [None]:
# Serialize the collected per-step results (`step_dists`) to disk in binary mode.
with open('pythia_70m_layer_diff_rank.pickle', mode='wb') as pickle_out:
    pickle.dump(step_dists, pickle_out)

In [None]:
# Distance from the last epoch's layer weights vs. the current epoch's

In [None]:
# Flatten the nested mapping in `step_dists` (epoch -> layer -> value)
# into one record per (epoch, layer) pair.
plot_data = []
for epoch, layers in step_dists.items():
    plot_data.extend(
        {'Epoch': int(epoch), 'Layer': layer, 'Distance': distance}
        for layer, distance in layers.items()
    )

# Build the long-format frame and order its rows by epoch.
df = pd.DataFrame(plot_data).sort_values(by='Epoch')

# Plot style and a large canvas (there is one line per layer).
sns.set(style="whitegrid")
plt.figure(figsize=(30, 16))

# One line per key of the reference model's state dict; keys containing
# "rotary_emb.inv_freq" are skipped.
for layer in ref_model_params:
    if "rotary_emb.inv_freq" in layer:
        continue
    layer_rows = df[df['Layer'] == layer]
    sns.lineplot(data=layer_rows, x='Epoch', y='Distance', label=layer)

# Title, axis labels, and a legend anchored outside the axes on the right.
plt.title('Distance by Layer over Different Epochs')
plt.xlabel('Epoch')
plt.ylabel('Distance')
plt.legend(title='Layers', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

In [None]:
# Distance from the last epoch's layer weights vs. the current epoch's (log-scale)

In [None]:

# Same style and canvas size as the linear-scale figure.
sns.set(style="whitegrid")
plt.figure(figsize=(30, 16))

# Collects, per plotted layer, the slope of a degree-1 least-squares fit
# of Distance against Epoch (fitted on the raw values, not their logs).
slope_data = {'Layer': [], 'Slope': []}

# One line per key of the reference model's state dict, leaving out keys
# that contain "rotary_emb.inv_freq" or "bias".
for layer in ref_model_params:
    if "rotary_emb.inv_freq" in layer or "bias" in layer:
        continue

    layer_data = df[df['Layer'] == layer]
    sns.lineplot(data=layer_data, x='Epoch', y='Distance', label=layer)

    # np.polyfit returns coefficients highest degree first: (slope, intercept).
    x = layer_data['Epoch']
    y = layer_data['Distance']
    slope, intercept = np.polyfit(x, y, deg=1)

    # Record the fitted slope next to the layer it belongs to.
    slope_data['Layer'].append(layer)
    slope_data['Slope'].append(slope)

# Title, axis labels, logarithmic y-axis, and a legend outside the axes.
plt.title('Distance by Layer over Different Epochs')
plt.xlabel('Epoch')
plt.ylabel('Distance (log scale)')
plt.yscale('log')
plt.legend(title='Layers', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

In [None]:
# Turn the collected {'Layer': [...], 'Slope': [...]} lists into a two-column table.
slope_df = pd.DataFrame(data=slope_data)

# Print a heading line followed by the table's text representation.
print("Slope Data:", slope_df, sep="\n")