# Weights Distances

In [1]:
######################## (70m) ############################

In [2]:
from tqdm import tqdm
from transformers import GPTNeoXForCausalLM, AutoTokenizer
import concurrent.futures
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pickle
import seaborn as sns



# Tokenizer taken from the same hub repository and revision as the reference model.
tokenizer = AutoTokenizer.from_pretrained(
    "EleutherAI/pythia-70m-deduped",
    revision="step143000",
)

# Reference checkpoint: every other checkpoint is compared against these weights.
ref_model = GPTNeoXForCausalLM.from_pretrained(
    "EleutherAI/pythia-70m-deduped",
    revision="step143000",
)

# Name -> tensor mapping of the reference model, looked up by calculate_layer_distances.
ref_model_params = ref_model.state_dict()

In [8]:
# Define the function to calculate layer distances
def calculate_layer_distances(step_idx, rev_id, verbose=True):
    """Measure how far one training checkpoint is from the reference model.

    Loads revision ``rev_id`` of ``EleutherAI/pythia-70m-deduped`` and, for every
    entry of its ``state_dict``, computes the Euclidean (L2) norm of the
    difference to the same-named entry of the module-level ``ref_model_params``.

    Args:
        step_idx: Index of the checkpoint. Not used by the computation; kept so
            that existing callers which pass it keep working.
        rev_id: Hub revision to load, e.g. ``"step1000"``.
        verbose: When true (the default, matching the previous behaviour), print
            one line per state-dict entry. Pass ``False`` to keep the notebook
            output small when many checkpoints are processed in parallel.

    Returns:
        dict mapping each state-dict key to its distance as a Python float.

    Raises:
        KeyError: If the checkpoint contains a key that is missing from
            ``ref_model_params``.
    """
    model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/pythia-70m-deduped", revision=rev_id)
    model_params = model.state_dict()

    layer_dists = {}

    for layer, weights_model in model_params.items():
        weights_ref_model = ref_model_params[layer]

        # Subtract in float32: norm() is only defined for floating/complex
        # tensors, so this keeps non-float entries (if the state dict has any)
        # from raising, and avoids precision loss for half-precision weights.
        # For float32 weights the cast is a no-op.
        difference = weights_model.float() - weights_ref_model.float()

        # Calculate Euclidean distance (you can replace this with other metrics)
        euclidean_distance = difference.norm().item()
        layer_dists[layer] = euclidean_distance

        if verbose:
            print(f"Step {rev_id} Layer {layer}: Euclidean Distance = {euclidean_distance}")

    return layer_dists

# Number of training checkpoints to compare against the reference model.
# Checkpoint index i (0-based) is requested as revision "step{1000*(i+1)}", so
# 143 covers step1000 through step143000; the last of these is the same
# revision that is loaded as the reference model.
num_steps = 143  # Adjust this according to your actual number of steps

In [9]:
# Since most of the distance is concentrated in the first and last layers, should we apply layer-wise learning rates?
# The overall "loss" is the sum of the individual layer losses,
# so the plots of layer-wise distances show which layers have a steep decrease in distance and which do not.
# Could this be used for layer-wise fine-tuning?

In [None]:
# Per-checkpoint results: step index -> {state-dict key -> distance to reference}.
step_dists = {}

with concurrent.futures.ThreadPoolExecutor() as executor:
    # One task per checkpoint; each future is mapped back to the step index it
    # was submitted for so the result can be filed under the right key.
    futures = {
        executor.submit(calculate_layer_distances, step_idx, f"step{1000*(step_idx+1)}"): step_idx
        for step_idx in range(num_steps)
    }

    # as_completed yields futures in completion order, not submission order.
    for future in tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
        # result() re-raises any exception that occurred inside the worker.
        step_dists[futures[future]] = future.result()

# Re-key in ascending step order: the loop above inserts in whatever order the
# workers happened to finish, which would otherwise leak into any later code
# that iterates over step_dists (tables, plots).
step_dists = dict(sorted(step_dists.items()))

  0%|                                                                                                                                                                                      | 0/143 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  1%|█▏                                                                                                                                                                          | 1/143 [01:37<3:51:18, 97.73s/it]

Step step13000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 273.5267639160156
Step step13000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.015206336975098
Step step13000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 4.489306926727295
Step step13000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.102649211883545
Step step13000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.2717159986495972
Step step13000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 30.943649291992188
Step step13000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 74.43865203857422
Step step13000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.900260925292969
Step step13000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.6547363996505737
Step step13000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  1%|██▍                                                                                                                                                                         | 2/143 [01:47<1:48:20, 46.10s/it]

Step step16000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 272.931396484375
Step step16000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.528091907501221
Step step16000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 4.16194486618042
Step step16000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.1550586223602295
Step step16000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.2017978429794312
Step step16000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 29.971752166748047
Step step16000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 73.09539031982422
Step step16000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.505831718444824
Step step16000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.5466543436050415
Step step16000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: 

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  2%|███▌                                                                                                                                                                        | 3/143 [02:21<1:34:55, 40.68s/it]

Step step6000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 254.92550659179688
Step step6000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 2.4641687870025635
Step step6000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 5.491636753082275
Step step6000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 5.758944988250732
Step step6000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.3270764350891113
Step step6000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 33.86932373046875
Step step6000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 77.47471618652344
Step step6000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 14.0899019241333
Step step6000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 2.013460636138916
Step step6000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Euclidean D

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  3%|████▊                                                                                                                                                                       | 4/143 [02:30<1:04:28, 27.83s/it]

Step step14000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 273.647216796875
Step step14000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.211747646331787
Step step14000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 4.375204086303711
Step step14000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.070387840270996
Step step14000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.2492040395736694
Step step14000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 30.590288162231445
Step step14000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 74.00431823730469
Step step14000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.753132820129395
Step step14000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.6174139976501465
Step step14000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: 

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  3%|██████                                                                                                                                                                        | 5/143 [02:38<47:42, 20.74s/it]

Step step15000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 273.4178466796875
Step step15000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.370882511138916
Step step15000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 4.26692533493042
Step step15000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.0970699787139893
Step step15000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.2263561487197876
Step step15000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 30.28978729248047
Step step15000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 73.53746795654297
Step step15000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.636945724487305
Step step15000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.5811591148376465
Step step15000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: 

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  4%|███████▎                                                                                                                                                                      | 6/143 [02:40<33:07, 14.51s/it]

Step step11000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 272.0161437988281
Step step11000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.6089377403259277
Step step11000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 4.726040363311768
Step step11000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.445899486541748
Step step11000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.3112047910690308
Step step11000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 31.622203826904297
Step step11000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 75.31324768066406
Step step11000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 14.097148895263672
Step step11000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.7356832027435303
Step step11000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

  5%|████████▌                                                                                                                                                                     | 7/143 [02:42<23:29, 10.37s/it]

Step step7000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 260.58135986328125
Step step7000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 2.696251153945923
Step step7000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 5.304899215698242
Step step7000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 4.785677909851074
Step step7000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.3381812572479248
Step step7000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 33.367698669433594
Step step7000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 77.01908111572266
Step step7000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 14.195975303649902
Step step7000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.942922592163086
Step step7000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Euclidean

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  6%|█████████▋                                                                                                                                                                    | 8/143 [02:43<16:51,  7.49s/it]

Step step3000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 226.36756896972656
Step step3000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 1.831499695777893
Step step3000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 6.367600917816162
Step step3000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 9.404569625854492
Step step3000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.265354037284851
Step step3000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 35.69472122192383
Step step3000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 79.25436401367188
Step step3000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.221075057983398
Step step3000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 2.2788774967193604
Step step3000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Euclidean 

  6%|██████████▉                                                                                                                                                                   | 9/143 [02:45<12:21,  5.54s/it]

Step step9000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 268.0849304199219
Step step9000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.151726722717285
Step step9000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 4.992898464202881
Step step9000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 3.315944194793701
Step step9000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.33623206615448
Step step9000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 32.41864013671875
Step step9000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 76.14226531982422
Step step9000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 14.231165885925293
Step step9000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.8287090063095093
Step step9000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Euclidean Di

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  7%|████████████                                                                                                                                                                 | 10/143 [02:49<11:38,  5.25s/it]

Step step1000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 201.6257781982422
Step step1000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 1.7775578498840332
Step step1000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 7.211513519287109
Step step1000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 11.064237594604492
Step step1000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.2760734558105469
Step step1000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 36.84406280517578
Step step1000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 81.37568664550781
Step step1000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 12.357338905334473
Step step1000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 2.4261786937713623
Step step1000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Euclidea

  8%|█████████████▎                                                                                                                                                               | 11/143 [02:51<09:11,  4.18s/it]

Step step4000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 238.13278198242188
Step step4000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 2.01214599609375
Step step4000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 5.988559722900391
Step step4000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 8.098560333251953
Step step4000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.2855143547058105
Step step4000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 34.988731384277344
Step step4000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 78.55138397216797
Step step4000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.646321296691895
Step step4000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 2.183746576309204
Step step4000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Euclidean 

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  8%|██████████████▌                                                                                                                                                              | 12/143 [02:52<07:02,  3.23s/it]

Step step12000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 273.0096740722656
Step step12000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.826784133911133
Step step12000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 4.604235649108887
Step step12000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.2182929515838623
Step step12000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.2924445867538452
Step step12000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 31.277673721313477
Step step12000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 74.87709045410156
Step step12000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 14.003046989440918
Step step12000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.6943747997283936
Step step12000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

  9%|███████████████▋                                                                                                                                                             | 13/143 [02:55<06:56,  3.20s/it]

Step step2000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 212.15289306640625
Step step2000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 1.7648742198944092
Step step2000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 6.963913440704346
Step step2000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 10.62486457824707
Step step2000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.2564787864685059
Step step2000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 36.52656555175781
Step step2000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 80.22220611572266
Step step2000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 12.700386047363281
Step step2000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 2.3624768257141113
Step step2000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Euclidea

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 10%|████████████████▉                                                                                                                                                            | 14/143 [03:05<11:24,  5.30s/it]

Step step17000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 272.23858642578125
Step step17000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.669159889221191
Step step17000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 4.059601306915283
Step step17000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.2302405834198
Step step17000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.1778943538665771
Step step17000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 29.68033790588379
Step step17000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 72.63678741455078
Step step17000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.376591682434082
Step step17000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.5139029026031494
Step step17000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: E

 10%|██████████████████▏                                                                                                                                                          | 15/143 [03:06<08:28,  3.97s/it]

Step step5000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 247.5482940673828
Step step5000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 2.244213342666626
Step step5000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 5.71243143081665
Step step5000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 6.867464065551758
Step step5000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.3078899383544922
Step step5000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 34.40169906616211
Step step5000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 77.98627471923828
Step step5000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.919836044311523
Step step5000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 2.0934317111968994
Step step5000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Euclidean D

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 11%|███████████████████▎                                                                                                                                                         | 16/143 [03:15<11:11,  5.29s/it]

Step step8000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 264.8728332519531
Step step8000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 2.925847053527832
Step step8000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 5.14096736907959
Step step8000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 3.973113536834717
Step step8000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.3409450054168701
Step step8000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 32.895286560058594
Step step8000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 76.58257293701172
Step step8000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 14.249476432800293
Step step8000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.8825150728225708
Step step8000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Euclidean 

 12%|████████████████████▌                                                                                                                                                        | 17/143 [03:16<08:50,  4.21s/it]

Step step10000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 270.4158020019531
Step step10000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.389859199523926
Step step10000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 4.856212615966797
Step step10000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.8023955821990967
Step step10000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.3261995315551758
Step step10000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 31.99799156188965
Step step10000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 75.72787475585938
Step step10000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 14.168938636779785
Step step10000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.7807031869888306
Step step10000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 13%|█████████████████████▊                                                                                                                                                       | 18/143 [04:22<47:32, 22.82s/it]

Step step25000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 261.9300231933594
Step step25000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.2896728515625
Step step25000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.3612349033355713
Step step25000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.7235167026519775
Step step25000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.9853634834289551
Step step25000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 27.320138931274414
Step step25000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 67.69708251953125
Step step25000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 12.248025894165039
Step step25000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.2753169536590576
Step step25000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 13%|██████████████████████▉                                                                                                                                                      | 19/143 [04:25<34:22, 16.63s/it]

Step step18000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 271.3516845703125
Step step18000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.793742656707764
Step step18000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.9617934226989746
Step step18000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.316472291946411
Step step18000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.1530137062072754
Step step18000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 29.381855010986328
Step step18000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 72.18702697753906
Step step18000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.239431381225586
Step step18000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.4812541007995605
Step step18000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 14%|████████████████████████▏                                                                                                                                                    | 20/143 [05:10<52:04, 25.40s/it]

Step step20000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 269.1359558105469
Step step20000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.993687152862549
Step step20000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.7755582332611084
Step step20000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.4719321727752686
Step step20000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.1039505004882812
Step step20000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 28.783954620361328
Step step20000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 71.24905395507812
Step step20000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 12.9540433883667
Step step20000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.4198883771896362
Step step20000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 15%|█████████████████████████▍                                                                                                                                                   | 21/143 [05:25<45:11, 22.23s/it]

Step step19000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 270.2980041503906
Step step19000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.900063991546631
Step step19000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.8673055171966553
Step step19000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.3971550464630127
Step step19000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.1280032396316528
Step step19000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 29.067062377929688
Step step19000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 71.74176788330078
Step step19000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 13.090481758117676
Step step19000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.4504988193511963
Step step19000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 16%|███████████████████████████▊                                                                                                                                                 | 23/143 [05:31<26:40, 13.34s/it]

Step step30000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 253.30612182617188
Step step30000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.404611110687256
Step step30000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.001973867416382
Step step30000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.850205183029175
Step step30000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.8819316029548645
Step step30000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 25.8724365234375
Step step30000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 62.41487503051758
Step step30000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 11.532169342041016
Step step30000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.1481235027313232
Step step30000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: 

 17%|█████████████████████████████                                                                                                                                                | 24/143 [05:33<20:35, 10.38s/it]

Step step24000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 263.51007080078125
Step step24000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.244401454925537
Step step24000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.4394466876983643
Step step24000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.6839911937713623
Step step24000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.008199691772461
Step step24000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 27.607952117919922
Step step24000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 68.41779327392578
Step step24000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 12.391595840454102
Step step24000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.3027302026748657
Step step24000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 17%|██████████████████████████████▏                                                                                                                                              | 25/143 [05:33<15:22,  7.82s/it]

Step step21000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 267.8431701660156
Step step21000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.080574989318848
Step step21000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.6867101192474365
Step step21000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.542992353439331
Step step21000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.0797662734985352
Step step21000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 28.500778198242188
Step step21000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 70.75563049316406
Step step21000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 12.82263469696045
Step step21000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.3904674053192139
Step step21000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 18%|███████████████████████████████▍                                                                                                                                             | 26/143 [05:35<11:58,  6.14s/it]

Step step27000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 258.6062927246094
Step step27000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.359560489654541
Step step27000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.211942195892334
Step step27000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.7790002822875977
Step step27000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.9428645968437195
Step step27000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 26.76396942138672
Step step27000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 65.65452575683594
Step step27000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 11.962268829345703
Step step27000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.2230526208877563
Step step27000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 19%|████████████████████████████████▋                                                                                                                                            | 27/143 [05:37<09:32,  4.93s/it]

Step step22000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 266.47332763671875
Step step22000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.140028953552246
Step step22000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.5998549461364746
Step step22000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.594362735748291
Step step22000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.0561892986297607
Step step22000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 28.229799270629883
Step step22000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 70.27970123291016
Step step22000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 12.678995132446289
Step step22000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.3605799674987793
Step step22000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

 20%|█████████████████████████████████▊                                                                                                                                           | 28/143 [05:39<07:46,  4.06s/it]

Step step26000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 260.2942810058594
Step step26000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.324553489685059
Step step26000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.2837817668914795
Step step26000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.7636210918426514
Step step26000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.962169349193573
Step step26000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 27.012954711914062
Step step26000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 66.69368743896484
Step step26000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 12.092212677001953
Step step26000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.2483584880828857
Step step26000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 20%|███████████████████████████████████                                                                                                                                          | 29/143 [05:40<06:01,  3.17s/it]

Step step23000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 265.0376892089844
Step step23000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.187092304229736
Step step23000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.519047737121582
Step step23000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.6271212100982666
Step step23000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 1.0304350852966309
Step step23000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 27.922332763671875
Step step23000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 69.34770965576172
Step step23000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 12.545108795166016
Step step23000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.3305325508117676
Step step23000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 21%|████████████████████████████████████▎                                                                                                                                        | 30/143 [05:49<09:22,  4.97s/it]

Step step28000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 256.8913879394531
Step step28000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.380966663360596
Step step28000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 3.1381595134735107
Step step28000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.8151135444641113
Step step28000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.9217899441719055
Step step28000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 26.416011810302734
Step step28000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 64.56440734863281
Step step28000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 11.789510726928711
Step step28000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.1965739727020264
Step step28000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 22%|█████████████████████████████████████▌                                                                                                                                       | 31/143 [06:04<14:55,  8.00s/it]

Step step31000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 251.4734344482422
Step step31000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.4158735275268555
Step step31000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.9377193450927734
Step step31000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.8511435985565186
Step step31000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.8639815449714661
Step step31000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 25.617786407470703
Step step31000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 61.403743743896484
Step step31000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 11.405319213867188
Step step31000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.125105857849121
Step step31000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weig

 22%|██████████████████████████████████████▋                                                                                                                                      | 32/143 [06:04<10:35,  5.72s/it]

Step step34000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 245.79771423339844
Step step34000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.4008612632751465
Step step34000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.7531063556671143
Step step34000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.8700616359710693
Step step34000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.8112499713897705
Step step34000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 24.673288345336914
Step step34000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 58.27558898925781
Step step34000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 10.913117408752441
Step step34000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.0542707443237305
Step step34000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.wei

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 23%|███████████████████████████████████████▉                                                                                                                                     | 33/143 [06:08<09:14,  5.04s/it]

Step step33000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 247.70790100097656
Step step33000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.415881156921387
Step step33000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.8121261596679688
Step step33000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.8705942630767822
Step step33000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.829046905040741
Step step33000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 24.967912673950195
Step step33000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 59.428775787353516
Step step33000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 11.054452896118164
Step step33000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.0770915746688843
Step step33000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weig

 24%|█████████████████████████████████████████▏                                                                                                                                   | 34/143 [06:10<07:35,  4.18s/it]

Step step32000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 249.60662841796875
Step step32000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.417267799377441
Step step32000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.8730623722076416
Step step32000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.863271474838257
Step step32000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.8465997576713562
Step step32000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 25.29424476623535
Step step32000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 60.357269287109375
Step step32000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 11.227696418762207
Step step32000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.101446509361267
Step step32000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 24%|██████████████████████████████████████████▎                                                                                                                                  | 35/143 [06:52<28:07, 15.62s/it]

Step step39000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 235.8982391357422
Step step39000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.289036273956299
Step step39000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.48199725151062
Step step39000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.831489324569702
Step step39000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.7392817139625549
Step step39000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 23.17606544494629
Step step39000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 52.65448760986328
Step step39000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 10.18405532836914
Step step39000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.9364778399467468
Step step39000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Eu

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 25%|███████████████████████████████████████████▌                                                                                                                                 | 36/143 [07:16<32:17, 18.11s/it]

Step step35000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 243.8506622314453
Step step35000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.395422458648682
Step step35000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.6966278553009033
Step step35000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.863881826400757
Step step35000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.7979017496109009
Step step35000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 24.397008895874023
Step step35000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 57.44918441772461
Step step35000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 10.786169052124023
Step step35000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.0314704179763794
Step step35000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 26%|████████████████████████████████████████████▊                                                                                                                                | 37/143 [07:56<43:15, 24.48s/it]

Step step46000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 220.9480743408203
Step step46000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.829555988311768
Step step46000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.191307783126831
Step step46000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.522005319595337
Step step46000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.6483757495880127
Step step46000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 20.87925910949707
Step step46000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 41.4006233215332
Step step46000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 9.171358108520508
Step step46000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.7720775008201599
Step step46000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Eu

 27%|█████████████████████████████████████████████▉                                                                                                                               | 38/143 [07:57<30:29, 17.43s/it]

Step step45000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 223.1477508544922
Step step45000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.853240013122559
Step step45000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.234051465988159
Step step45000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.4627742767333984
Step step45000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.663432776927948
Step step45000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 21.263540267944336
Step step45000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 43.51971435546875
Step step45000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 9.368280410766602
Step step45000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.7941614389419556
Step step45000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: 

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 27%|███████████████████████████████████████████████▏                                                                                                                             | 39/143 [07:59<22:23, 12.92s/it]

Step step36000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 241.9014129638672
Step step36000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.3730998039245605
Step step36000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.6403968334198
Step step36000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.8612923622131348
Step step36000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.7821374535560608
Step step36000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 24.088516235351562
Step step36000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 56.245487213134766
Step step36000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 10.617091178894043
Step step36000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 1.007311463356018
Step step36000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 28%|████████████████████████████████████████████████▍                                                                                                                            | 40/143 [08:07<19:21, 11.27s/it]

Step step40000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 233.8445587158203
Step step40000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.253170490264893
Step step40000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.4318532943725586
Step step40000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.8092312812805176
Step step40000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.7256564497947693
Step step40000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 22.876461029052734
Step step40000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 51.44121551513672
Step step40000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 10.055418014526367
Step step40000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.912468433380127
Step step40000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 29%|█████████████████████████████████████████████████▌                                                                                                                           | 41/143 [08:12<16:05,  9.47s/it]

Step step38000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 237.91842651367188
Step step38000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.318467617034912
Step step38000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.5334694385528564
Step step38000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.8475680351257324
Step step38000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.7537336945533752
Step step38000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 23.486568450927734
Step step38000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 53.851524353027344
Step step38000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 10.327042579650879
Step step38000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.9596667885780334
Step step38000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.wei

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 29%|██████████████████████████████████████████████████▊                                                                                                                          | 42/143 [08:15<12:58,  7.71s/it]

Step step49000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 214.26397705078125
Step step49000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.700056552886963
Step step49000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.0666470527648926
Step step49000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.502514362335205
Step step49000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.6177554726600647
Step step49000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 19.98246192932129
Step step49000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 35.95005416870117
Step step49000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 8.744438171386719
Step step49000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.7079238891601562
Step step49000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 30%|████████████████████████████████████████████████████                                                                                                                         | 43/143 [08:17<09:56,  5.97s/it]

Step step37000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 239.91561889648438
Step step37000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.351335048675537
Step step37000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.5874719619750977
Step step37000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.8562285900115967
Step step37000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.7669726014137268
Step step37000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 23.82452392578125
Step step37000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 54.99732208251953
Step step37000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 10.501977920532227
Step step37000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.9843721389770508
Step step37000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 31%|█████████████████████████████████████████████████████▏                                                                                                                       | 44/143 [08:30<13:03,  7.91s/it]

Step step43000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 227.6217803955078
Step step43000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.110168933868408
Step step43000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.2917351722717285
Step step43000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.7367401123046875
Step step43000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.6889550685882568
Step step43000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 21.96359634399414
Step step43000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 47.782501220703125
Step step43000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 9.619226455688477
Step step43000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.8470319509506226
Step step43000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 31%|██████████████████████████████████████████████████████▍                                                                                                                      | 45/143 [08:33<10:36,  6.49s/it]

Step step41000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 231.7830352783203
Step step41000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.205078125
Step step41000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.3840177059173584
Step step41000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.7905185222625732
Step step41000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.7119682431221008
Step step41000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 22.598234176635742
Step step41000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 50.24806594848633
Step step41000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 9.912712097167969
Step step41000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.8910467028617859
Step step41000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: Eucl

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 32%|███████████████████████████████████████████████████████▋                                                                                                                     | 46/143 [08:38<09:43,  6.02s/it]

Step step44000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 225.46546936035156
Step step44000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.006383419036865
Step step44000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.2544548511505127
Step step44000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.6294679641723633
Step step44000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.6794352531433105
Step step44000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 21.665475845336914
Step step44000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 45.98360061645508
Step step44000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 9.520933151245117
Step step44000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.8252177834510803
Step step44000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

 33%|████████████████████████████████████████████████████████▊                                                                                                                    | 47/143 [08:38<06:52,  4.30s/it]

Step step42000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 229.71168518066406
Step step42000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 5.156898498535156
Step step42000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.339120388031006
Step step42000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.766491651535034
Step step42000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.7000927925109863
Step step42000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 22.27313804626465
Step step42000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 49.03482437133789
Step step42000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 9.761330604553223
Step step42000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.8687038421630859
Step step42000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: 

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 34%|██████████████████████████████████████████████████████████                                                                                                                   | 48/143 [08:53<11:42,  7.40s/it]

Step step47000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 218.7469482421875
Step step47000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.805037975311279
Step step47000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.1476497650146484
Step step47000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.550614595413208
Step step47000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.6377701163291931
Step step47000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 20.607004165649414
Step step47000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 40.117774963378906
Step step47000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 9.056031227111816
Step step47000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.751787543296814
Step step47000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

 34%|███████████████████████████████████████████████████████████▎                                                                                                                 | 49/143 [08:53<08:20,  5.33s/it]

Step step52000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 207.38473510742188
Step step52000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.57354211807251
Step step52000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.9414705038070679
Step step52000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.4348065853118896
Step step52000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5885534286499023
Step step52000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 19.16609001159668
Step step52000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 31.711271286010742
Step step52000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 8.35565185546875
Step step52000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.6502982974052429
Step step52000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 35%|████████████████████████████████████████████████████████████▍                                                                                                                | 50/143 [08:58<07:50,  5.06s/it]

Step step48000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 216.5559844970703
Step step48000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.726113319396973
Step step48000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.1087770462036133
Step step48000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.4846906661987305
Step step48000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.6307647228240967
Step step48000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 20.325153350830078
Step step48000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 38.12864303588867
Step step48000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 8.906048774719238
Step step48000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.7301028370857239
Step step48000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 36%|█████████████████████████████████████████████████████████████▋                                                                                                               | 51/143 [09:06<09:10,  5.99s/it]

Step step50000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 211.96055603027344
Step step50000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.67022705078125
Step step50000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 2.0211422443389893
Step step50000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.5016167163848877
Step step50000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.6082541346549988
Step step50000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 19.679479598999023
Step step50000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 34.56282424926758
Step step50000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 8.588507652282715
Step step50000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.6873337626457214
Step step50000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 36%|██████████████████████████████████████████████████████████████▉                                                                                                              | 52/143 [09:16<10:55,  7.21s/it]

Step step62000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 183.33412170410156
Step step62000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.046189308166504
Step step62000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.6054399013519287
Step step62000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.0628859996795654
Step step62000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5106404423713684
Step step62000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 16.60003662109375
Step step62000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 18.126554489135742
Step step62000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.222997665405273
Step step62000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.4874851703643799
Step step62000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 37%|████████████████████████████████████████████████████████████████                                                                                                             | 53/143 [09:46<21:12, 14.14s/it]

Step step51000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 209.7025604248047
Step step51000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.622747898101807
Step step51000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.9820606708526611
Step step51000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.4593636989593506
Step step51000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5965280532836914
Step step51000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 19.473546981811523
Step step51000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 33.164676666259766
Step step51000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 8.505826950073242
Step step51000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.6701861023902893
Step step51000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 38%|█████████████████████████████████████████████████████████████████▎                                                                                                           | 54/143 [10:26<32:22, 21.83s/it]

Step step58000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 193.09307861328125
Step step58000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.274082183837891
Step step58000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.7336387634277344
Step step58000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.2181472778320312
Step step58000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5389279127120972
Step step58000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 17.623720169067383
Step step58000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 23.20814323425293
Step step58000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.6689324378967285
Step step58000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.5466067790985107
Step step58000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weig

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 38%|██████████████████████████████████████████████████████████████████▌                                                                                                          | 55/143 [10:35<26:18, 17.94s/it]

Step step57000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 195.532958984375
Step step57000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.332051753997803
Step step57000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.764184832572937
Step step57000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.2549550533294678
Step step57000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5458176732063293
Step step57000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 17.85057830810547
Step step57000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 24.492841720581055
Step step57000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.777557373046875
Step step57000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.5612496137619019
Step step57000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: 

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 39%|███████████████████████████████████████████████████████████████████▋                                                                                                         | 56/143 [10:39<20:07, 13.88s/it]

Step step53000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 205.04672241210938
Step step53000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.542241096496582
Step step53000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.9066035747528076
Step step53000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.4107842445373535
Step step53000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.57902991771698
Step step53000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 18.931659698486328
Step step53000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 30.203224182128906
Step step53000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 8.253536224365234
Step step53000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.6339253187179565
Step step53000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 40%|████████████████████████████████████████████████████████████████████▉                                                                                                        | 57/143 [10:42<14:56, 10.42s/it]

Step step55000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 200.38197326660156
Step step55000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.453700065612793
Step step55000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.832516074180603
Step step55000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.354848623275757
Step step55000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5612690448760986
Step step55000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 18.378786087036133
Step step55000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 27.595012664794922
Step step55000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.986347675323486
Step step55000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.5979360938072205
Step step55000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

 41%|██████████████████████████████████████████████████████████████████████▏                                                                                                      | 58/143 [10:43<10:57,  7.74s/it]

Step step54000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 202.74456787109375
Step step54000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.494204521179199
Step step54000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.8687443733215332
Step step54000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.3789114952087402
Step step54000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5723295211791992
Step step54000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 18.65452003479004
Step step54000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 29.012418746948242
Step step54000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 8.118690490722656
Step step54000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.6152997612953186
Step step54000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 41%|███████████████████████████████████████████████████████████████████████▍                                                                                                     | 59/143 [10:52<11:17,  8.07s/it]

Step step56000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 197.9805450439453
Step step56000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.385384559631348
Step step56000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.7999951839447021
Step step56000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.2855064868927
Step step56000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5563716888427734
Step step56000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 18.121572494506836
Step step56000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 26.03289222717285
Step step56000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.894290924072266
Step step56000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.5785292983055115
Step step56000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: E

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 42%|████████████████████████████████████████████████████████████████████████▌                                                                                                    | 60/143 [11:00<11:21,  8.21s/it]

Step step61000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 185.75289916992188
Step step61000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.100931167602539
Step step61000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.6376218795776367
Step step61000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.106187582015991
Step step61000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5164696574211121
Step step61000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 16.850601196289062
Step step61000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 19.261173248291016
Step step61000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.33197546005249
Step step61000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.5015605688095093
Step step61000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 43%|█████████████████████████████████████████████████████████████████████████▊                                                                                                   | 61/143 [11:04<09:23,  6.88s/it]

Step step59000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 190.63563537597656
Step step59000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.2169976234436035
Step step59000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.7017782926559448
Step step59000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.178560256958008
Step step59000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5312467217445374
Step step59000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 17.350004196166992
Step step59000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 21.704395294189453
Step step59000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.554245471954346
Step step59000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.5308369994163513
Step step59000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weig

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 43%|███████████████████████████████████████████████████████████████████████████                                                                                                  | 62/143 [11:19<12:18,  9.12s/it]

Step step66000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 173.1875457763672
Step step66000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.7943360805511475
Step step66000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.4984623193740845
Step step66000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 1.8853416442871094
Step step66000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.4843783974647522
Step step66000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 15.612637519836426
Step step66000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 13.289007186889648
Step step66000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 6.8115057945251465
Step step66000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.4383831024169922
Step step66000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.wei

 44%|████████████████████████████████████████████████████████████████████████████▏                                                                                                | 63/143 [11:20<08:54,  6.68s/it]

Step step63000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 180.73243713378906
Step step63000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.9435760974884033
Step step63000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.5885059833526611
Step step63000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 1.9467289447784424
Step step63000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5060172080993652
Step step63000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 16.350685119628906
Step step63000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 16.379161834716797
Step step63000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.16433048248291
Step step63000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.47333842515945435
Step step63000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.wei

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 45%|█████████████████████████████████████████████████████████████████████████████▍                                                                                               | 64/143 [11:23<07:21,  5.59s/it]

Step step60000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 188.1691436767578
Step step60000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 4.175662517547607
Step step60000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.66521418094635
Step step60000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 2.1734516620635986
Step step60000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.5203520655632019
Step step60000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 17.071300506591797
Step step60000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 20.534191131591797
Step step60000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.418323516845703
Step step60000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.5151064395904541
Step step60000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight:

 45%|██████████████████████████████████████████████████████████████████████████████▋                                                                                              | 65/143 [11:23<05:16,  4.05s/it]

Step step64000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 178.16969299316406
Step step64000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.922907829284668
Step step64000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.5554836988449097
Step step64000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 1.97566819190979
Step step64000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.4942311942577362
Step step64000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 16.07673454284668
Step step64000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 15.23305892944336
Step step64000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 7.016293525695801
Step step64000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.4619559645652771
Step step64000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight: 

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 46%|███████████████████████████████████████████████████████████████████████████████▊                                                                                             | 66/143 [11:42<10:46,  8.40s/it]

Step step65000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 175.6750030517578
Step step65000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.8597939014434814
Step step65000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.5254639387130737
Step step65000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 1.9206806421279907
Step step65000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.4914511442184448
Step step65000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 15.842309951782227
Step step65000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 14.24655532836914
Step step65000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 6.914928913116455
Step step65000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.450059175491333
Step step65000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 47%|█████████████████████████████████████████████████████████████████████████████████                                                                                            | 67/143 [11:58<13:40, 10.79s/it]

Step step67000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 170.69692993164062
Step step67000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.730102062225342
Step step67000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.4691005945205688
Step step67000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 1.8416444063186646
Step step67000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.47797268629074097
Step step67000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 15.394221305847168
Step step67000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 12.446106910705566
Step step67000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 6.723733901977539
Step step67000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.42777496576309204
Step step67000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.we

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 48%|██████████████████████████████████████████████████████████████████████████████████▎                                                                                          | 68/143 [12:04<11:47,  9.43s/it]

Step step68000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 168.20603942871094
Step step68000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.662479877471924
Step step68000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.4420572519302368
Step step68000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 1.793122410774231
Step step68000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.4723345637321472
Step step68000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 15.156963348388672
Step step68000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 11.655946731567383
Step step68000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 6.60925817489624
Step step68000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.4163432717323303
Step step68000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weight

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 48%|███████████████████████████████████████████████████████████████████████████████████▍                                                                                         | 69/143 [12:12<10:57,  8.89s/it]

Step step70000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 163.17327880859375
Step step70000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.5100677013397217
Step step70000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.3911619186401367
Step step70000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 1.6738253831863403
Step step70000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.4615548551082611
Step step70000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 14.712798118591309
Step step70000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 10.303614616394043
Step step70000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 6.449073314666748
Step step70000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.39510244131088257
Step step70000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.we

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 49%|████████████████████████████████████████████████████████████████████████████████████▋                                                                                        | 70/143 [12:32<14:49, 12.18s/it]

Step step69000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 165.71937561035156
Step step69000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.60237717628479
Step step69000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.416118860244751
Step step69000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 1.7539193630218506
Step step69000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.46495023369789124
Step step69000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 14.909076690673828
Step step69000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 10.981670379638672
Step step69000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 6.5090107917785645
Step step69000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.4056159257888794
Step step69000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weig

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

 50%|█████████████████████████████████████████████████████████████████████████████████████▉                                                                                       | 71/143 [12:48<16:06, 13.43s/it]

Step step73000 Layer gpt_neox.embed_in.weight: Euclidean Distance = 155.28036499023438
Step step73000 Layer gpt_neox.layers.0.input_layernorm.weight: Euclidean Distance = 3.3782835006713867
Step step73000 Layer gpt_neox.layers.0.input_layernorm.bias: Euclidean Distance = 1.3101263046264648
Step step73000 Layer gpt_neox.layers.0.post_attention_layernorm.weight: Euclidean Distance = 1.6126105785369873
Step step73000 Layer gpt_neox.layers.0.post_attention_layernorm.bias: Euclidean Distance = 0.4395098090171814
Step step73000 Layer gpt_neox.layers.0.attention.query_key_value.weight: Euclidean Distance = 13.96826457977295
Step step73000 Layer gpt_neox.layers.0.attention.query_key_value.bias: Euclidean Distance = 8.578335762023926
Step step73000 Layer gpt_neox.layers.0.attention.dense.weight: Euclidean Distance = 6.112070560455322
Step step73000 Layer gpt_neox.layers.0.attention.dense.bias: Euclidean Distance = 0.3720385730266571
Step step73000 Layer gpt_neox.layers.0.mlp.dense_h_to_4h.weigh

pytorch_model.bin:   0%|          | 0.00/166M [00:00<?, ?B/s]

In [None]:
# Serialize the collected per-step layer distances to a pickle file in the working directory
with open('pythia_70m_weight_dist.pickle', mode='wb') as pickle_sink:
    pickle.dump(step_dists, pickle_sink)

In [None]:
# Distance of each layer's weights at the current epoch from the same layer's weights at the last epoch

In [None]:
# Flatten the nested {epoch: {layer: distance}} mapping into one record per (epoch, layer) pair
plot_data = [
    {'Epoch': int(epoch_key), 'Layer': layer_name, 'Distance': layer_distance}
    for epoch_key, per_layer in step_dists.items()
    for layer_name, layer_distance in per_layer.items()
]

# Build the long-format table and order its rows by epoch in one chained expression
df = pd.DataFrame(plot_data).sort_values(by='Epoch')

# Apply the seaborn theme first, then open a large figure for the many curves
sns.set(style="whitegrid")
plt.figure(figsize=(30, 16))

# One curve per reference-model entry; names containing "rotary_emb.inv_freq" are skipped
for layer in ref_model_params:
    if "rotary_emb.inv_freq" in layer:
        continue
    layer_rows = df[df['Layer'] == layer]
    sns.lineplot(data=layer_rows, x='Epoch', y='Distance', label=layer)

# Title and axis labels, then a legend anchored outside the axes on the right
plt.title('Distance by Layer over Different Epochs')
plt.xlabel('Epoch')
plt.ylabel('Distance')
plt.legend(title='Layers', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

In [None]:
# Distance of each layer's weights at the current epoch from the same layer's weights at the last epoch (log scale)

In [None]:

# Apply the seaborn theme and open a fresh large figure
sns.set(style="whitegrid")
plt.figure(figsize=(30, 16))

# Fitted slope per layer, kept as two parallel lists keyed 'Layer' and 'Slope'
slope_data = {'Layer': [], 'Slope': []}

# One curve per reference-model entry; names containing "rotary_emb.inv_freq" or "bias" are skipped
for layer in ref_model_params:
    if ("rotary_emb.inv_freq" in layer) or ("bias" in layer):
        continue

    layer_data = df[df['Layer'] == layer]
    sns.lineplot(data=layer_data, x='Epoch', y='Distance', label=layer)

    # Degree-1 polynomial fit of Distance against Epoch (on the raw, non-log values);
    # only the slope is recorded, the intercept is discarded
    slope, intercept = np.polyfit(layer_data['Epoch'], layer_data['Distance'], 1)
    slope_data['Layer'].append(layer)
    slope_data['Slope'].append(slope)

# Switch the y-axis to a logarithmic scale, then label the figure
plt.yscale('log')
plt.title('Distance by Layer over Different Epochs')
plt.xlabel('Epoch')
plt.ylabel('Distance (log scale)')
plt.legend(title='Layers', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

In [None]:
# Tabulate the collected slopes: the dict's 'Layer' and 'Slope' lists become the two columns
slope_df = pd.DataFrame(data=slope_data)

# Print the heading and the table's text rendering on consecutive lines
print("Slope Data:", slope_df, sep="\n")