In [1]:
import torch
import numpy as np
import scipy
import pandas as pd
from itertools import islice

In [2]:
import random

def sample_keys(d, n):
    return random.sample(list(d.keys()), n)

In [3]:
import warnings

warnings.filterwarnings("ignore")

In [4]:
df = pd.read_csv('Desktop/pile_pythia_pplx_10k_gptj.csv')
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]

In [8]:
sample_sizes = [500, 1000, 2000, 4000, 6000, 8000, 10000]

### (diff model = reference model)

### No difference model

In [10]:
print("Diff model: None")
print()

for num in sample_sizes:
    df_sample = df.sample(n=num)
    cor, pvalue = scipy.stats.spearmanr(df_sample['index'], -df_sample['deduped_pplx'])
    print(num, pvalue)

Diff model: None

500 0.451511393389349
1000 0.3874607076005504
2000 0.06312504195726486
4000 0.30076504215961036
6000 0.3069801899642444
8000 0.3911058496219253
10000 0.2133978224203535


### Pythia reference model

In [11]:
print("Diff model: pythia-duped-6.9b")
print()

for num in sample_sizes:
    df_sample = df.sample(n=num)
    cor, pvalue = scipy.stats.spearmanr(df_sample['index'], df_sample['duped_pplx'] - df_sample['deduped_pplx'])
    print(num, pvalue)

Diff model: pythia-duped-6.9b

500 0.005392014801079363
1000 5.315932405344063e-05
2000 1.1782825573026501e-06
4000 2.2045749702961732e-08
6000 4.9725838938692856e-15
8000 1.1310785621105327e-16
10000 8.032984078932448e-20


### Llama reference model

In [12]:
print("Diff model: llama3-8b")
print()

for num in sample_sizes:
    df_sample = df.sample(n=num)
    cor, pvalue = scipy.stats.spearmanr(df_sample['index'], df_sample['llama3_pplx'] - df_sample['deduped_pplx'])
    print(num, pvalue)

Diff model: llama3-8b

500 0.5411376516805082
1000 0.5680890303569781
2000 0.19229275652388464
4000 0.006086050290524001
6000 0.1810272621766135
8000 0.10800484843994829
10000 0.00799426148457033


### Reference model = averaging over Pythia

In [39]:
print("Diff model: avg. pythia")
print()

for num in sample_sizes:
    df_sample = df.sample(n=num)
    models = ['duped_pplx', 'deduped-1-4b', 'deduped-2-8b', 'deduped-1b', 'deduped-410m',
       'deduped-160m', 'deduped-70m', 'duped-2-8b', 'duped-1-4b', 'duped-1b',
       'duped-410m', 'duped-160m']
    avg_pplx = np.zeros(len(df_sample))
    for model in models:
        avg_pplx += df_sample[model]
    avg_pplx /= len(models)
    cor, pvalue = scipy.stats.spearmanr(df_sample['index'], avg_pplx - df_sample['deduped_pplx'])
    print(num, pvalue)

Diff model: avg. pythia

500 0.07159199008440532
1000 0.1262794757375701
2000 0.004629676565855284
4000 0.0014216568142167902
6000 0.000545564973951944
8000 3.209966957637225e-05
10000 1.6743454961093683e-07


### Reference model = averaging over Llama1,2,3

In [40]:
print("Diff model: avg. llama")
print()

for num in sample_sizes:
    df_sample = df.sample(n=num)
    models = ['llama3_pplx', 'llama2-7b', 'llama1-7b']
    avg_pplx = np.zeros(len(df_sample))
    for model in models:
        avg_pplx += df_sample[model]
    avg_pplx /= len(models)
    cor, pvalue = scipy.stats.spearmanr(df_sample['index'], avg_pplx - df_sample['deduped_pplx'])
    print(num, pvalue)

Diff model: avg. llama

500 0.03506773970496128
1000 0.5703938986145594
2000 0.7355982395366336
4000 0.09804675321784534
6000 0.012049574098794738
8000 0.0054571112453445005
10000 0.003890886928543179


## Doing some normalizing

In [58]:
print("Diff model: MINMAX NORM. pythia-duped-6.9b")
print()

for num in sample_sizes:
    df_sample = df.sample(n=num)
    cor, pvalue = scipy.stats.spearmanr(df_sample['index'], 
                                       (df_sample['duped_pplx'] - min(df_sample['duped_pplx'])) / (max(df_sample['duped_pplx']) - min(df_sample['duped_pplx'])) - 
                                       (df_sample['deduped_pplx'] - min(df_sample['deduped_pplx'])) / (max(df_sample['deduped_pplx']) - min(df_sample['deduped_pplx'])))
    print(num, pvalue)

Diff model: MINMAX NORM. pythia-duped-6.9b

500 0.6516623332425857
1000 0.005666515628304675
2000 2.25158539921238e-05
4000 4.229314217788246e-08
6000 1.3056100102241288e-13
8000 2.4525927921180357e-15
10000 4.0800380274280514e-20


In [64]:
print("Diff model: STD NORM. pythia-duped-6.9b")
print()

for num in sample_sizes:
    df_sample = df.sample(n=num)
    cor, pvalue = scipy.stats.spearmanr(df_sample['index'], 
                                       (df_sample['duped_pplx'] - np.mean(df_sample['duped_pplx'])) / np.std(df_sample['duped_pplx'])- 
                                       (df_sample['deduped_pplx'] - np.mean(df_sample['deduped_pplx'])) / np.std(df_sample['deduped_pplx']))
    print(num, pvalue)

Diff model: STD NORM. pythia-duped-6.9b

500 0.2520034402104735
1000 0.027821891194410947
2000 0.00039541351922579897
4000 1.172019128514986e-07
6000 1.4572920302581844e-09
8000 3.4935378284883042e-15
10000 4.059541583049317e-20


### Using each of the Pythia duped models as a reference model for Pythia 6.9b-deduped.

Generally, model closer in size (bigger model) works better as a reference model (lower PPLX)

In [8]:
for num in sample_sizes:
    df_sample = df.sample(n=num)
    print(num)
    models = ['duped_pplx', 'duped-2-8b', 'duped-1-4b', 'duped-1b',
       'duped-410m', 'duped-160m']
    for diff_model in models:
        print(diff_model, end = ' ')
        cor, pvalue = scipy.stats.spearmanr(df_sample['index'], df_sample[diff_model] - df_sample['deduped_pplx'])
        print(num, pvalue)
    print()

500
duped_pplx 500 0.5232468044954425
duped-2-8b 500 0.29372762611451136
duped-1-4b 500 0.7711060895901847
duped-1b 500 0.9262985060646471
duped-410m 500 0.9118970176249057
duped-160m 500 0.7685407624684397

1000
duped_pplx 1000 0.0003905897577136928
duped-2-8b 1000 0.06207427760500445
duped-1-4b 1000 0.02565852271844251
duped-1b 1000 0.17669874878617467
duped-410m 1000 0.10368078363962951
duped-160m 1000 0.07907387609174789

2000
duped_pplx 2000 2.593331417877856e-05
duped-2-8b 2000 0.0036556239218365223
duped-1-4b 2000 0.013233330200129257
duped-1b 2000 0.05504848402969311
duped-410m 2000 0.2274933097081205
duped-160m 2000 0.9028021481207623

4000
duped_pplx 4000 4.94413941758504e-07
duped-2-8b 4000 4.089779303004275e-05
duped-1-4b 4000 1.902322320728162e-05
duped-1b 4000 0.00016959852595525737
duped-410m 4000 0.002344975529700958
duped-160m 4000 0.004061809555769445

6000
duped_pplx 6000 3.057592390281609e-15
duped-2-8b 6000 6.983731723026605e-12
duped-1-4b 6000 3.3823843471547657e-

Llama2-7B is a bad reference model (relatively high p-values). We suspect it's because it's trained on different
data than the Pythia models?

In [17]:
print("Diff model: llama2-7b")
print()

for num in sample_sizes:
    df_sample = df.sample(n=num)
    cor, pvalue = scipy.stats.spearmanr(df_sample['index'], df_sample['llama2-7b'] - df_sample['deduped_pplx'])
    print(num, pvalue)

Diff model: llama2-7b

500 0.26861665675303514
1000 0.68693507015899
2000 0.34164976949410464
4000 0.011173478917135164
6000 0.0001366073886289737
8000 0.0074291704239786
10000 0.004949236011227232


***GPT-J test***

GPT-J had more overlap in training data with Pythia, so might be a better reference model?
(It is; lower p-values compared to Llama2)

In [15]:
print("Diff model: gpt-j-6b")
print()

for num in sample_sizes:
    df_sample = df.sample(n=num)
    cor, pvalue = scipy.stats.spearmanr(df_sample['index'], df_sample['gpt-j-6b'] - df_sample['deduped_pplx'])
    print(num, pvalue)

Diff model: gpt-j-6b

500 0.4752496323220523
1000 0.10094909640464075
2000 0.02274583892932036
4000 0.03518705957845987
6000 0.0006934982882806516
8000 0.008349819023537304
10000 0.0003368746002582249


In [17]:
for num in sample_sizes:
    df_sample = df.sample(n=num)
    print(num)
    models = ['gpt-j-6b', 'llama2-7b']
    for diff_model in models:
        print(diff_model, end = ' ')
        cor, pvalue = scipy.stats.spearmanr(df_sample['index'], df_sample[diff_model] - df_sample['deduped_pplx'])
        print(num, pvalue)
    print()

500
gpt-j-6b 500 0.3445534792502537
llama2-7b 500 0.0031335462568181315

1000
gpt-j-6b 1000 0.049156271541169916
llama2-7b 1000 0.8665978009582441

2000
gpt-j-6b 2000 0.02440365309855038
llama2-7b 2000 0.5017829857970226

4000
gpt-j-6b 4000 0.009189705205548413
llama2-7b 4000 0.020352292810525932

6000
gpt-j-6b 6000 0.0003929187083492641
llama2-7b 6000 0.012210158488851453

8000
gpt-j-6b 8000 0.0023884058138882547
llama2-7b 8000 0.003558697446652442

10000
gpt-j-6b 10000 0.0003368746002582249
llama2-7b 10000 0.004949236011227232

