# Estimating Data Contamination

In this notebook, I use the approaches outlined in [Golchin & Surdeanu (2023), *Time Travel in LLMs: Tracing Data Contamination in Large Language Models*](https://arxiv.org/pdf/2308.08493.pdf) to ask whether there is evidence of data contamination for any of the datasets.

## Helper functions

In [44]:
import pandas as pd
import numpy as np
import openai
import backoff  # for exponential backoff
import os
import scipy.stats as ss
import seaborn as sns
from tqdm import tqdm

%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # makes figs nicer!




In [2]:
def openai_auth(key_path='src/models/gpt_key'):
    """Authenticate with OpenAI using credentials stored in a local file.

    The file is read as text: its first line is used as the organization ID
    and its second line as the API key. Both are assigned to the module-level
    ``openai.organization`` and ``openai.api_key`` attributes.

    Parameters
    ----------
    key_path : str, optional
        Path to the credentials file. Defaults to ``'src/models/gpt_key'``.

    Raises
    ------
    ValueError
        If the file has fewer than two lines or the API-key line is blank.
    """
    with open(key_path, 'r') as f:
        # splitlines() + strip() keep CRLF line endings and stray whitespace
        # from leaking into the credentials.
        lines = [line.strip() for line in f.read().splitlines()]

    if len(lines) < 2 or not lines[1]:
        raise ValueError(
            "Expected the organization on line 1 and the API key on line 2 "
            "of '{path}'.".format(path=key_path)
        )

    openai.organization = lines[0]
    openai.api_key = lines[1]

In [3]:
def is_number(s):
    """Return True if ``s`` can be converted with ``float()``, else False.

    Anything ``float()`` accepts counts as a number, which includes strings
    such as ``"nan"`` and ``"inf"``. Values that ``float()`` rejects with a
    TypeError (e.g. ``None`` or a list) are reported as not-a-number instead
    of raising.
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True

### Helper functions
def get_number(ans):
    """Retrieve the first number from a GPT-4 response.

    Parameters
    ----------
    ans : str, int, float or None
        The model's reply. Numeric input is passed through (ints are
        converted to float); anything that is neither numeric nor a string
        yields ``None``.

    Returns
    -------
    float or None
        The first whitespace-separated token that parses as a finite float,
        or ``None`` when no such token exists.
    """
    if isinstance(ans, float):
        return ans
    if isinstance(ans, int):
        return float(ans)
    if not isinstance(ans, str):
        return None

    for token in ans.split():
        # A CSV-style continuation often leaves punctuation glued to the
        # value ("4.5," or "4.5."); trim it so the number is still found.
        # Only trailing periods are removed so that ".5" keeps its meaning.
        cleaned = token.strip(",;:").rstrip(".")
        try:
            value = float(cleaned)
        except ValueError:
            continue
        # float() also accepts "nan" / "inf"; those are not usable ratings.
        if value != value or abs(value) == float("inf"):
            continue
        return value
    return None

In [4]:
# Set the OpenAI organization and API key from the local key file.
openai_auth()

In [5]:
@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def pred_tokens(prompt, n=40, model="gpt-4"):
    """Send ``prompt`` to an OpenAI chat model and return the raw response.

    The call is retried with exponential backoff whenever the API raises a
    rate-limit error.

    Parameters
    ----------
    prompt : str
        Text sent as the user message.
    n : int, optional
        Maximum number of tokens to generate.
    model : str, optional
        Name of the chat model to query.

    Returns
    -------
    The full response object from ``openai.ChatCompletion.create``. The
    generated text is found at ``output['choices'][0]['message']['content']``.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant. Your job is to provide data from published datasets.",
    }
    user_message = {"role": "user", "content": prompt}

    return openai.ChatCompletion.create(
        model=model,
        temperature=0,
        messages=[system_message, user_message],
        max_tokens=n,
        top_p=1,
    )

## Method: reconstructing the `.csv`

### Glasgow Norms

In [163]:
# Guided-instruction prompt: names the dataset, shows the CSV header, and
# leaves the row open after the length field for the model to continue.
# The doubled period at the end of the first sentence is kept verbatim so the
# prompt matches what was actually sent to the model.
PROMPT = (
    "These are psycholinguistic norms from the Glasgow Norms dataset. Please continue each row with the correct number from the dataset.."
    "\nword,Length,Arousal.M"
    "\n{word},{length},"
)
print(PROMPT)

These are psycholinguistic norms from the Glasgow Norms dataset. Please continue each row with the correct number from the dataset..
word,Length,Arousal.M
{word},{length},


In [164]:
# Load the raw Glasgow Norms table and preview its first five rows.
df = pd.read_csv(filepath_or_buffer="data/raw/glasgow/glasgow.csv")
df.head()

Unnamed: 0,word,Length,Arousal.M,Arousal.SD,Arousal.N,Valence.M,Valence.SD,Valence.N,Dominance.M,Dominance.SD,...,Familiarity.N,AoA.M,AoA.SD,AoA.N,Size.M,Size.SD,Size.N,Gender.M,Gender.SD,Gender.N
0,address (postal),7,3.546,1.94,33,5.0,0.739,33,5.114,1.489,...,32,3.059,1.056,34,3.471,1.419,34,4.0,0.612,32
1,address (speak to),7,4.0,2.029,34,5.559,1.063,34,6.182,2.066,...,35,4.8,1.09,35,4.171,1.502,35,3.824,0.954,34
2,aim (objective),3,4.909,2.34,33,6.382,1.189,34,5.909,2.021,...,33,4.529,1.334,34,4.5,1.399,34,4.235,1.086,34
3,aim (target),3,5.2,2.214,35,5.6,1.642,35,6.714,1.631,...,33,3.618,1.534,34,3.686,1.617,35,4.743,1.104,35
4,Apple (brand),5,4.849,2.687,33,5.971,1.902,34,4.6,2.44,...,34,6.0,1.553,34,5.353,1.954,34,4.647,1.21,34


In [170]:
# Query the model once per row of the norms table and collect the raw text of
# each completion, in row order. Only the word and its length go into the
# prompt, so those two columns are iterated directly.
results = []
for word, length in tqdm(zip(df['word'], df['Length']), total=len(df)):
    p = PROMPT.format(word=word, length=length)

    # n = 3 keeps the completion to a few tokens.
    response = pred_tokens(p, n=3, model="gpt-4")
    extracted_response = response['choices'][0]['message']['content']

    results.append(extracted_response)

100%|███████████████████████████████████████████| 48/48 [00:42<00:00,  1.13it/s]


In [175]:
# Attach the raw completions, then the number parsed out of each one
# (None where get_number finds no number).
df['response'] = results
df['num_response'] = df['response'].apply(get_number)

In [176]:
# Keep only the rows where a number could be parsed from the response.
# .copy() makes the filtered frame independent of the original, so assigning
# new columns to it afterwards does not trigger SettingWithCopyWarning.
df = df.dropna(subset = ["num_response"]).copy()
len(df)

859

### Assessment 1: Spearman correlation

In [177]:
# Rank correlation between the parsed model responses and the dataset's
# Arousal.M column.
ss.spearmanr(df['num_response'], df['Arousal.M'])

SpearmanrResult(correlation=0.5287051752221139, pvalue=5.00548497010235e-63)

### Assessment 2: ROUGE-L

Calculate ROUGE-L scores for the guided-instruction responses and compare them to the scores produced under the general instruction.

In [178]:
from rouge import Rouge

In [179]:
def calculate_rouge_l(row, target, reference="Arousal.M"):
    """
    Calculate the ROUGE-L F-score between two fields of a row.

    Both values are converted to strings before scoring.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Record holding both the hypothesis and the reference value.
    target : str
        Key of the field used as the hypothesis.
    reference : str, optional
        Key of the field used as the reference. Defaults to "Arousal.M".

    Returns
    -------
    The 'f' entry of the ROUGE-L result for this hypothesis/reference pair.
    """
    hypothesis = str(row[target])
    reference_text = str(row[reference])

    rouge = Rouge(metrics=['rouge-l'])
    scores = rouge.get_scores(hypothesis, reference_text)

    # A single pair is scored, so the first (only) result is the one we want.
    rouge_l = scores[0]['rouge-l']

    return rouge_l['f']

#### Guided instruction ROUGE

In [180]:
# Row-wise ROUGE-L of the guided-instruction response against the reference.
df['rouge_l_guided'] = df.apply(calculate_rouge_l, axis=1, target="response")

In [181]:
# Mean ROUGE-L F-score across items for the guided-instruction responses.
df['rouge_l_guided'].mean()

0.18781528722868898

In [182]:
# Standard deviation of the guided-instruction ROUGE-L F-scores.
df['rouge_l_guided'].std()

0.24401464770964837

#### Original norms ROUGE

In [183]:
# Load the previously processed GPT-4 norms and join them with the guided
# responses. No join key is given, so pandas joins on every column name the
# two frames have in common.
df_glasgow = pd.read_csv(filepath_or_buffer="data/processed/glasgow/glasgow_gpt-4.csv")
df_merged = df_glasgow.merge(df)

In [184]:
# Row-wise ROUGE-L of the "Arousal" column against the reference.
df_merged['rouge_l_original'] = df_merged.apply(calculate_rouge_l, axis=1, target="Arousal")

In [185]:
# Mean ROUGE-L F-score across items for the "Arousal" column.
df_merged['rouge_l_original'].mean()

0.22894838734251616

In [186]:
# Standard deviation of the ROUGE-L F-scores for the "Arousal" column.
df_merged['rouge_l_original'].std()

0.2552888732278159

#### t-test

In [207]:
# Independent-samples t-test comparing the per-item ROUGE-L scores of the two
# conditions.
result = ss.ttest_ind(df_merged['rouge_l_original'], df_merged['rouge_l_guided'])

# Degrees of freedom for the two-sample test: n1 + n2 - 2.
# Stored as `dof` so the DataFrame named `df` is not overwritten by an
# integer, which would break re-running any earlier cell that uses it.
dof = len(df_merged['rouge_l_original']) + len(df_merged['rouge_l_guided']) - 2

print("p: {p}".format(p = result.pvalue))
print("t: {t}".format(t = result.statistic))
print("df: {df}".format(df = dof))

p: 0.0006557343300089885
t: 3.4137185257709834
df: 1716


### Save contamination test data

In [200]:
# Keep only the columns written to the contamination-test file.
df_merged_select = df_merged.loc[:, ['word', 'Length', 'Arousal.M', 'Arousal', 'num_response']]

In [202]:
# Write the selected columns to disk; index = False leaves the row index out
# of the file.
df_merged_select.to_csv("data/processed/glasgow/gpt4_data_contamination_test.csv", index = False)