In [None]:
import plotly.express as px
import numpy as np
import duckdb
from tqdm import tqdm

In [None]:
import sys
sys.path.append("/home/ubuntu/sky_workdir/encoding-schemes")

from encoding_schemes import get_deterministic_adherence_fn

In [None]:
import ray

ray.init()

In [None]:
import os
import psycopg2
import json

conn_string = os.environ["SUPABASE_CONNECTION_URL"]

conn = psycopg2.connect(conn_string)

import pandas as pd

In [None]:
# df = pd.read_sql("SELECT * FROM public.encoding_schemes WHERE (data->'experiment_tags'->'sft')::boolean", conn)

# sel_str = """
# -- redo prompted
# (
#     (data->'experiment_tags'->'numina_math_cot_rerun')::BOOL
#     AND (NOT (data->'force_overwrite')::BOOL OR data->'force_overwrite' IS NULL)
#     AND (data->'experiment_name')::TEXT LIKE '%prompted_%'
# )
# """

# sel_str = """
# -- Few shot
#  (
#      (data->'experiment_tags'->'numina_math_cot_rerun')::BOOL
#      AND (NOT (data->'force_overwrite')::BOOL OR data->'force_overwrite' IS NULL)
#      AND (
#          (data->'experiment_params'->'n_few_shot_examples')::INT = 8
#      )
#   )
# """

sel_str = """
-- NuminaMath CoT Rerun
 (
     (data->'experiment_tags'->'numina_math_cot_rerun')::BOOL
     AND (NOT (data->'force_overwrite')::BOOL OR data->'force_overwrite' IS NULL)
     AND (
         (data->'experiment_params'->'sampling_params'->'n')::INT = 4
         OR ((data->'experiment_params'->'model')::TEXT LIKE '%gpt%' AND (data->'experiment_params'->'sft_params'->'batch_size')::INT != 48)
     )
  )
"""

# sel_str = """
# -- prompted no sft decode
# (
#     (data->'experiment_tags'->'numina_math_cot_rerun')::BOOL
#     AND (NOT (data->'force_overwrite')::BOOL OR data->'force_overwrite' IS NULL)
#     AND (data->'experiment_name')::TEXT LIKE '%prompteddecode%'
# )
# """

df = pd.read_sql(f"""
SELECT * FROM public.encoding_schemes 
    WHERE 

{sel_str}


ORDER BY created_at DESC
""", conn)

l_examples = df.to_dict('records')

df.head()

In [None]:
[example for example in l_examples if example["data"]["experiment_params"]['encoding_scheme'] == 'speaking_identity' and "14B" in example["data"]["experiment_params"]["model"]]

In [None]:
print(len(df))

l_keep_encoding_schemes = [
 'speaking_leet_speak',
 'speaking_pirate_speak',
 'speaking_yoda_speak',
 'speaking_Morse_code',
 'speaking_Adyghe',
 'speaking_Arabic',
 'speaking_French',
 'speaking_Python',
 'speaking_space_between_chars',
 'speaking_base64_2x_cipher',
 'speaking_base64_3x_cipher',
 'speaking_swap_even_odd_letters_in_each_word',
 'speaking_reverse_fibonacci_indices_in_each_word',
 'speaking_base64_cipher',
 'speaking_reverse_letters_in_each_word',
 'speaking_dot_between_chars',
 'speaking_rot13_cipher',
 'speaking_gzip_to_base64_encoded',
 'speaking_letter_to_word_with_dot',
 'speaking_reverse_letters_in_each_word',
 'speaking_letter_to_word_with_dot',
 'speaking_identity']

df = df[df['data'].map(lambda x: '14B' in x['experiment_params']['model'])]
df = df[df['data'].map(lambda x: x['experiment_name'].startswith('math_cot'))]
df = df[df['data'].map(lambda x: x['experiment_params']['encoding_scheme'] in l_keep_encoding_schemes)]

print(len(df))

In [None]:
root_dir = "/home/ubuntu/sky_workdir/encoding-schemes/output"

# Extract n grams and fetch counts

Use Llama 2 tokenizer to match infinigram engine and fetch 5 gram token outputs

In [None]:
from transformers import AutoTokenizer

In [None]:
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", add_bos_token=False, add_eos_token=False)

def extract_ngrams(s, n=5):
    tokens = tokenizer.encode(s)

    if len(tokens) < n:
        return set()

    s_ngrams = set()
    for i in range(0, len(tokens) - n):
        ngram_toks = tokens[i:i+n]
        
        decoded_str = tokenizer.decode(ngram_toks)
        s_ngrams.add(decoded_str)
    return s_ngrams

@ray.remote
def extract_ngrams_df(df, n=5):
    s_ngrams = set()
    
    for _, row in df.iterrows():
        s_ngrams = s_ngrams.union(extract_ngrams(row['translated_solution'], n=n))

    return s_ngrams

In [None]:
import aiohttp
import asyncio
from tqdm.asyncio import tqdm_asyncio 
import random

In [None]:
async def gather_all(tasks, **kwargs):
    return await tqdm_asyncio.gather(*tasks, **kwargs)

In [None]:
rate_limit = asyncio.Semaphore(256)

async def compute_ngram_for_example(example, n):
    df = pd.read_parquet(os.path.join(root_dir, example['experiment_hash'], 'data', "joined_output.parquet"))

    s_ngrams = ray.get(extract_ngrams_df.remote(df, n=n))
    n_occurrences = 0

    async def query_infinigram(payload):
        async with rate_limit:
            async with aiohttp.ClientSession() as session:
                for i in range(1000):
                    async with session.post('https://api.infini-gram.io/', json=payload) as resp:
                        if resp.status != 200:
                            await asyncio.sleep(1 + random.random())
                            continue
    
                        return await resp.json()
    
                raise RuntimeError(f"{payload} failed after 10 attempts!")

    l_tasks = []
    for ngram in s_ngrams:
        l_tasks.append(query_infinigram(
            {
                'index': 'v4_rpj_llama_s4',
                'query_type': 'count',
                'query': ngram,
            }
        ))

    # l_tasks = asyncio.run(gather_all(l_tasks))
    l_tasks = await gather_all(l_tasks, miniters=1000)

    for result in l_tasks:
        n_occurrences += result['count']

    return n_occurrences

# await compute_ngram_for_example(l_examples[0], 4)

In [None]:
l_n_4grams = []

for _, example in df.iterrows():
    l_n_4grams.append(await compute_ngram_for_example(example, 4))

In [19]:
l_n_4grams

[24628987121,
 22929369771,
 23814570491,
 7348235206,
 1146624755,
 11085434805,
 9723038114,
 20964913670,
 16199246975,
 19458561192,
 936726227,
 21576376532,
 20401655455,
 1176270205,
 924717530,
 20405497376,
 927863731,
 2501525117,
 1298642849,
 23964318713]

In [None]:
i = 0
for _, example in df.iterrows():
    experiment_hash = example['experiment_hash']

    with open(os.path.join('/home/ubuntu/sky_workdir/encoding-schemes', 'output', experiment_hash, 'data', 'num_pretraining_4grams_redpajama.json'), 'w') as fp:
        json.dump({
            'num_occurrences': l_n_4grams[i]
        }, fp)

    i += 1

In [None]:
df.head()