In [1]:
import utils as utils
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    AutoConfig,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Tokenizer used for every token count until `tokenizer` is rebound further
# down in the notebook.
tokenizer = AutoTokenizer.from_pretrained(
    '/mnt/cephfs/echoi/models/L1-Qwen-1.5B-Max',
    padding_side='left',
    trust_remote_code=True,
)

# When the checkpoint defines no pad token, reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [6]:
# Sample solution text: starts with a closing `</think>` tag followed by a
# worked base-3 / base-8 conversion answer written in Markdown + LaTeX.
# NOTE(review): some subscripts look garbled (e.g. `987{10}`); the string is
# kept verbatim because its exact contents determine the resulting token count.
text = "</think>\n\nTo solve this problem, we need to determine the number of digits in the base-3 representation of the number $ 987{10} $ and the base-8 representation of the same number, and then find the difference between these two quantities.\n\n---\n\n### Step 1: Convert $ 987{10} $ to base-3\n\nWe repeatedly divide the number by 3 and record the remainders.\n\n$$\n\\begin{align}\n987 \\div 3 &= 329 \\text{ remainder } 0 \\\\\n329 \\div 3 &= 109 \\text{ remainder } 2 \\\\\n109 \\div 3 &= 36 \\text{ remainder } 1 \\\\\n36 \\div 3 &= 12 \\text{ remainder } 0 \\\\\n12 \\div 3 &= 4 \\text{ remainder } 0 \\\\\n4 \\div 3 &= 1 \\text{ remainder } 1 \\\\\n1 \\div 3 &= 0 \\text{ remainder } 1 \\\\\n\\end{align}\n$$\n\nNow, write the remainders from last to first:\n\n$$\n987_{10} = 110101013\n$$\n\nThis is a 7-digit number in base-3.\n\n---\n\n### Step 2: Convert $ 987{10} $ to base-8\n\nWe repeatedly divide the number by 8 and record the remainders.\n\n$$\n\\begin{align}\n987 \\div 8 &= 123 \\text{ remainder } 3 \\\\\n123 \\div 8 &= 15 \\text{ remainder } 3 \\\\\n15 \\div 8 &= 1 \\text{ remainder } 7 \\\\\n1 \\div 8 &= 0 \\text{ remainder } 1 \\\\\n\\end{align}\n$$\n\nNow, write the remainders from last to first:\n\n$$\n987_{10} = 1733_8\n$$\n\nThis is a 4-digit number in base-8.\n\n---\n\n### Step 3: Find the difference in the number of digits\n\n- Base-3: 7 digits  \n- Base-8: 4 digits  \n\n$$\n\\text{Difference} = 7 - 4 = \\boxed{3}"

In [7]:
# Token count of the sample text: length of the first row of `input_ids`.
len(tokenizer(text, return_tensors='pt')['input_ids'][0])

495

In [3]:
# Iteration-0 solutions from the mk2/output_3b_2k_8_nogold run directory.
solutions = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output_3b_2k_8_nogold/solutions_iter_0.json'
)

In [27]:
# Running totals (token sum and number of solutions) for each outcome bucket.
token_length_correct = {'tokens': 0, 'count': 0}
token_length_incorrect = {'tokens': 0, 'count': 0}

In [28]:
# Tokenize every solution and add its length to the bucket that matches
# whether the extracted answer agrees with the reference answer.
for record in solutions:
    n_tokens = tokenizer(record['solution'], return_tensors='pt')['input_ids'][0].shape[0]
    predicted = record['extracted_answer']
    reference = utils.extract_answer(record['reference_answer'])
    if utils.check_answer(predicted, reference):
        bucket = token_length_correct
    else:
        bucket = token_length_incorrect
    bucket['tokens'] += n_tokens
    bucket['count'] += 1

In [30]:
# Report the mean token length of each bucket (token sum / number of solutions).
for message, bucket in (
    ('Token Avg. length: Correct: ', token_length_correct),
    ('Token Avg. length: InCorrect: ', token_length_incorrect),
):
    print(message, bucket['tokens'] / bucket['count'])

Token Avg. length: Correct:  244.76470588235293
Token Avg. length: InCorrect:  300.7571946795647


In [5]:
def _mean_or_nan(total, count):
    """Return ``total / count``, or NaN when ``count`` is zero."""
    return total / count if count else float('nan')


def stat_token_length(data, tokenizer):
    """Print average solution token lengths, split by answer correctness.

    Each record's ``'solution'`` text is tokenized and its token count is added
    to the "correct" or "incorrect" bucket depending on
    ``utils.check_answer(record['extracted_answer'], record['gold_answer'])``.
    Three lines are printed: the correct average, the incorrect average, and
    the average over both buckets. A bucket with no records prints ``nan``
    instead of raising ``ZeroDivisionError``.

    Args:
        data: Iterable of records indexed by the keys ``'solution'``,
            ``'extracted_answer'`` and ``'gold_answer'``.
        tokenizer: Callable invoked as ``tokenizer(text)``; the result must
            expose the token ids of ``text`` under ``'input_ids'``.

    Returns:
        None. The statistics are only printed.
    """
    correct = {'tokens': 0, 'count': 0}
    incorrect = {'tokens': 0, 'count': 0}

    for item in data:
        # Only the length is needed, so skip tensor conversion and count the
        # ids in the plain list the tokenizer returns for a single string.
        n_tokens = len(tokenizer(item['solution'])['input_ids'])
        is_correct = utils.check_answer(item['extracted_answer'], item['gold_answer'])
        bucket = correct if is_correct else incorrect
        bucket['tokens'] += n_tokens
        bucket['count'] += 1

    print('Token Avg. length: Correct: ', _mean_or_nan(correct['tokens'], correct['count']))
    print('Token Avg. length: InCorrect: ', _mean_or_nan(incorrect['tokens'], incorrect['count']))
    print(
        'Token Avg. length: Correct + InCorrect: ',
        _mean_or_nan(
            incorrect['tokens'] + correct['tokens'],
            incorrect['count'] + correct['count'],
        ),
    )

In [42]:
# Same statistics as the manual loop above, now via the helper function.
stat_token_length(data=solutions, tokenizer=tokenizer)

Token Avg. length: Correct:  244.76470588235293
Token Avg. length: InCorrect:  300.7571946795647
Token Avg. length: Correct + InCorrect:  267.9176


In [6]:
# Token-length stats for mk6-vllm/output/solutions_iter_0.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk6-vllm/output/solutions_iter_0.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  1896.076388888889
Token Avg. length: InCorrect:  2445.5205765047135
Token Avg. length: Correct + InCorrect:  2138.53512


In [7]:
# Token-length stats for mk6-vllm/output/solutions_quant_iter_0.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk6-vllm/output/solutions_quant_iter_0.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  1745.218927771074
Token Avg. length: InCorrect:  2321.5071652485444
Token Avg. length: Correct + InCorrect:  2054.06332


In [45]:
# Token-length stats for mk2/output_3b_2k_8_nogold/solutions_quant_iter_1.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output_3b_2k_8_nogold/solutions_quant_iter_1.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  216.2632381339567
Token Avg. length: InCorrect:  284.2059836630253
Token Avg. length: Correct + InCorrect:  250.78155


In [46]:
# Token-length stats for mk2/output_3b_2k_8_nogold/solutions_iter_2.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output_3b_2k_8_nogold/solutions_iter_2.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  247.64454976303318
Token Avg. length: InCorrect:  304.4176295413937
Token Avg. length: Correct + InCorrect:  271.47505


In [47]:
# Token-length stats for mk2/output_3b_2k_8_nogold/solutions_quant_iter_2.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output_3b_2k_8_nogold/solutions_quant_iter_2.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  206.12175027462467
Token Avg. length: InCorrect:  273.0883649184663
Token Avg. length: Correct + InCorrect:  236.5112


In [48]:
# Token-length stats for mk2/output_3b_2k_8_nogold/solutions_iter_3.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output_3b_2k_8_nogold/solutions_iter_3.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  245.5944826370527
Token Avg. length: InCorrect:  303.9141437485556
Token Avg. length: Correct + InCorrect:  270.8294


In [49]:
# Token-length stats for mk2/output_3b_2k_8_nogold/solutions_quant_iter_3.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output_3b_2k_8_nogold/solutions_quant_iter_3.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  200.60029460185424
Token Avg. length: InCorrect:  261.86617803522876
Token Avg. length: Correct + InCorrect:  226.5127


--------------------------------------------------------3B-------------------------------------------------------

In [50]:
# Rebind `tokenizer`: every cell executed after this one counts tokens with
# the tokenizer loaded here instead of the one loaded at the top.
# NOTE(review): the section banner above says "3B" while this path points at
# Llama-3.2-1B-Instruct — confirm this is the intended tokenizer.
tokenizer = AutoTokenizer.from_pretrained(
    '/mnt/cephfs/sumin/model/Llama-3.2-1B-Instruct',
    padding_side='left',
    trust_remote_code=True,
)

# When the checkpoint defines no pad token, reuse the EOS token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [51]:
# Token-length stats for mk2/output/solutions_iter_0.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output/solutions_iter_0.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  235.95531914893618
Token Avg. length: InCorrect:  354.54977645305513
Token Avg. length: Correct + InCorrect:  315.5322


In [52]:
# Token-length stats for mk2/output/solutions_quant_iter_0.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output/solutions_quant_iter_0.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  260.0416666666667
Token Avg. length: InCorrect:  695.5796824758843
Token Avg. length: Correct + InCorrect:  693.4891


In [53]:
# Token-length stats for mk2/output/solutions_iter_1.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output/solutions_iter_1.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  233.19182597231378
Token Avg. length: InCorrect:  353.77045650301466
Token Avg. length: Correct + InCorrect:  317.1869


In [54]:
# Token-length stats for mk2/output/solutions_quant_iter_1.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output/solutions_quant_iter_1.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  208.2569882777277
Token Avg. length: InCorrect:  320.06478461365424
Token Avg. length: Correct + InCorrect:  307.6653


In [55]:
# Token-length stats for mk2/output/solutions_iter_2.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output/solutions_iter_2.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  227.02246508803887
Token Avg. length: InCorrect:  348.96898300029824
Token Avg. length: Correct + InCorrect:  308.7998


In [56]:
# Token-length stats for mk2/output/solutions_quant_iter_2.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output/solutions_quant_iter_2.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  202.06031746031746
Token Avg. length: InCorrect:  310.779290617849
Token Avg. length: Correct + InCorrect:  297.0807


In [57]:
# Token-length stats for mk2/output/solutions_iter_3.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output/solutions_iter_3.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  233.39923619271445
Token Avg. length: InCorrect:  352.72938144329896
Token Avg. length: Correct + InCorrect:  312.1094


In [58]:
# Token-length stats for mk2/output/solutions_quant_iter_3.json.
data = utils.load_solutions(
    '/mnt/cephfs/sumin/TPT_reproduce/mk2/output/solutions_quant_iter_3.json'
)
stat_token_length(data=data, tokenizer=tokenizer)

Token Avg. length: Correct:  202.6484440706476
Token Avg. length: InCorrect:  305.1123595505618
Token Avg. length: Correct + InCorrect:  292.9294
