In [2]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-72B-Instruct")

def get_input_lengths(texts, tokenizer):
    encoded = tokenizer.batch_encode_plus(texts)
    input_lengths = [len(ids) for ids in encoded['input_ids']]
    return input_lengths

input_lengths = get_input_lengths(['level_results', 'hello', 'world'], tokenizer)
print(f"Input lengths: {input_lengths}")

  from .autonotebook import tqdm as notebook_tqdm


Input lengths: [2, 1, 1]


In [3]:
math500_dataset_file = "/home/rishabhtiwari/adaptive_reasoning/qwen3_math_evaluation/data/math500/test.jsonl"
import json
from collections import defaultdict

# Read the JSONL file and extract IDs by difficulty level
level_to_ids = defaultdict(list)

with open(math500_dataset_file, 'r') as f:
    for idx, line in enumerate(f):
        data = json.loads(line.strip())
        # print(data.keys())
        # raise Exception("Stop here")
        problem_id = idx # Handle different ID field names
        level = data.get('level', 0)    # Handle different level field names
        
        if problem_id is not None and level is not None:
            level_to_ids[level].append(problem_id)

# Convert to regular dict and display summary
level_to_ids = dict(level_to_ids)
print("Question IDs by difficulty level:")
for level in sorted(level_to_ids.keys()):
    print(f"Level {level}: {len(level_to_ids[level])} questions")
    print(f"  IDs: {level_to_ids[level][:5]}{'...' if len(level_to_ids[level]) > 5 else ''}")


Question IDs by difficulty level:
Level 1: 43 questions
  IDs: [14, 16, 18, 38, 58]...
Level 2: 90 questions
  IDs: [0, 4, 5, 13, 27]...
Level 3: 105 questions
  IDs: [2, 3, 6, 8, 10]...
Level 4: 128 questions
  IDs: [7, 17, 31, 32, 33]...
Level 5: 134 questions
  IDs: [1, 9, 11, 12, 15]...


In [4]:
# Read the result file and analyze the results

def get_levelwise_results(result_file, level_to_ids, tokenizer, return_lengths=False):
    results = []
    with open(result_file, 'r') as f:
        for line in f:
            data = json.loads(line.strip())
            results.append(data)
    level_to_results = defaultdict(list)
    level_to_results_lengths = defaultdict(list)

    for level in sorted(level_to_ids.keys()):
        level_ids = level_to_ids[level]
        level_results = []
        level_results_lengths = []
        for level_id in level_ids:
            level_result = results[level_id]
            level_results.append(level_result['score'])
            if return_lengths:
                output_lengths = get_input_lengths(level_result['code'], tokenizer)
                level_results_lengths.append(output_lengths)
        level_to_results[level] = level_results
        level_to_results_lengths[level] = level_results_lengths
    return level_to_results, level_to_results_lengths


from transformers import AutoTokenizer
import numpy as np
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-72B-Instruct")
result_file ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/short_runs/qwen3-1676884_checkpoint416_max_tokens_per_call16000_thinking_budget-1/math500/test_qwen25-math-cot_-1_seed0_t0.7_top_k8_s0_e-1.jsonl"

level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids, tokenizer)
# print(level_to_results)

In [5]:
from math import comb
def get_levelwise_pass_at_1(level_to_results):
    level_to_pass_at_1 = {}
    for level, results in level_to_results.items():
        level_to_pass_at_1[level] = round(np.mean(results), 8)
    return level_to_pass_at_1

def get_pass_at_k(results, k):
    pass_at_k = []
    for result in results:
        c = np.sum(result)
        n = len(result)
        k = min(k, n)
        if k > n - c:
            pass_at_k.append(1.0)
        else:
            temp = 1.0 - comb(n-c, k)/comb(n, k)
            pass_at_k.append(temp)
    return pass_at_k

def get_levelwise_pass_at_k(level_to_results, k):
    level_to_pass_at_k = {}
    for level, results in level_to_results.items():
        level_to_pass_at_k[level] = round(np.mean(get_pass_at_k(results, k)), 2)
    return level_to_pass_at_k

def get_levelwise_pass_at_k_with_std(level_to_results, k):
    level_to_pass_at_k = {}
    level_to_std = {}
    for level, results in level_to_results.items():
        pass_at_k_values = get_pass_at_k(results, k)
        level_to_pass_at_k[level] = round(np.mean(pass_at_k_values), 2)
        level_to_std[level] = round(np.std(pass_at_k_values), 4)
    return level_to_pass_at_k, level_to_std

level_to_pass_at_1 = get_levelwise_pass_at_k(level_to_results, 2)
print(level_to_pass_at_1)

{1: 0.97, 2: 0.98, 3: 0.99, 4: 0.94, 5: 0.89}


# Math500

In [6]:
result_id = 'short_runs'
experiment_ids = ['1676884', '1677711', '1677706', '1677712']
top_ks = [8, 12, 16, 32]
result_file_format ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/{result_id}/qwen3-{experiment_id}_checkpoint416_max_tokens_per_call16000_thinking_budget-1/math500/test_qwen25-math-cot_-1_seed0_t0.7_top_k{top_k}_s0_e-1.jsonl"

for experiment_id in experiment_ids:
    print("--------------------------------")
    for top_k in top_ks:
        print(experiment_id, top_k)
        try:
            result_file = result_file_format.format(result_id=result_id, experiment_id=experiment_id, top_k=top_k)
            level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids, tokenizer)
            level_to_pass_at_k = get_levelwise_pass_at_k(level_to_results, 4)
            print(level_to_pass_at_k)
        except:
            print(f"Error in {experiment_id} {top_k}")



--------------------------------
1676884 8
{1: 0.98, 2: 0.98, 3: 1.0, 4: 0.96, 5: 0.92}
1676884 12
{1: 0.95, 2: 0.99, 3: 0.98, 4: 0.95, 5: 0.92}
1676884 16
{1: 0.97, 2: 0.99, 3: 0.96, 4: 0.96, 5: 0.89}
1676884 32
{1: 0.92, 2: 0.91, 3: 0.87, 4: 0.76, 5: 0.6}
--------------------------------
1677711 8
{1: 0.98, 2: 0.98, 3: 0.99, 4: 0.96, 5: 0.92}
1677711 12
{1: 0.95, 2: 0.98, 3: 0.99, 4: 0.95, 5: 0.91}
1677711 16
{1: 0.98, 2: 0.99, 3: 0.99, 4: 0.95, 5: 0.91}
1677711 32
{1: 0.95, 2: 0.98, 3: 0.96, 4: 0.94, 5: 0.88}
--------------------------------
1677706 8
{1: 0.98, 2: 0.97, 3: 1.0, 4: 0.96, 5: 0.92}
1677706 12
{1: 0.97, 2: 0.97, 3: 0.99, 4: 0.96, 5: 0.92}
1677706 16
{1: 0.98, 2: 0.98, 3: 1.0, 4: 0.96, 5: 0.92}
1677706 32
{1: 0.95, 2: 0.97, 3: 0.98, 4: 0.95, 5: 0.91}
--------------------------------
1677712 8
{1: 0.99, 2: 0.97, 3: 1.0, 4: 0.96, 5: 0.9}
1677712 12
{1: 0.97, 2: 0.97, 3: 1.0, 4: 0.95, 5: 0.92}
1677712 16
{1: 0.97, 2: 0.98, 3: 1.0, 4: 0.95, 5: 0.92}
1677712 32
{1: 0.94, 2: 0

In [28]:
result_id = 'short_runs'
experiment_ids = ['1695599', '1695805', '1676884', '1695806','1677711','1695807','1677706']
top_ks = [4, 6, 8, 10, 12, 14, 16]
result_file_format ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/{result_id}/qwen3-{experiment_id}_checkpoint416_max_tokens_per_call16000_thinking_budget-1/math500/test_qwen25-math-cot_-1_seed0_t0.7_top_k{top_k}_s0_e-1.jsonl"

for idx, experiment_id in enumerate(experiment_ids):
    print("--------------------------------")
    print("trained for ", top_ks[idx])
    for top_k in top_ks:
        print(experiment_id, top_k)
        try:
            result_file = result_file_format.format(result_id=result_id, experiment_id=experiment_id, top_k=top_k)
            level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids, tokenizer)
            level_to_pass_at_k = get_levelwise_pass_at_k(level_to_results, 4)
            print(level_to_pass_at_k)
        except:
            print(f"Error in {experiment_id} {top_k}")



--------------------------------
trained for  4
1695599 4
{0: 0.96}
1695599 6
{0: 0.95}
1695599 8
{0: 0.95}
1695599 10
Error in 1695599 10
1695599 12
{0: 0.88}
1695599 14
{0: 0.83}
1695599 16
Error in 1695599 16
--------------------------------
trained for  6
1695805 4
Error in 1695805 4
1695805 6
{0: 0.96}
1695805 8
{0: 0.96}
1695805 10
{0: 0.96}
1695805 12
{0: 0.95}
1695805 14
Error in 1695805 14
1695805 16
{0: 0.92}
--------------------------------
trained for  8
1676884 4
{0: 0.94}
1676884 6
{0: 0.96}
1676884 8
{0: 0.96}
1676884 10
{0: 0.96}
1676884 12
{0: 0.95}
1676884 14
{0: 0.95}
1676884 16
{0: 0.95}
--------------------------------
trained for  10
1695806 4
Error in 1695806 4
1695806 6
{0: 0.96}
1695806 8
{0: 0.95}
1695806 10
{0: 0.96}
1695806 12
{0: 0.96}
1695806 14
Error in 1695806 14
1695806 16
{0: 0.95}
--------------------------------
trained for  12
1677711 4
{0: 0.91}
1677711 6
{0: 0.96}
1677711 8
{0: 0.96}
1677711 10
{0: 0.96}
1677711 12
{0: 0.96}
1677711 14
{0: 0.96}
1

In [18]:
result_id = 'short_runs'
experiment_ids = ['1709227', '1709239']
top_ks = [4, 6, 8, 12, 14, 16, 32]
result_file_format ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/{result_id}/qwen3-{experiment_id}_checkpoint416_max_tokens_per_call16000_thinking_budget-1/math500/test_qwen25-math-cot_-1_seed0_t0.7_top_k{top_k}_s0_e-1.jsonl"

for experiment_id in experiment_ids:
    print("--------------------------------")
    for top_k in top_ks:
        print(experiment_id, top_k)
        try:
            result_file = result_file_format.format(result_id=result_id, experiment_id=experiment_id, top_k=top_k)
            level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids, tokenizer)
            level_to_pass_at_k = get_levelwise_pass_at_k(level_to_results, 4)
            print(level_to_pass_at_k)
        except:
            print(f"Error in {experiment_id} {top_k}")



--------------------------------
1709227 4
{1: 0.97, 2: 0.99, 3: 0.99, 4: 0.97, 5: 0.91}
1709227 6
{1: 0.97, 2: 0.99, 3: 0.99, 4: 0.97, 5: 0.91}
1709227 8
{1: 0.95, 2: 0.98, 3: 1.0, 4: 0.96, 5: 0.91}
1709227 12
{1: 0.97, 2: 0.98, 3: 0.99, 4: 0.97, 5: 0.92}
1709227 14
{1: 0.98, 2: 0.98, 3: 1.0, 4: 0.98, 5: 0.92}
1709227 16
{1: 0.98, 2: 0.97, 3: 0.99, 4: 0.96, 5: 0.92}
1709227 32
{1: 0.98, 2: 0.99, 3: 0.99, 4: 0.96, 5: 0.93}
--------------------------------
1709239 4
Error in 1709239 4
1709239 6
{1: 0.97, 2: 0.98, 3: 1.0, 4: 0.97, 5: 0.92}
1709239 8
{1: 0.97, 2: 0.98, 3: 1.0, 4: 0.97, 5: 0.92}
1709239 12
{1: 0.97, 2: 0.98, 3: 0.99, 4: 0.96, 5: 0.92}
1709239 14
{1: 0.98, 2: 0.99, 3: 1.0, 4: 0.97, 5: 0.91}
1709239 16
Error in 1709239 16
1709239 32
{1: 0.98, 2: 0.98, 3: 0.99, 4: 0.97, 5: 0.93}


In [23]:
result_id = 'short_runs'
experiment_ids = ['1717456']
top_ks = [4, 6, 8, 12, 14, 16, 32]
result_file_format ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/{result_id}/qwen3-{experiment_id}_checkpoint416_max_tokens_per_call16000_thinking_budget-1/math500/test_qwen25-math-cot_-1_seed0_t0.7_top_k{top_k}_s0_e-1.jsonl"

for experiment_id in experiment_ids:
    print("--------------------------------")
    for top_k in top_ks:
        print(experiment_id, top_k)
        try:
            result_file = result_file_format.format(result_id=result_id, experiment_id=experiment_id, top_k=top_k)
            level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids, tokenizer)
            level_to_pass_at_k = get_levelwise_pass_at_k(level_to_results, 4)
            print(level_to_pass_at_k)
        except:
            print(f"Error in {experiment_id} {top_k}")



--------------------------------
1717456 4
Error in 1717456 4
1717456 6
Error in 1717456 6
1717456 8
{1: 0.98, 2: 0.98, 3: 0.99, 4: 0.97, 5: 0.92}
1717456 12
{1: 0.99, 2: 0.99, 3: 0.99, 4: 0.97, 5: 0.93}
1717456 14
{1: 0.97, 2: 0.99, 3: 0.99, 4: 0.97, 5: 0.91}
1717456 16
{1: 0.98, 2: 0.99, 3: 0.97, 4: 0.96, 5: 0.91}
1717456 32
Error in 1717456 32


## AIME

In [7]:
aime_dataset_file = "/home/rishabhtiwari/adaptive_reasoning/qwen3_math_evaluation/data/aime25/test.jsonl"
import json
from collections import defaultdict

# Read the JSONL file and extract IDs by difficulty level
level_to_ids_aime = defaultdict(list)

with open(aime_dataset_file, 'r') as f:
    for idx, line in enumerate(f):
        data = json.loads(line.strip())
        # print(data.keys())
        # raise Exception("Stop here")
        problem_id = idx # Handle different ID field names
        level = data.get('level', data.get('i', 0))    # Handle different level field names
        
        if problem_id is not None and level is not None:
            level_to_ids_aime[level].append(problem_id)

# Convert to regular dict and display summary
level_to_ids_aime = dict(level_to_ids_aime)
print("Question IDs by difficulty level:")
for level in sorted(level_to_ids_aime.keys()):
    print(f"Level {level}: {len(level_to_ids_aime[level])} questions")
    print(f"  IDs: {level_to_ids_aime[level][:5]}{'...' if len(level_to_ids_aime[level]) > 5 else ''}")


Question IDs by difficulty level:
Level 0: 30 questions
  IDs: [0, 1, 2, 3, 4]...


In [11]:
result_id = 'short_runs'
experiment_ids = ['1676884', '1677711', '1677706', '1677712']
top_ks = [8, 12, 16, 32]
result_file_format ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/{result_id}/qwen3-{experiment_id}_checkpoint416_max_tokens_per_call16000_thinking_budget-1/aime25/test_qwen25-math-cot_-1_seed0_t0.7_top_k{top_k}_s0_e-1.jsonl"


for experiment_id in experiment_ids:
    print("--------------------------------")
    for top_k in top_ks:
        print(experiment_id, top_k)
        try:
            result_file = result_file_format.format(result_id=result_id, experiment_id=experiment_id, top_k=top_k)
            level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids_aime, tokenizer)
            level_to_pass_at_k = get_levelwise_pass_at_k(level_to_results, 1)
            print(level_to_pass_at_k)
        except:
            print(f"Error in {experiment_id} {top_k}")


--------------------------------
1676884 8
{0: 0.39}
1676884 12
{0: 0.33}
1676884 16
{0: 0.32}
1676884 32
{0: 0.09}
--------------------------------
1677711 8
{0: 0.41}
1677711 12
{0: 0.39}
1677711 16
{0: 0.35}
1677711 32
{0: 0.27}
--------------------------------
1677706 8
{0: 0.37}
1677706 12
{0: 0.41}
1677706 16
{0: 0.4}
1677706 32
{0: 0.34}
--------------------------------
1677712 8
{0: 0.36}
1677712 12
{0: 0.41}
1677712 16
{0: 0.4}
1677712 32
{0: 0.4}


In [14]:
result_id = 'short_runs'
experiment_ids = ['1695599', '1695805', '1676884', '1695806','1677711','1695807','1677706']
top_ks = [4, 6, 8, 10, 12, 14, 16]
result_file_format ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/{result_id}/qwen3-{experiment_id}_checkpoint416_max_tokens_per_call16000_thinking_budget-1/aime25/test_qwen25-math-cot_-1_seed0_t0.7_top_k{top_k}_s0_e-1.jsonl"


for idx, experiment_id in enumerate(experiment_ids):
    print("--------------------------------")
    print("trained for ", top_ks[idx])
    for top_k in top_ks:
        print(experiment_id, top_k)
        try:
            result_file = result_file_format.format(result_id=result_id, experiment_id=experiment_id, top_k=top_k)
            level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids_aime, tokenizer)
            level_to_pass_at_k = get_levelwise_pass_at_k(level_to_results, 1)
            print(level_to_pass_at_k)
        except:
            print(f"Error in {experiment_id} {top_k}")


--------------------------------
trained for  4
1695599 4
{0: 0.35}
1695599 6
{0: 0.34}
1695599 8
{0: 0.28}
1695599 10
{0: 0.18}
1695599 12
{0: 0.12}
1695599 14
{0: 0.1}
1695599 16
{0: 0.04}
--------------------------------
trained for  6
1695805 4
Error in 1695805 4
1695805 6
{0: 0.35}
1695805 8
{0: 0.35}
1695805 10
{0: 0.36}
1695805 12
{0: 0.29}
1695805 14
{0: 0.24}
1695805 16
{0: 0.2}
--------------------------------
trained for  8
1676884 4
Error in 1676884 4
1676884 6
Error in 1676884 6
1676884 8
{0: 0.39}
1676884 10
{0: 0.38}
1676884 12
{0: 0.33}
1676884 14
{0: 0.3}
1676884 16
{0: 0.32}
--------------------------------
trained for  10
1695806 4
{0: 0.22}
1695806 6
{0: 0.36}
1695806 8
{0: 0.35}
1695806 10
Error in 1695806 10
1695806 12
{0: 0.36}
1695806 14
{0: 0.38}
1695806 16
{0: 0.36}
--------------------------------
trained for  12
1677711 4
{0: 0.25}
1677711 6
Error in 1677711 6
1677711 8
{0: 0.41}
1677711 10
Error in 1677711 10
1677711 12
{0: 0.39}
1677711 14
{0: 0.37}
167771

In [13]:
result_id = 'short_runs'
experiment_ids = ['1709227', '1709239']
top_ks = [4, 6, 8, 12, 14, 16, 32]
result_file_format ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/{result_id}/qwen3-{experiment_id}_checkpoint416_max_tokens_per_call16000_thinking_budget-1/aime25/test_qwen25-math-cot_-1_seed0_t0.7_top_k{top_k}_s0_e-1.jsonl"


for idx, experiment_id in enumerate(experiment_ids):
    print("--------------------------------")
    # print("trained for ", top_ks[idx])
    for top_k in top_ks:
        print(experiment_id, top_k)
        try:
            result_file = result_file_format.format(result_id=result_id, experiment_id=experiment_id, top_k=top_k)
            level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids_aime, tokenizer)
            level_to_pass_at_k = get_levelwise_pass_at_k(level_to_results, 1)
            print(f"Mean: {level_to_pass_at_k}")
            # print(f"Std:  {level_to_std}")
        except:
            print(f"Error in {experiment_id} {top_k}")


--------------------------------
1709227 4
Mean: {0: 0.37}
1709227 6
Mean: {0: 0.41}
1709227 8
Mean: {0: 0.41}
1709227 12
Mean: {0: 0.45}
1709227 14
Mean: {0: 0.48}
1709227 16
Mean: {0: 0.4}
1709227 32
Mean: {0: 0.46}
--------------------------------
1709239 4
Mean: {0: 0.38}
1709239 6
Mean: {0: 0.42}
1709239 8
Mean: {0: 0.4}
1709239 12
Mean: {0: 0.41}
1709239 14
Mean: {0: 0.46}
1709239 16
Mean: {0: 0.41}
1709239 32
Mean: {0: 0.42}


In [12]:
result_id = 'short_runs'
experiment_ids = ['1717456']
top_ks = [4, 6, 8, 12, 14, 16, 32]
result_file_format ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/{result_id}/qwen3-{experiment_id}_checkpoint416_max_tokens_per_call16000_thinking_budget-1/aime25/test_qwen25-math-cot_-1_seed0_t0.7_top_k{top_k}_s0_e-1.jsonl"


for idx, experiment_id in enumerate(experiment_ids):
    print("--------------------------------")
    # print("trained for ", top_ks[idx])
    for top_k in top_ks:
        print(experiment_id, top_k)
        try:
            result_file = result_file_format.format(result_id=result_id, experiment_id=experiment_id, top_k=top_k)
            level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids_aime, tokenizer)
            level_to_pass_at_k = get_levelwise_pass_at_k(level_to_results,1)
            print(f"Mean: {level_to_pass_at_k}")
            # print(f"Std:  {level_to_std}")
        except:
            print(f"Error in {experiment_id} {top_k}")


--------------------------------
1717456 4
Mean: {0: 0.3}
1717456 6
Mean: {0: 0.4}
1717456 8
Mean: {0: 0.43}
1717456 12
Mean: {0: 0.45}
1717456 14
Mean: {0: 0.39}
1717456 16
Mean: {0: 0.38}
1717456 32
Mean: {0: 0.18}


In [40]:
result_id = 'short_runs'
experiment_ids = ['1676884', '1677711', '1677706', '1677712']
top_ks = [8, 12, 16, 32]
result_file_format ="/home/rishabhtiwari/adaptive_reasoning/experiments/03_evaluation/{result_id}/qwen3-{experiment_id}_checkpoint416_max_tokens_per_call16000_thinking_budget-1/aime25/test_qwen25-math-cot_-1_seed0_t0.7_top_k{top_k}_s0_e-1.jsonl"


for idx, experiment_id in enumerate(experiment_ids):
    print("--------------------------------")
    overall_level_to_results = {}
    for top_k in top_ks[:idx+1]:
        print(experiment_id, top_k)
        try:
            result_file = result_file_format.format(result_id=result_id, experiment_id=experiment_id, top_k=top_k)
            level_to_results, level_to_results_lengths = get_levelwise_results(result_file, level_to_ids_aime, tokenizer)
            
            # Combine results across different top_k values for each level
            for level, results in level_to_results.items():
                if level not in overall_level_to_results:
                    combined_results = []
                    for i in range(len(results)):
                        combined_results.append(results[i][:8//(idx+1)])
                else:
                    combined_results = overall_level_to_results[level]
                    for i in range(len(results)):
                        combined_results[i] += results[i][:8//(idx+1)]
                overall_level_to_results[level] = combined_results
        except Exception as e:
            print(f"Error in {experiment_id} {top_k}: {e}")
    
    # Calculate pass@k for the combined results
    level_to_pass_at_k = get_levelwise_pass_at_k(overall_level_to_results, 4)
    print(level_to_pass_at_k)


--------------------------------
1676884 8
{0: 0.57}
--------------------------------
1677711 8
1677711 12
{0: 0.51}
--------------------------------
1677706 8
1677706 12
1677706 16
{0: 0.54}
--------------------------------
1677712 8
1677712 12
1677712 16
1677712 32
{0: 0.6}


In [34]:
len(overall_level_to_results[0][0])

14

In [31]:
level_to_pass_at_k

{0: 0.4}

In [39]:
np.sum(level_to_results[0])

97

In [42]:
comb(8,8)/1

1.0

In [38]:
results[:, :2]

TypeError: list indices must be integers or slices, not tuple