In [1]:
import pandas as pd
import numpy as np

# Create a prediction function from the 50-50 split training data (thr = 0.8, i.e. the rows with `Percentile == 80`)

In [8]:
# Metric table used for the evaluation; later cells read its 'query_id',
# 'depth' and 'metric_value_at_depth' columns.
data_df = pd.read_csv("data/all_queries_all_depth-bm25-mono_1000-dcg.csv")

In [9]:
# Table the depth predictor is built from; later cells read its 'Percentile',
# 'Bin', 'Predicted Depth' and 'Query ID' columns.
prediction_df = pd.read_csv("data/equal_content_50_50_split-bm25-mono_1000-dcg-initial_bm25_retrieved_dcg_value.csv")

In [10]:
# Keep only the rows whose 'Percentile' equals 80, then reduce them to one row
# per distinct (bin, predicted depth) pair.
is_percentile_80 = prediction_df['Percentile'] == 80
filtered_df = prediction_df.loc[is_percentile_80].copy()
lookup_table = filtered_df.loc[:, ['Bin', 'Predicted Depth']].drop_duplicates()

# Each 'Bin' label has the form "<low>~<high>"; split it into two float
# columns (0 = low edge, 1 = high edge).
intervals = lookup_table['Bin'].str.split('~', expand=True).astype(float)

# Half-open bins [low, high): a value sitting on a shared edge belongs to the
# upper bin, e.g. 10.0 falls in 10~20 rather than 0~10.
itvl_index = pd.IntervalIndex.from_arrays(
    left=intervals[0],
    right=intervals[1],
    closed='left',
)

# Index the table by interval so a scalar can be located via `index.get_loc`.
lookup_table.index = itvl_index
    
def predict_using_initial_retrieved_dcg(val, table=None):
    """Return the predicted depth for an initial retrieved DCG value.

    The value is clamped into the range covered by the table's bins, the bin
    containing it is located, and that bin's 'Predicted Depth' is returned.

    Parameters
    ----------
    val : float
        Initial retrieved DCG value to look up.
    table : pandas.DataFrame, optional
        Lookup table with a 'Predicted Depth' column and a
        ``pandas.IntervalIndex`` whose intervals are closed on the left.
        Defaults to the notebook-level ``lookup_table``.

    Returns
    -------
    The 'Predicted Depth' entry of the bin that contains the clamped value.

    Raises
    ------
    KeyError
        If no bin contains the clamped value (e.g. a gap between bins, or
        ``val`` is NaN).
    """
    if table is None:
        table = lookup_table

    bins = table.index
    min_bound = bins.left.min()
    max_bound = bins.right.max()

    # Bins are [left, right), so the top edge itself belongs to no bin. Clamp
    # to the largest float strictly below it. A fixed epsilon such as 1e-9 is
    # rounded away once the edge is large (roughly >= 1.7e7), which would leave
    # the clamped value on the open edge and make the lookup fail.
    upper = np.nextafter(max_bound, -np.inf)
    safe_val = np.clip(val, min_bound, upper)

    # Select the column first so no mixed-dtype row has to be materialised.
    return table['Predicted Depth'].iloc[bins.get_loc(safe_val)]

In [11]:
# Spot check: look up the predicted depth for a sample initial DCG of 4.33.
predict_using_initial_retrieved_dcg(4.33)

np.int64(603)

In [12]:
# Evaluate on exactly the query IDs that appear in `filtered_df`.
# NOTE(review): `filtered_df` is derived from the table the predictor is built
# from, so these may be the queries the predictor was fit on - confirm that
# evaluating on them is intended.
test_queries = filtered_df['Query ID'].unique()
is_test_query = data_df['query_id'].isin(test_queries)
test_data = data_df.loc[is_test_query]

# Bootstrap: compare the thr=0.8 predictor against a fixed depth of 333

In [15]:
# Paired bootstrap: for every query, take the metric at the predicted depth
# minus the metric at the fixed depth, then bootstrap the mean of those
# per-query differences to get a percentile confidence interval.
n_bootstrap = 10000
ci = 95
fixed_depth = 333
bootstrap_seed = 42  # fixed so the interval is identical on every re-run


def _metric_at_depth(group, depth, qid):
    """Return the first 'metric_value_at_depth' recorded for `depth` in one query's rows.

    Raises IndexError naming the query and depth when no such row exists.
    """
    values = group.loc[group['depth'] == depth, 'metric_value_at_depth']
    if values.empty:
        raise IndexError(f"query {qid!r} has no row for depth {depth}")
    return values.iloc[0]


differences = []

for qid, group in test_data.groupby('query_id'):
    # The metric at depth 0 is the input of the depth predictor.
    initial_retrieved_dcg = _metric_at_depth(group, 0, qid)
    predicted_depth = predict_using_initial_retrieved_dcg(initial_retrieved_dcg)

    metric_at_predicted = _metric_at_depth(group, predicted_depth, qid)
    metric_at_fixed = _metric_at_depth(group, fixed_depth, qid)

    differences.append(metric_at_predicted - metric_at_fixed)

diffs = np.array(differences)
if diffs.size == 0:
    # Without this guard the statistics below would silently come out as NaN.
    raise ValueError("no per-query differences to bootstrap; test_data is empty")

# Bootstrap sampling: resample the queries with replacement and record the
# mean difference of each resample.
rng = np.random.default_rng(bootstrap_seed)
bootstrap_means = [
    rng.choice(diffs, size=diffs.size, replace=True).mean()
    for _ in range(n_bootstrap)
]

# Percentile confidence interval: cut (100 - ci) / 2 percent off each tail.
tail = (100 - ci) / 2
lower_bound = np.percentile(bootstrap_means, tail)
upper_bound = np.percentile(bootstrap_means, 100 - tail)
observed_mean = np.mean(diffs)

# The difference is called significant when the interval excludes zero.
significant = not (lower_bound <= 0 <= upper_bound)

print(f"observed_mean_diff: {observed_mean}")
print(f"ci_lower: {lower_bound}")
print(f"ci_upper: {upper_bound}")
print(f"is_significant: {significant}")
print(f"sample_size: {len(diffs)}")

observed_mean_diff: 0.07045443175981722
ci_lower: 0.024794660312601198
ci_upper: 0.11722187414438362
is_significant: True
sample_size: 500
