In [252]:
import os
import re
import sys
from itertools import combinations

import markdown
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from markdown.extensions import codehilite


In [253]:
# Read the raw summaries CSV into `df`. The path is relative, so it resolves
# against the kernel's current working directory.
df = pd.read_csv('../data_files/raw/summaries_V0903_for_humanstudy_detail.csv')

In [254]:
# List the distinct values of the `model` column.
df['model'].unique()

array(['deepseek-chat', 'gemini-2.5-pro', 'gpt-5', 'qwen3-32b',
       'web-rev-claude-opus-4-20250514'], dtype=object)

In [255]:
# Shuffle the rows once, with a fixed seed. Later cells walk the rows in order
# when they build pairs (and decide which summary is shown as "A" vs "B"), so an
# unseeded shuffle would make the exported study files differ on every run.
SHUFFLE_SEED = 42
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [256]:
# Display the shuffled frame (a bare last expression renders as a rich table).
df

Unnamed: 0,id,topic,question,summary,model,comment_num,comments,num_samples_group,sample_id,dataset_name,source_path
0,be44d0160fd1ec36,Binary-Refugee-Policies,Do you support the government accepting more r...,Here is a summary of the comments provided:\n\...,gemini-2.5-pro,70,"0: I am pretty conflicted, but I think overall...",70,1,Binary-Refugee-Policies,results/human_judgement/gemini-2.5-pro/70/Bina...
1,c5e28aaa30501e42,Binary-Online-Identity-Policies,Do you support requiring real-name registratio...,"Overall, comments reflect a divided perspectiv...",deepseek-chat,90,0: No I don’t support it. I think it’s fine to...,90,3,Binary-Online-Identity-Policies,results/human_judgement/deepseek-chat/90/Binar...
2,ad06843babdcceca,Binary-Online-Identity-Policies,Do you support requiring real-name registratio...,## Summary of Comments on Real-Name Registrati...,web-rev-claude-opus-4-20250514,90,0: No I don’t support it. I think it’s fine to...,90,2,Binary-Online-Identity-Policies,results/human_judgement/web-rev-claude-opus-4-...
3,e38ed9ce8b9e3220,Binary-Tariff-Policy,Do you think the current tariff policy under t...,The comments reflect a wide range of opinions ...,deepseek-chat,90,0: i think that it will have a negative impact...,90,2,Binary-Tariff-Policy,results/human_judgement/deepseek-chat/90/Binar...
4,5eb7a59774dbb009,Binary-Tariff-Policy,Do you think the current tariff policy under t...,The comments on the impact of the Trump admini...,qwen3-32b,90,0: i think that it will have a negative impact...,90,2,Binary-Tariff-Policy,results/human_judgement/qwen3-32b/90/Binary-Ta...
...,...,...,...,...,...,...,...,...,...,...,...
745,43386f89462d5edf,Binary-Vaccination-Policy,Do you support the government having the autho...,"Based on the comments provided, opinions are d...",deepseek-chat,10,0: I do not. I think that people can be highly...,10,1,Binary-Vaccination-Policy,results/human_judgement/deepseek-chat/10/Binar...
746,4ed4077a8fc1a871,Openqa-AI-changes-human-life,How has AI changed your life? Please answer br...,## Summary of Comments on AI's Impact on Daily...,web-rev-claude-opus-4-20250514,10,0: Al changed my life by making my daily tasks...,10,3,Openqa-AI-changes-human-life,results/human_judgement/web-rev-claude-opus-4-...
747,41fbf63fa33dc720,Openqa-Updates-of-electronic-products,What is your opinion on the rapid update cycle...,Overall summary of the comments:\n\n- Sentimen...,gpt-5,50,0: The rapid updates cycle electronic products...,50,1,Openqa-Updates-of-electronic-products,results/human_judgement/gpt-5/50/Openqa-Update...
748,74f5e06827c57f1c,Binary-Refugee-Policies,Do you support the government accepting more r...,The comments reflect a strong humanitarian sen...,deepseek-chat,50,"0: I am pretty conflicted, but I think overall...",50,3,Binary-Refugee-Policies,results/human_judgement/deepseek-chat/50/Binar...


In [257]:


# Expand every source row into two display rows for the rating task:
# first the question prompt, then the model summary rendered as HTML.


def _rating_row(record, suffix, text):
    """Build one output row; `suffix` distinguishes the question row from the summary row."""
    return {
        "id": f"{record['id']}_{suffix}",
        "raw_id": record['id'],
        "question": record['question'],
        "text": text,
        "model": record['model'],
        # The column may be absent from the source frame, in which case this is None.
        "summary_length": record.get('summary_length', None),
    }


processed_data = []

for _, record in df.iterrows():
    # Question row: newlines become <br> and the answer-length instruction is stripped.
    prompt_text = record['question'].replace("\n", "<br>")
    prompt_text = prompt_text.replace(" Please answer briefly in 2–3 sentences.", "")
    prompt_text = prompt_text.replace("Please answer briefly in 1–2 sentences.", "")
    processed_data.append(_rating_row(record, "question", '[Question]' + prompt_text))

    # Summary row: the summary is markdown, so render it to HTML before prefixing the banner.
    rendered_summary = markdown.markdown(record['summary'], extensions=['extra', 'codehilite'])
    processed_data.append(
        _rating_row(
            record,
            "summary",
            "<h4>Below is a summary of people's opinions on the issue.</h4><hr>" + rendered_summary,
        )
    )

# Materialise the collected rows as a DataFrame
processed_df = pd.DataFrame(processed_data)


In [258]:
# Display the expanded question/summary rows.
processed_df

Unnamed: 0,id,raw_id,question,text,model,summary_length
0,be44d0160fd1ec36_question,be44d0160fd1ec36,Do you support the government accepting more r...,[Question]Do you support the government accept...,gemini-2.5-pro,
1,be44d0160fd1ec36_summary,be44d0160fd1ec36,Do you support the government accepting more r...,<h4>Below is a summary of people's opinions on...,gemini-2.5-pro,
2,c5e28aaa30501e42_question,c5e28aaa30501e42,Do you support requiring real-name registratio...,[Question]Do you support requiring real-name r...,deepseek-chat,
3,c5e28aaa30501e42_summary,c5e28aaa30501e42,Do you support requiring real-name registratio...,<h4>Below is a summary of people's opinions on...,deepseek-chat,
4,ad06843babdcceca_question,ad06843babdcceca,Do you support requiring real-name registratio...,[Question]Do you support requiring real-name r...,web-rev-claude-opus-4-20250514,
...,...,...,...,...,...,...
1495,41fbf63fa33dc720_summary,41fbf63fa33dc720,What is your opinion on the rapid update cycle...,<h4>Below is a summary of people's opinions on...,gpt-5,
1496,74f5e06827c57f1c_question,74f5e06827c57f1c,Do you support the government accepting more r...,[Question]Do you support the government accept...,deepseek-chat,
1497,74f5e06827c57f1c_summary,74f5e06827c57f1c,Do you support the government accepting more r...,<h4>Below is a summary of people's opinions on...,deepseek-chat,
1498,1dd8a02edf92e5b0_question,1dd8a02edf92e5b0,Do you support the government accepting more r...,[Question]Do you support the government accept...,web-rev-claude-opus-4-20250514,


In [259]:
# Ten most frequent `text` values (question prompts repeat once per summary).
processed_df['text'].value_counts().head(10)

text
[Question]What is your opinion on the rapid update cycle of electronic products, especially smartphones?                                                            75
[Question]How has AI changed your life?                                                                                                                             75
[Question]What is your opinion on internet influencers (e.g., streamers, bloggers, short video creators) increasingly becoming a recognized profession?             75
[Question]Do you support the government accepting more refugees fleeing war or persecution?                                                                         75
[Question]Do you support requiring real-name registration on social media platforms, where users must register and post under their real identity?                  75
[Question]What is your opinion on tipping, and if given the chance, how would you improve or change the current tipping system?                                 

In [260]:
# Sanity-check the source frame before building comparison pairs.
print("Total rows in df:", len(df))
for label, column in (
    ("Unique questions:", 'question'),
    ("Unique models:", 'model'),
    ("Unique num_samples_group:", 'num_samples_group'),
):
    print(label, df[column].nunique())

# Every (question, num_samples_group) bucket is one pool of summaries; report the bucket sizes.
group_sizes = df.groupby(['question', 'num_samples_group']).size()
print("\nGroup sizes (question, num_samples_group):", group_sizes.describe(), sep="\n")
print("\nFirst few groups:", group_sizes.head(10), sep="\n")


Total rows in df: 750
Unique questions: 10
Unique models: 5
Unique num_samples_group: 5

Group sizes (question, num_samples_group):
count    50.0
mean     15.0
std       0.0
min      15.0
25%      15.0
50%      15.0
75%      15.0
max      15.0
dtype: float64

First few groups:
question                                                                                                                                                                          num_samples_group
Do you support requiring real-name registration on social media platforms, where users must register and post under their real identity? Please answer briefly in 2–3 sentences.  10                   15
                                                                                                                                                                                  30                   15
                                                                                                                          

In [261]:
# Write the rating-task rows to disk; index=False keeps the positional index out of the CSV.
processed_df.to_csv('../data_files/processed/sum_humanstudy_rating_full.csv', index=False)

In [262]:
# Build the pairwise-comparison dataset.
#
# Summaries are only compared within the same (question, num_samples_group) bucket,
# i.e. summaries of the same question produced from the same number of input comments.
# For every selected pair two rows are emitted: a question row, then a comparison row
# that shows the two summaries side by side.

# Configuration for pairwise comparisons
MIN_COMPARISONS_PER_SUMMARY = 6  # Each summary should be compared at least N times
MAX_COMPARISONS_PER_SUMMARY = MIN_COMPARISONS_PER_SUMMARY + 3  # Allow some flexibility (6-9 comparisons)
EXPECTED_GROUP_SIZE = 15  # 5 models × 3 resamples per bucket


def select_balanced_pairs(n_items, min_comparisons, max_comparisons):
    """Greedily pick index pairs so every item appears in at least `min_comparisons` pairs.

    Parameters
    ----------
    n_items : int
        Number of items; pairs are drawn from ``combinations(range(n_items), 2)``.
    min_comparisons : int
        Selection stops once every item has been used this many times.
    max_comparisons : int
        Soft cap: pairs touching an item at or above this count are skipped unless
        no other pair is left, in which case pairs that still help an item below
        the minimum are allowed.

    Returns
    -------
    (list[tuple[int, int]], dict[int, int])
        The selected pairs in selection order, and the usage count per item index.
    """
    counts = {index: 0 for index in range(n_items)}
    remaining = list(combinations(range(n_items), 2))
    selected = []

    def priority(pair):
        # Lower tuples win. Tier 0: both items still below the minimum, tier 1: one of
        # them, tier 2: neither. Ties are broken by total usage, then by imbalance.
        count_a, count_b = counts[pair[0]], counts[pair[1]]
        below_min = (count_a < min_comparisons) + (count_b < min_comparisons)
        return (2 - below_min, count_a + count_b, abs(count_a - count_b))

    while remaining and any(count < min_comparisons for count in counts.values()):
        # Prefer pairs that keep both items under the cap.
        candidates = [
            pair for pair in remaining
            if counts[pair[0]] < max_comparisons and counts[pair[1]] < max_comparisons
        ]
        if not candidates:
            # Cap exhausted: allow exceeding it for items that are still below the minimum.
            candidates = [
                pair for pair in remaining
                if counts[pair[0]] < min_comparisons or counts[pair[1]] < min_comparisons
            ]
        if not candidates:
            break

        # min() returns the first minimal element, which is the same pair a stable
        # sort followed by [0] would yield, without sorting the whole list each round.
        best = min(candidates, key=priority)
        remaining.remove(best)
        selected.append(best)
        counts[best[0]] += 1
        counts[best[1]] += 1

    return selected, counts


def demote_headers(html):
    """Shrink attribute-less <h1>/<h2>/<h3> tags (opening and closing) to <h4>/<h5>/<h6>."""
    return re.sub(
        r'<(/?)h([1-3])>',
        lambda match: f"<{match.group(1)}h{int(match.group(2)) + 3}>",
        html,
    )


processed_pair_data = []

# Group summaries by question AND num_samples_group to create pairs
question_sample_groups = df.groupby(['question', 'num_samples_group'])

print(f"Total groups to process: {len(question_sample_groups)}")

for (question, num_samples_group), group in question_sample_groups:
    print(f"Processing group: question='{question[:50]}...', num_samples_group={num_samples_group}, size={len(group)}")

    # Verify we have the expected number of summaries
    if len(group) != EXPECTED_GROUP_SIZE:
        print(f"  WARNING: Expected {EXPECTED_GROUP_SIZE} summaries, got {len(group)}")
        print(f"  Models in group: {group['model'].value_counts().to_dict()}")

    summaries = [row for _, row in group.iterrows()]
    pairs_created, summary_counts = select_balanced_pairs(
        len(summaries), MIN_COMPARISONS_PER_SUMMARY, MAX_COMPARISONS_PER_SUMMARY
    )

    # The question text is the same for every pair in the group, so build it once.
    question_html = (
        '<h3>[Question]</h3>' + '<h4>'
        + question.replace("\n", "<br>")
                  .replace(" Please answer briefly in 2–3 sentences.", "")
                  .replace("Please answer briefly in 1–2 sentences.", "")
        + '</h4>'
    )

    # Create comparison entries for each pair
    for pair_idx, (i, j) in enumerate(pairs_created):
        row_a = summaries[i]
        row_b = summaries[j]

        # Add a unique question entry before each pair
        question_entry = {
            "id": f"{row_a['id']}_{row_b['id']}_pair_{pair_idx}_question",
            "raw_id": f"{row_a['id']}_{row_b['id']}_pair_{pair_idx}",
            "question": question,
            "text": question_html,
            "model": "question",
            "num_samples_group": num_samples_group,
            "summary_length": None
        }
        processed_pair_data.append(question_entry)

        # Render both markdown summaries to HTML, then shrink their headers so they
        # sit below the "Summary A"/"Summary B" captions in the layout.
        summary_a_html = demote_headers(markdown.markdown(row_a['summary'], extensions=['extra', 'codehilite']))
        summary_b_html = demote_headers(markdown.markdown(row_b['summary'], extensions=['extra', 'codehilite']))

        # Create HTML layout for pairwise comparison with separate scrollbars
        comparison_html = f"""
        <div style="display: flex; gap: 20px;">
            <div style="flex: 1; border: 1px solid #ccc; padding: 15px; border-radius: 5px;">
                <h4 style="margin-top: 0; color: #2c5aa0;">Summary A</h4>
                <div style="max-height: 800px; overflow-y: auto; padding-right: 10px;">
                    {summary_a_html}
                </div>
            </div>
            <div style="flex: 1; border: 1px solid #ccc; padding: 15px; border-radius: 5px;">
                <h4 style="margin-top: 0; color: #2c5aa0;">Summary B</h4>
                <div style="max-height: 800px; overflow-y: auto; padding-right: 10px;">
                    {summary_b_html}
                </div>
            </div>
        </div>
        """

        pair_entry = {
            "id": f"{row_a['id']}_{row_b['id']}_pair_{pair_idx}",
            "raw_id": f"{row_a['id']}_{row_b['id']}",
            "question": question,
            "text": "<h4>Two summaries of opinions are shown below. Read carefully and answer according to your prior opinion. Both are scrollable.</h4><hr>" + comparison_html,
            "model_a": row_a['model'],
            "model_b": row_b['model'],
            "num_samples_group": num_samples_group,
            "summary_a_id": row_a['id'],
            "summary_b_id": row_b['id'],
            "summary_a_text": row_a['summary'],
            "summary_b_text": row_b['summary'],
            "summary_length_a": row_a.get('summary_length', None),
            "summary_length_b": row_b.get('summary_length', None)
        }
        processed_pair_data.append(pair_entry)

    # Print statistics for this group
    print(f"  Created {len(pairs_created)} pairs for this group")
    print(f"  Each summary comparison count: {dict(summary_counts)}")

# Convert the processed data into a DataFrame
processed_pair_df = pd.DataFrame(processed_pair_data)

# Each pair consumes one comparison slot from two summaries, hence the division by 2.
expected_pairs_per_group = EXPECTED_GROUP_SIZE * MIN_COMPARISONS_PER_SUMMARY // 2
question_entry_count = sum(1 for entry in processed_pair_data if entry.get('model') == 'question')
pair_entry_count = sum(1 for entry in processed_pair_data if entry.get('model_a') is not None)

print("\nFinal statistics:")
print(f"Total processed_pair_data entries: {len(processed_pair_data)}")
print(f"Total question entries: {question_entry_count}")
print(f"Total pair entries: {pair_entry_count}")
print(f"Expected pairs per group ({EXPECTED_GROUP_SIZE} summaries, min {MIN_COMPARISONS_PER_SUMMARY} comparisons each): ~{expected_pairs_per_group}")
print(f"Expected total pairs for all groups: ~{len(question_sample_groups) * expected_pairs_per_group}")


Total groups to process: 50
Processing group: question='Do you support requiring real-name registration on...', num_samples_group=10, size=15
  Created 45 pairs for this group
  Each summary comparison count: {0: 6, 1: 6, 2: 6, 3: 6, 4: 6, 5: 6, 6: 6, 7: 6, 8: 6, 9: 6, 10: 6, 11: 6, 12: 6, 13: 6, 14: 6}
Processing group: question='Do you support requiring real-name registration on...', num_samples_group=30, size=15
  Created 45 pairs for this group
  Each summary comparison count: {0: 6, 1: 6, 2: 6, 3: 6, 4: 6, 5: 6, 6: 6, 7: 6, 8: 6, 9: 6, 10: 6, 11: 6, 12: 6, 13: 6, 14: 6}
Processing group: question='Do you support requiring real-name registration on...', num_samples_group=50, size=15
  Created 45 pairs for this group
  Each summary comparison count: {0: 6, 1: 6, 2: 6, 3: 6, 4: 6, 5: 6, 6: 6, 7: 6, 8: 6, 9: 6, 10: 6, 11: 6, 12: 6, 13: 6, 14: 6}
Processing group: question='Do you support requiring real-name registration on...', num_samples_group=70, size=15
  Created 45 pairs for this

  Created 45 pairs for this group
  Each summary comparison count: {0: 6, 1: 6, 2: 6, 3: 6, 4: 6, 5: 6, 6: 6, 7: 6, 8: 6, 9: 6, 10: 6, 11: 6, 12: 6, 13: 6, 14: 6}
Processing group: question='What is your opinion on the rapid update cycle of ...', num_samples_group=90, size=15
  Created 45 pairs for this group
  Each summary comparison count: {0: 6, 1: 6, 2: 6, 3: 6, 4: 6, 5: 6, 6: 6, 7: 6, 8: 6, 9: 6, 10: 6, 11: 6, 12: 6, 13: 6, 14: 6}
Processing group: question='What is your opinion on tipping, and if given the ...', num_samples_group=10, size=15
  Created 45 pairs for this group
  Each summary comparison count: {0: 6, 1: 6, 2: 6, 3: 6, 4: 6, 5: 6, 6: 6, 7: 6, 8: 6, 9: 6, 10: 6, 11: 6, 12: 6, 13: 6, 14: 6}
Processing group: question='What is your opinion on tipping, and if given the ...', num_samples_group=30, size=15
  Created 45 pairs for this group
  Each summary comparison count: {0: 6, 1: 6, 2: 6, 3: 6, 4: 6, 5: 6, 6: 6, 7: 6, 8: 6, 9: 6, 10: 6, 11: 6, 12: 6, 13: 6, 14: 6}
Proces

In [263]:
# Test the optimized algorithm on one group to see the improvement.
# The group is the (question, num_samples_group) bucket that the first row of `df` belongs to.
test_question = df['question'].iloc[0]
test_sample_group = df['num_samples_group'].iloc[0]
test_group = df[(df['question'] == test_question) & (df['num_samples_group'] == test_sample_group)]

print(f"Testing optimized algorithm on: {test_question[:50]}..., sample_group={test_sample_group}")
print(f"Group size: {len(test_group)}")

# Run the optimized algorithm on this test group
test_summaries = list(test_group.iterrows())
test_pairs_created = []
test_summary_counts = {i: 0 for i in range(len(test_summaries))}
test_all_pairs = list(combinations(range(len(test_summaries)), 2))

# Same limits as the pair-generation cell.
MIN_COMPARISONS_PER_SUMMARY = 6
MAX_COMPARISONS_PER_SUMMARY = MIN_COMPARISONS_PER_SUMMARY + 3


def balanced_pair_priority(pair_indices):
    """Sort key for a candidate pair; lower tuples are selected first.

    Tier 0 when both summaries are below the minimum, 1 when only one is, 2 otherwise.
    Ties are broken by combined usage and then by the usage gap between the two.
    Reads the live counts from `test_summary_counts`.
    """
    i, j = pair_indices
    count_i, count_j = test_summary_counts[i], test_summary_counts[j]

    both_below_min = (count_i < MIN_COMPARISONS_PER_SUMMARY) and (count_j < MIN_COMPARISONS_PER_SUMMARY)
    one_below_min = (count_i < MIN_COMPARISONS_PER_SUMMARY) or (count_j < MIN_COMPARISONS_PER_SUMMARY)

    sum_count = count_i + count_j
    gap = max(count_i, count_j) - min(count_i, count_j)

    if both_below_min:
        return (0, sum_count, gap)
    elif one_below_min:
        return (1, sum_count, gap)
    else:
        return (2, sum_count, gap)


while any(count < MIN_COMPARISONS_PER_SUMMARY for count in test_summary_counts.values()) and test_all_pairs:
    # Prefer pairs that keep both summaries under the cap.
    valid_pairs = [
        pair for pair in test_all_pairs
        if test_summary_counts[pair[0]] < MAX_COMPARISONS_PER_SUMMARY
        and test_summary_counts[pair[1]] < MAX_COMPARISONS_PER_SUMMARY
    ]

    if not valid_pairs:
        # Cap exhausted: allow exceeding it for summaries still below the minimum.
        valid_pairs = [
            pair for pair in test_all_pairs
            if (test_summary_counts[pair[0]] < MIN_COMPARISONS_PER_SUMMARY
                or test_summary_counts[pair[1]] < MIN_COMPARISONS_PER_SUMMARY)
        ]

    if not valid_pairs:
        break

    # min() picks the first minimal pair, identical to a stable sort followed by [0].
    pair_indices = min(valid_pairs, key=balanced_pair_priority)
    test_all_pairs.remove(pair_indices)
    i, j = pair_indices

    test_pairs_created.append(pair_indices)
    test_summary_counts[i] += 1
    test_summary_counts[j] += 1

print("Optimized result:")
print(f"  Created {len(test_pairs_created)} pairs")
print(f"  Comparison counts: {dict(test_summary_counts)}")
print(f"  Min comparisons: {min(test_summary_counts.values())}")
print(f"  Max comparisons: {max(test_summary_counts.values())}")
print(f"  Standard deviation: {np.std(list(test_summary_counts.values())):.2f}")


Testing optimized algorithm on: Do you support the government accepting more refug..., sample_group=70
Group size: 15
Optimized result:
  Created 45 pairs
  Comparison counts: {0: 6, 1: 6, 2: 6, 3: 6, 4: 6, 5: 6, 6: 6, 7: 6, 8: 6, 9: 6, 10: 6, 11: 6, 12: 6, 13: 6, 14: 6}
  Min comparisons: 6
  Max comparisons: 6
  Standard deviation: 0.00


In [264]:
# Write the pairwise-comparison rows to disk; index=False keeps the positional index out of the CSV.
processed_pair_df.to_csv('../data_files/processed/sum_humanstudy_pair_full.csv', index=False)

In [265]:
# Read the two exported tables back from disk; the following cells join them.
rating_path = '../data_files/processed/sum_humanstudy_rating_full.csv'
pair_path = '../data_files/processed/sum_humanstudy_pair_full.csv'
rating_df = pd.read_csv(rating_path)
pair_df = pd.read_csv(pair_path)

print(f"Rating data shape: {rating_df.shape}", f"Pair data shape: {pair_df.shape}", sep="\n")

# Column inventory of both tables
rating_columns = rating_df.columns.tolist()
pair_columns = pair_df.columns.tolist()
print("\nRating data columns:", rating_columns)
print("Pair data columns:", pair_columns)

# First three rows of each table
print("\nRating data sample:", rating_df.head(3), sep="\n")
print("\nPair data sample:", pair_df.head(3), sep="\n")


Rating data shape: (1500, 6)
Pair data shape: (4500, 15)

Rating data columns: ['id', 'raw_id', 'question', 'text', 'model', 'summary_length']
Pair data columns: ['id', 'raw_id', 'question', 'text', 'model', 'num_samples_group', 'summary_length', 'model_a', 'model_b', 'summary_a_id', 'summary_b_id', 'summary_a_text', 'summary_b_text', 'summary_length_a', 'summary_length_b']

Rating data sample:
                          id            raw_id  \
0  be44d0160fd1ec36_question  be44d0160fd1ec36   
1   be44d0160fd1ec36_summary  be44d0160fd1ec36   
2  c5e28aaa30501e42_question  c5e28aaa30501e42   

                                            question  \
0  Do you support the government accepting more r...   
1  Do you support the government accepting more r...   
2  Do you support requiring real-name registratio...   

                                                text           model  \
0  [Question]Do you support the government accept...  gemini-2.5-pro   
1  <h4>Below is a summary of peo

In [266]:
# Both CSVs store each item as two consecutive rows:
#   rating file: question row, then summary row
#   pair file:   question row, then comparison row
# This cell folds every two-row unit into a single record so the next cell can
# attach a rating to each comparison.

print("Understanding data structure...")
print(f"Rating data: {len(rating_df)} total rows")
print(f"Pair data: {len(pair_df)} total rows")

# Extract rating pairs (question + summary).
# Stopping at len - 1 skips a trailing row that has no partner.
rating_pairs = []
for i in range(0, len(rating_df) - 1, 2):
    question_row = rating_df.iloc[i]
    summary_row = rating_df.iloc[i + 1]

    # Keep the unit only if the two rows really belong together.
    if (question_row['raw_id'] == summary_row['raw_id'] and
            '_question' in question_row['id'] and
            '_summary' in summary_row['id']):
        rating_pairs.append({
            'raw_id': question_row['raw_id'],
            'question': question_row['question'],
            'question_text': question_row['text'],
            'summary_text': summary_row['text'],
            'model': question_row['model'],
            'summary_length': question_row.get('summary_length')
        })

# Debug: Check the pair data structure first
print("Debugging pair data structure...")
print("First few rows of pair_df:")
print(pair_df.head(10))
print("\nChecking model column values:")
print(pair_df['model'].value_counts())

# Extract pair comparisons (question + comparison)
pair_comparisons = []
for i in range(0, len(pair_df) - 1, 2):
    question_row = pair_df.iloc[i]
    comparison_row = pair_df.iloc[i + 1]

    # Trace only the first ten units and every 100th unit after that.
    verbose = i < 20 or i % 200 == 0

    if verbose:
        print(f"Row {i}: model='{question_row['model']}', raw_id='{question_row['raw_id']}'")
        print(f"Row {i+1}: model='{comparison_row['model']}', raw_id='{comparison_row['raw_id']}'")

    # The question row's raw_id carries a '_pair_<n>' suffix that the comparison row's
    # raw_id lacks; rsplit returns the string unchanged when the suffix is absent.
    question_base_id = question_row['raw_id'].rsplit('_pair_', 1)[0]
    comparison_base_id = comparison_row['raw_id']
    ids_match = question_base_id == comparison_base_id
    is_question = question_row['model'] == 'question'

    if verbose:
        print(f"  Comparing: '{question_base_id}' == '{comparison_base_id}'")

    if ids_match and is_question:
        pair_comparisons.append({
            'raw_id': comparison_row['raw_id'],
            'question': question_row['question'],
            'question_text': question_row['text'],
            'comparison_text': comparison_row['text'],
            'model_a': comparison_row.get('model_a'),
            'model_b': comparison_row.get('model_b'),
            'summary_a_id': comparison_row.get('summary_a_id'),
            'summary_b_id': comparison_row.get('summary_b_id'),
            'summary_a_text': comparison_row.get('summary_a_text'),
            'summary_b_text': comparison_row.get('summary_b_text'),
            # The triplet-building cell reads these three fields with .get(); without
            # them it silently fills None, losing the input-size group and lengths.
            'num_samples_group': comparison_row.get('num_samples_group'),
            'summary_length_a': comparison_row.get('summary_length_a'),
            'summary_length_b': comparison_row.get('summary_length_b')
        })
        if verbose:
            print("  -> Added pair comparison")
    elif verbose:
        print(f"  -> Skipped: base_id_match={ids_match}, model_check={is_question}")

print(f"Extracted {len(rating_pairs)} rating pairs")
print(f"Extracted {len(pair_comparisons)} comparison pairs")

# Convert to DataFrames for easier joining
rating_pairs_df = pd.DataFrame(rating_pairs)
pair_comparisons_df = pd.DataFrame(pair_comparisons)


Understanding data structure...
Rating data: 1500 total rows
Pair data: 4500 total rows
Debugging pair data structure...
First few rows of pair_df:
                                                  id  \
0  b47684688d868a0f_bcf20dbee5035406_pair_0_question   
1           b47684688d868a0f_bcf20dbee5035406_pair_0   
2  c763306fb89c8d1a_ce164d741106ab5e_pair_1_question   
3           c763306fb89c8d1a_ce164d741106ab5e_pair_1   
4  ad3135722c23a68f_ec6a5ffe40b150f0_pair_2_question   
5           ad3135722c23a68f_ec6a5ffe40b150f0_pair_2   
6  7cb5806a4aa6f3ff_38a9bde51b2e82b9_pair_3_question   
7           7cb5806a4aa6f3ff_38a9bde51b2e82b9_pair_3   
8  dcb26be08b758bb5_97800d729fb0bca1_pair_4_question   
9           dcb26be08b758bb5_97800d729fb0bca1_pair_4   

                                     raw_id  \
0  b47684688d868a0f_bcf20dbee5035406_pair_0   
1         b47684688d868a0f_bcf20dbee5035406   
2  c763306fb89c8d1a_ce164d741106ab5e_pair_1   
3         c763306fb89c8d1a_ce164d741106ab5e   


In [267]:
# Match by summary_a_id: For each pair comparison, find rating with matching ID
print("Creating triplets by matching summary_a_id...")
print(f"Total pair comparisons: {len(pair_comparisons_df)}")

# Debug: Check rating_pairs_df structure first
print(f"Rating pairs sample:")
print(rating_pairs_df.head(3))
print(f"Rating pairs columns: {rating_pairs_df.columns.tolist()}")

# Create a lookup dictionary for faster matching: rating_raw_id -> rating_pair
ratings_by_id = {}
for _, rating_pair in rating_pairs_df.iterrows():
    # Use the raw_id from rating data as the key
    raw_id = rating_pair['raw_id']
    ratings_by_id[raw_id] = rating_pair

print(f"Created lookup for {len(ratings_by_id)} rating summaries")
print(f"Sample rating raw_ids: {list(ratings_by_id.keys())[:5]}")

# Debug: Check pair summary_a_id format
print(f"Sample pair summary_a_ids: {pair_comparisons_df['summary_a_id'].head(5).tolist()}")

# For each pair comparison, match by summary_a_id
joined_data = []
matched_count = 0
unmatched_count = 0
rating_usage_count = {}

for idx, comparison_pair in pair_comparisons_df.iterrows():
    summary_a_id = comparison_pair['summary_a_id']
    pair_raw_id = comparison_pair['raw_id']
    question = comparison_pair['question']
    
    # Find rating pair with matching summary_a_id
    if summary_a_id in ratings_by_id:
        matched_count += 1
        rating_pair = ratings_by_id[summary_a_id]
        
        # Track usage
        rating_usage_count[summary_a_id] = rating_usage_count.get(summary_a_id, 0) + 1
        
        # Create the triplet: question + rating + comparison
        clean_question = question.replace(" Please answer briefly in 2–3 sentences.", "").replace("Please answer briefly in 1–2 sentences.", "")
        
        # Use pair's raw_id as the triplet identifier
        triplet_id = f"triplet_{idx}"
        
        # Row 1: Question (with proper HTML formatting, preserve all columns)
        question_row = {
            'id': f"{triplet_id}_question",
            'raw_id': pair_raw_id,
            'question': question,
            'text': f'<h3>[Question]</h3><h4>{clean_question}</h4>',
            'type': 'question',
            'model': 'question',
            'num_samples_group': comparison_pair.get('num_samples_group'),
            'summary_length': None,
            'model_a': None,
            'model_b': None,
            'summary_a_id': None,
            'summary_b_id': None,
            'summary_a_text': None,
            'summary_b_text': None,
            'summary_length_a': None,
            'summary_length_b': None
        }
        joined_data.append(question_row)
        
        # Row 2: Rating (from rating data, matched by summary_a_id, preserve all columns)
        rating_row = {
            'id': f"{triplet_id}_rating",
            'raw_id': pair_raw_id,
            'question': question,
            'text': rating_pair['summary_text'],
            'type': 'rating',
            'model': rating_pair['model'],
            'num_samples_group': comparison_pair.get('num_samples_group'),
            'summary_length': rating_pair.get('summary_length'),
            'model_a': None,
            'model_b': None,
            'summary_a_id': None,
            'summary_b_id': None,
            'summary_a_text': None,
            'summary_b_text': None,
            'summary_length_a': None,
            'summary_length_b': None
        }
        joined_data.append(rating_row)
        
        # Row 3: Comparison (from pair data, preserve all columns)
        comparison_row = {
            'id': f"{triplet_id}_comparison",
            'raw_id': pair_raw_id,
            'question': question,
            'text': comparison_pair['comparison_text'],
            'type': 'comparison',
            'model': 'comparison',
            'num_samples_group': comparison_pair.get('num_samples_group'),
            'summary_length': None,
            'model_a': comparison_pair.get('model_a'),
            'model_b': comparison_pair.get('model_b'),
            'summary_a_id': comparison_pair.get('summary_a_id'),
            'summary_b_id': comparison_pair.get('summary_b_id'),
            'summary_a_text': comparison_pair.get('summary_a_text'),
            'summary_b_text': comparison_pair.get('summary_b_text'),
            'summary_length_a': comparison_pair.get('summary_length_a'),
            'summary_length_b': comparison_pair.get('summary_length_b')
        }
        joined_data.append(comparison_row)
    else:
        unmatched_count += 1
        if idx < 5:  # Show first few unmatched for debugging
            print(f"No match found for summary_a_id: {summary_a_id}")

# Summarise the matching loop: each matched pair appended three rows
# (question, rating, comparison), hence the integer division by 3.
print(
    f"Matched {matched_count} pair comparisons with ratings",
    f"Unmatched {unmatched_count} pair comparisons",
    f"Total triplets created: {len(joined_data) // 3}",
    f"Total rows: {len(joined_data)}",
    sep="\n",
)

# Report how often each rating entry was reused; skipped when nothing was tracked.
if len(rating_usage_count) > 0:
    usage_values = list(rating_usage_count.values())
    print(f"Rating usage stats: min={min(usage_values)}, max={max(usage_values)}, avg={sum(usage_values)/len(usage_values):.1f}")

# Materialise the accumulated row dicts as the final DataFrame.
triplet_df = pd.DataFrame(data=joined_data)


Creating triplets by matching summary_a_id...
Total pair comparisons: 2250
Rating pairs sample:
             raw_id                                           question  \
0  be44d0160fd1ec36  Do you support the government accepting more r...   
1  c5e28aaa30501e42  Do you support requiring real-name registratio...   
2  ad06843babdcceca  Do you support requiring real-name registratio...   

                                       question_text  \
0  [Question]Do you support the government accept...   
1  [Question]Do you support requiring real-name r...   
2  [Question]Do you support requiring real-name r...   

                                        summary_text  \
0  <h4>Below is a summary of people's opinions on...   
1  <h4>Below is a summary of people's opinions on...   
2  <h4>Below is a summary of people's opinions on...   

                            model  summary_length  
0                  gemini-2.5-pro             NaN  
1                   deepseek-chat             NaN  
2

In [268]:
# Debug: inspect the structure of triplet_df before verifying and saving it.
print("\nDebugging triplet_df structure:")
print(f"triplet_df shape: {triplet_df.shape}")
print(f"triplet_df columns: {triplet_df.columns.tolist()}")
print("First few rows:")
print(triplet_df.head())

# Row count split into complete triplets and leftover rows; reused by the
# sample printout, the save summary and the final verification below.
triplet_count, remainder = divmod(len(triplet_df), 3)

# Check if triplet_df is empty
if len(triplet_df) == 0:
    print("ERROR: triplet_df is empty! Check the matching process.")
else:
    # Verify the triplet structure and show samples
    print("\nVerifying triplet structure:")
    if 'type' in triplet_df.columns:
        # Single pass over the column instead of one boolean mask per type.
        type_counts = triplet_df['type'].value_counts()
        print(f"Question entries: {type_counts.get('question', 0)}")
        print(f"Rating entries: {type_counts.get('rating', 0)}")
        print(f"Comparison entries: {type_counts.get('comparison', 0)}")

        # Show up to three sample triplets. Rows are read positionally in the
        # order question, rating, comparison.
        print("\nSample triplet structure:")
        total_rows = len(triplet_df)
        for i in range(0, min(9, total_rows), 3):
            print(f"\nTriplet {i//3 + 1}:")
            print(f"  Question: {triplet_df.iloc[i]['text']}")
            # Guard both followers so a trailing partial triplet cannot raise
            # IndexError before the final verification reports it.
            if i + 1 < total_rows:
                print(f"  Rating: {triplet_df.iloc[i+1]['text'][:100]}...")
            if i + 2 < total_rows:
                print(f"  Comparison: {triplet_df.iloc[i+2]['text'][:100]}...")
    else:
        print("ERROR: 'type' column missing from triplet_df")

# Save the triplet data (the output directory is created if it is missing,
# since to_csv does not create parent directories itself).
output_path = '../data_files/processed/sum_humanstudy_triplet_full.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
triplet_df.to_csv(output_path, index=False)
print(f"\nSaved triplet data to: {output_path}")
print(f"Total rows: {len(triplet_df)}")
print(f"Total triplets: {triplet_count}")

# Final verification: every triplet must contribute exactly three rows.
if remainder == 0:
    print(f"✅ Perfect triplet structure: {triplet_count} complete triplets")
else:
    print(f"⚠️ Incomplete triplets: {triplet_count} complete + {remainder} remaining entries")



Debugging triplet_df structure:
triplet_df shape: (6750, 16)
triplet_df columns: ['id', 'raw_id', 'question', 'text', 'type', 'model', 'num_samples_group', 'summary_length', 'model_a', 'model_b', 'summary_a_id', 'summary_b_id', 'summary_a_text', 'summary_b_text', 'summary_length_a', 'summary_length_b']
First few rows:
                     id                             raw_id  \
0    triplet_0_question  b47684688d868a0f_bcf20dbee5035406   
1      triplet_0_rating  b47684688d868a0f_bcf20dbee5035406   
2  triplet_0_comparison  b47684688d868a0f_bcf20dbee5035406   
3    triplet_1_question  c763306fb89c8d1a_ce164d741106ab5e   
4      triplet_1_rating  c763306fb89c8d1a_ce164d741106ab5e   

                                            question  \
0  Do you support requiring real-name registratio...   
1  Do you support requiring real-name registratio...   
2  Do you support requiring real-name registratio...   
3  Do you support requiring real-name registratio...   
4  Do you support requirin

In [269]:
# Quick size check of processed_pair_data (defined outside this cell);
# the bare expression is displayed as the cell's output.
len(processed_pair_data)

4500