In [10]:
import pandas as pd
import json

# Load the RACE test split from its single parquet shard.
race_df = pd.read_parquet(path='./datasets/race/test-00000-of-00001.parquet')

# Report the number of rows that were read.
print(f"RACE test samples: {len(race_df)}")

RACE test samples: 3498


In [11]:
# Preview the first five rows to check the column layout.
race_df.head(n=5)

Unnamed: 0,example_id,article,answer,question,options
0,high19432.txt,The rain had continued for a week and the floo...,C,What did Nancy try to do before she fell over?,"[Measure the depth of the river, Look for a fa..."
1,high19432.txt,The rain had continued for a week and the floo...,D,The following are true according to the passag...,[It took Lizzie and Nancy about 20 minutes to ...
2,high19432.txt,The rain had continued for a week and the floo...,A,What did the local people do to help those in ...,"[They put up shelter for them in a school., Th..."
3,high6268.txt,There is probably no field of human activity i...,B,The passage tells us that _ .,[our values and lifestyles are in no field of ...
4,high6268.txt,There is probably no field of human activity i...,B,"Traditionally,people usually thought that _ .","[men cared very much for clothes, women were c..."


In [12]:
# NOTE(review): this cell repeats the preview from the previous cell and can be removed.
race_df.head(n=5)

Unnamed: 0,example_id,article,answer,question,options
0,high19432.txt,The rain had continued for a week and the floo...,C,What did Nancy try to do before she fell over?,"[Measure the depth of the river, Look for a fa..."
1,high19432.txt,The rain had continued for a week and the floo...,D,The following are true according to the passag...,[It took Lizzie and Nancy about 20 minutes to ...
2,high19432.txt,The rain had continued for a week and the floo...,A,What did the local people do to help those in ...,"[They put up shelter for them in a school., Th..."
3,high6268.txt,There is probably no field of human activity i...,B,The passage tells us that _ .,[our values and lifestyles are in no field of ...
4,high6268.txt,There is probably no field of human activity i...,B,"Traditionally,people usually thought that _ .","[men cared very much for clothes, women were c..."


In [13]:
import random

# Number of articles to select (tune as needed)
N = 100

# Seed Python's global RNG so any later random draws are repeatable.
# NOTE(review): none of the visible cells draws from `random`; the selection
# in this notebook is count-based, so this seed does not influence it.
random.seed(42)

def select_samples_by_question_count(df, n_samples):
    """
    Pick the example_ids that have the largest number of rows (questions).

    Args:
        df: DataFrame with an 'example_id' column; each row counts as one question.
        n_samples: How many distinct example_ids to keep.

    Returns:
        A tuple (rows, ids): every row of df whose example_id was selected
        (in df's original row order), and the list of selected example_ids
        ordered from most to fewest questions.
    """
    # One row per example_id together with its number of question rows
    per_article = (
        df.groupby('example_id')
        .size()
        .reset_index(name='question_count')
    )

    # Keep the n_samples ids with the highest counts
    selected_ids = (
        per_article
        .nlargest(n_samples, 'question_count')['example_id']
        .tolist()
    )

    keep_mask = df['example_id'].isin(selected_ids)
    return df[keep_mask], selected_ids

# Keep the N articles with the most questions from the RACE test split
race_samples, race_ids = select_samples_by_question_count(df=race_df, n_samples=N)

print(f"Selected {len(race_ids)} unique articles from RACE (with most questions)")

Selected 100 unique articles from RACE (with most questions)


In [14]:
def transform_to_format(df, source_name, start_id=1):
    """
    Build one record per distinct example_id, keeping only the article text.

    Questions, answers and options are dropped; each record has the shape
    {'id': <int>, 'source': source_name, 'content': <article>}.

    Args:
        df: DataFrame with 'example_id' and 'article' columns.
        source_name: Value stored under 'source' in every record.
        start_id: Id given to the first record; later records count up by one.

    Returns:
        A tuple (records, next_id) where next_id is the first id not used.
    """
    # groupby yields one (example_id, rows) pair per article; the article text
    # is taken from the first row of each group.
    records = [
        {
            'id': record_id,
            'source': source_name,
            'content': rows.iloc[0]['article'],
        }
        for record_id, (_, rows) in enumerate(df.groupby('example_id'), start=start_id)
    ]

    return records, start_id + len(records)

# Convert the selected RACE rows into id/source/content records, numbering from 1
race_results, next_id = transform_to_format(df=race_samples, source_name='race', start_id=1)

# RACE is the only source here, so the combined list is the RACE list itself
all_results = race_results

print(f"Total samples: {len(all_results)}")
print(f"RACE samples: {len(race_results)}")

Total samples: 100
RACE samples: 100


In [15]:
# Show the first record as pretty-printed JSON to check the output shape
import json

print("Sample output:")
first_record_json = json.dumps(all_results[0], ensure_ascii=False, indent=2)
print(first_record_json)

Sample output:
{
  "id": 1,
  "source": "race",
  "content": "Studies show that you may be lied to every day anywhere from 10 to 200 times. We say, \"Nice song.\" \"Honey, you don't look fat in that, no.\" But another study showed that strangers lied three times within the first 10 minutes of meeting each other. We lie more to strangers than we lie to coworkers. Men lie eight times more about themselves than they do other people. Women lie more to protect other people. If you're married, you're going to lie to your wife/ husband in one out of every 10 communications. If you're unmarried, that number drops to three. But look, if at some point you got lied to, it's because you agreed to get lied to. Truth about lying: lying's a cooperative act. Not all lies are harmful. Sometimes we're willing to lie for the sake of social dignity  , maybe to keep a private secret.\nLying is complex. It's woven into the fabric of our daily and business lives. We're deeply disturbed by the truth. We expla

In [16]:
# Print the first three records, each under a numbered heading
print("\nSample RACE outputs:")
for i, result in enumerate(all_results[:3]):
    print(f"\nSample {i+1}:")
    pretty = json.dumps(result, ensure_ascii=False, indent=2)
    print(pretty)


Sample RACE outputs:

Sample 1:
{
  "id": 1,
  "source": "race",
  "content": "Studies show that you may be lied to every day anywhere from 10 to 200 times. We say, \"Nice song.\" \"Honey, you don't look fat in that, no.\" But another study showed that strangers lied three times within the first 10 minutes of meeting each other. We lie more to strangers than we lie to coworkers. Men lie eight times more about themselves than they do other people. Women lie more to protect other people. If you're married, you're going to lie to your wife/ husband in one out of every 10 communications. If you're unmarried, that number drops to three. But look, if at some point you got lied to, it's because you agreed to get lied to. Truth about lying: lying's a cooperative act. Not all lies are harmful. Sometimes we're willing to lie for the sake of social dignity  , maybe to keep a private secret.\nLying is complex. It's woven into the fabric of our daily and business lives. We're deeply disturbed by t

In [17]:
# One line per record: its id, source and article length in characters
print("\n=== Summary Statistics ===")
for result in all_results:
    summary_line = f"ID: {result['id']}, Source: {result['source']}, Content length: {len(result['content'])} chars"
    print(summary_line)


=== Summary Statistics ===
ID: 1, Source: race, Content length: 2462 chars
ID: 2, Source: race, Content length: 3408 chars
ID: 3, Source: race, Content length: 1168 chars
ID: 4, Source: race, Content length: 4244 chars
ID: 5, Source: race, Content length: 1664 chars
ID: 6, Source: race, Content length: 2101 chars
ID: 7, Source: race, Content length: 1696 chars
ID: 8, Source: race, Content length: 1548 chars
ID: 9, Source: race, Content length: 1656 chars
ID: 10, Source: race, Content length: 1490 chars
ID: 11, Source: race, Content length: 4164 chars
ID: 12, Source: race, Content length: 1052 chars
ID: 13, Source: race, Content length: 2375 chars
ID: 14, Source: race, Content length: 2140 chars
ID: 15, Source: race, Content length: 1468 chars
ID: 16, Source: race, Content length: 2392 chars
ID: 17, Source: race, Content length: 1440 chars
ID: 18, Source: race, Content length: 1158 chars
ID: 19, Source: race, Content length: 1862 chars
ID: 20, Source: race, Content length: 3611 chars
I

In [18]:
# Save results to JSON file
import json
import os

output_path = './datasets/unified/data.json'

# open(..., 'w') does not create missing parent directories, so make sure the
# target folder exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# ensure_ascii=False keeps non-ASCII characters readable in the file, which is
# why the file is opened explicitly as UTF-8.
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

print(f"Saved {len(all_results)} samples to {output_path}")

Saved 100 samples to ./datasets/unified/data.json
