In [1]:
import pandas as pd

# Load the RACE-H and RACE-M parquet files into DataFrames.
# NOTE(review): the files are named `test-00000-of-00001`, presumably the test split — confirm.
race_h_df = pd.read_parquet('./datasets/race-h/test-00000-of-00001.parquet')
race_m_df = pd.read_parquet('./datasets/race-m/test-00000-of-00001.parquet')

In [2]:
# Preview the first rows of the RACE-H frame to check its columns.
race_h_df.head()

Unnamed: 0,example_id,article,answer,question,options
0,high19432.txt,The rain had continued for a week and the floo...,C,What did Nancy try to do before she fell over?,"[Measure the depth of the river, Look for a fa..."
1,high19432.txt,The rain had continued for a week and the floo...,D,The following are true according to the passag...,[It took Lizzie and Nancy about 20 minutes to ...
2,high19432.txt,The rain had continued for a week and the floo...,A,What did the local people do to help those in ...,"[They put up shelter for them in a school., Th..."
3,high6268.txt,There is probably no field of human activity i...,B,The passage tells us that _ .,[our values and lifestyles are in no field of ...
4,high6268.txt,There is probably no field of human activity i...,B,"Traditionally,people usually thought that _ .","[men cared very much for clothes, women were c..."


In [3]:
# Preview the first rows of the RACE-M frame to check its columns.
race_m_df.head()

Unnamed: 0,example_id,article,answer,question,options
0,middle2177.txt,"It is well-known that the ""prom"", a formal dan...",B,"In which country is the prom called a ""formal""?","[America., Canada., Britain., Australia.]"
1,middle2177.txt,"It is well-known that the ""prom"", a formal dan...",A,Why is the prom important in the students' lives?,"[It's the sign of becoming an adult., It's an ..."
2,middle2177.txt,"It is well-known that the ""prom"", a formal dan...",D,What is the passage mainly about?,"[The history of the prom., The traditions of t..."
3,middle6088.txt,Kites have a long history. They may date back ...,B,The first kite was born when _ .,"[kites were used for sailing, a Chinese farmer..."
4,middle6088.txt,Kites have a long history. They may date back ...,C,Kites are made of the following materials EXCE...,"[silk, bamboo, cloth, paper]"


In [4]:
import random

# Seed the stdlib RNG. Nothing in the visible cells draws from `random`
# (the selection below is a deterministic top-N by question count), so this
# has no effect on the results shown here; retained for consistency.
random.seed(42)

# Number of articles (distinct example_id values) to select from each dataset
N = 100  # You can change this value

def select_samples_by_question_count(df, source_name, n_samples):
    """
    Keep only the rows of the n_samples articles that have the most questions.

    Rows are counted per `example_id`, and the `n_samples` ids with the highest
    counts are chosen (ties are resolved by `nlargest`'s default, which keeps
    the first occurrence). `source_name` is accepted but not used.

    Returns a tuple `(rows, ids)`: the rows of `df` whose `example_id` is one
    of the chosen ids, and the list of chosen ids in descending count order.
    """
    # Number of rows (questions) per article, indexed by example_id
    per_article = df.groupby('example_id').size()

    # Ids of the n_samples articles with the largest counts
    selected_ids = per_article.nlargest(n_samples).index.tolist()

    keep_mask = df['example_id'].isin(selected_ids)
    return df[keep_mask], selected_ids

# Select, from each dataset, the rows of the N articles with the most questions
race_h_samples, race_h_ids = select_samples_by_question_count(race_h_df, 'race-h', N)
race_m_samples, race_m_ids = select_samples_by_question_count(race_m_df, 'race-m', N)

# Report how many article ids were selected per dataset
print(f"Selected {len(race_h_ids)} unique articles from RACE-H (with most questions)")
print(f"Selected {len(race_m_ids)} unique articles from RACE-M (with most questions)")

Selected 100 unique articles from RACE-H (with most questions)
Selected 100 unique articles from RACE-M (with most questions)


In [5]:
def transform_to_format(df, source_name, start_id=1):
    """
    Build one record per article from a frame holding one row per question.

    Rows are grouped by `example_id`; each group becomes a dict with the keys
    `id`, `source`, `content` and `questions`. Ids are consecutive, beginning
    at `start_id`, and are assigned in the order `groupby` yields the groups.
    Answer letters A-D are stored as 0-3 under `correct`; any other value is
    passed through unchanged.

    Returns `(records, next_id)`, where `next_id` is the first id not used.
    """
    # Answer letter -> zero-based option index
    letter_to_index = dict(zip('ABCD', range(4)))

    def as_list(options):
        # Use .tolist() when the value offers it, otherwise fall back to list()
        return options.tolist() if hasattr(options, 'tolist') else list(options)

    records = []
    next_id = start_id

    for _, rows in df.groupby('example_id'):
        question_items = [
            {
                'content': text,
                'correct': letter_to_index.get(letter, letter),
                'options': as_list(opts),
            }
            for text, letter, opts in zip(rows['question'], rows['answer'], rows['options'])
        ]

        records.append({
            'id': next_id,
            'source': source_name,
            # The article text is taken from the first row of the group
            'content': rows['article'].iloc[0],
            'questions': question_items,
        })
        next_id += 1

    return records, next_id

# Transform both datasets; RACE-M ids start at the next id after the last RACE-H id
race_h_results, next_id = transform_to_format(race_h_samples, 'race-h', start_id=1)
race_m_results, _ = transform_to_format(race_m_samples, 'race-m', start_id=next_id)

# Combine all results (RACE-H records first, then RACE-M)
all_results = race_h_results + race_m_results

print(f"Total samples: {len(all_results)}")
print(f"RACE-H samples: {len(race_h_results)}")
print(f"RACE-M samples: {len(race_m_results)}")

Total samples: 200
RACE-H samples: 100
RACE-M samples: 100


In [6]:
# Display a sample to verify the format
import json

# Pretty-print the first combined record; ensure_ascii=False leaves
# non-ASCII characters unescaped in the printed JSON.
print("Sample output:")
print(json.dumps(all_results[0], indent=2, ensure_ascii=False))

Sample output:
{
  "id": 1,
  "source": "race-h",
  "content": "Studies show that you may be lied to every day anywhere from 10 to 200 times. We say, \"Nice song.\" \"Honey, you don't look fat in that, no.\" But another study showed that strangers lied three times within the first 10 minutes of meeting each other. We lie more to strangers than we lie to coworkers. Men lie eight times more about themselves than they do other people. Women lie more to protect other people. If you're married, you're going to lie to your wife/ husband in one out of every 10 communications. If you're unmarried, that number drops to three. But look, if at some point you got lied to, it's because you agreed to get lied to. Truth about lying: lying's a cooperative act. Not all lies are harmful. Sometimes we're willing to lie for the sake of social dignity  , maybe to keep a private secret.\nLying is complex. It's woven into the fabric of our daily and business lives. We're deeply disturbed by the truth. We exp

In [7]:
# Summary statistics: one line per record with its id, source, number of
# questions and article length in characters.
print("\n=== Summary Statistics ===")
for result in all_results:
    print(f"ID: {result['id']}, Source: {result['source']}, Questions: {len(result['questions'])}, Content length: {len(result['content'])} chars")


=== Summary Statistics ===
ID: 1, Source: race-h, Questions: 5, Content length: 2462 chars
ID: 2, Source: race-h, Questions: 6, Content length: 3408 chars
ID: 3, Source: race-h, Questions: 5, Content length: 1168 chars
ID: 4, Source: race-h, Questions: 6, Content length: 4244 chars
ID: 5, Source: race-h, Questions: 5, Content length: 1664 chars
ID: 6, Source: race-h, Questions: 5, Content length: 2101 chars
ID: 7, Source: race-h, Questions: 5, Content length: 1696 chars
ID: 8, Source: race-h, Questions: 5, Content length: 1548 chars
ID: 9, Source: race-h, Questions: 5, Content length: 1656 chars
ID: 10, Source: race-h, Questions: 5, Content length: 1490 chars
ID: 11, Source: race-h, Questions: 5, Content length: 4164 chars
ID: 12, Source: race-h, Questions: 5, Content length: 1052 chars
ID: 13, Source: race-h, Questions: 5, Content length: 2375 chars
ID: 14, Source: race-h, Questions: 5, Content length: 2140 chars
ID: 15, Source: race-h, Questions: 5, Content length: 1468 chars
ID: 16

In [8]:
# Save results to JSON file
import json
import os

output_path = './datasets/unified/data.json'

# Create the target directory if it is missing, so the write below does not
# raise FileNotFoundError when the notebook runs on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# ensure_ascii=False writes non-ASCII text as-is, hence the explicit UTF-8 encoding
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

print(f"Saved {len(all_results)} samples to {output_path}")

Saved 200 samples to ./datasets/unified/data.json
