In [11]:
import pandas as pd
import json

# RACE: read the test parquet file into a DataFrame.
race_df = pd.read_parquet('./datasets/race/test-00000-of-00001.parquet')
# RACE-M is currently disabled; uncomment to load it as well.
# race_m_df = pd.read_parquet('./datasets/race-m/test-00000-of-00001.parquet')

# DREAM: the test and dev files are read separately so that each size can be
# reported on its own below.
with open('./datasets/dream/test.json', 'r', encoding='utf-8') as f:
    dream_test_data = json.load(f)
with open('./datasets/dream/dev.json', 'r', encoding='utf-8') as f:
    dream_dev_data = json.load(f)

# Merged DREAM pool: test entries first, dev entries after them.
dream_data = [*dream_test_data, *dream_dev_data]

# ReClor: validation file only.
with open('./datasets/reclor/val.json', 'r', encoding='utf-8') as f:
    reclor_data = json.load(f)

# Report how many entries each source contributed.
print(
    f"DREAM test samples: {len(dream_test_data)}",
    f"DREAM dev samples: {len(dream_dev_data)}",
    f"Total DREAM samples: {len(dream_data)}",
    f"ReClor validation samples: {len(reclor_data)}",
    sep="\n",
)

DREAM test samples: 1287
DREAM dev samples: 1288
Total DREAM samples: 2575
ReClor validation samples: 500


In [12]:
# Preview the first five RACE rows to check which columns are available.
race_df.head(5)

Unnamed: 0,example_id,article,answer,question,options
0,high19432.txt,The rain had continued for a week and the floo...,C,What did Nancy try to do before she fell over?,"[Measure the depth of the river, Look for a fa..."
1,high19432.txt,The rain had continued for a week and the floo...,D,The following are true according to the passag...,[It took Lizzie and Nancy about 20 minutes to ...
2,high19432.txt,The rain had continued for a week and the floo...,A,What did the local people do to help those in ...,"[They put up shelter for them in a school., Th..."
3,high6268.txt,There is probably no field of human activity i...,B,The passage tells us that _ .,[our values and lifestyles are in no field of ...
4,high6268.txt,There is probably no field of human activity i...,B,"Traditionally,people usually thought that _ .","[men cared very much for clothes, women were c..."


In [13]:
# The RACE-M preview stays disabled (its DataFrame is not loaded); show the
# first five RACE rows instead.
# race_m_df.head()
race_df.head(5)

Unnamed: 0,example_id,article,answer,question,options
0,high19432.txt,The rain had continued for a week and the floo...,C,What did Nancy try to do before she fell over?,"[Measure the depth of the river, Look for a fa..."
1,high19432.txt,The rain had continued for a week and the floo...,D,The following are true according to the passag...,[It took Lizzie and Nancy about 20 minutes to ...
2,high19432.txt,The rain had continued for a week and the floo...,A,What did the local people do to help those in ...,"[They put up shelter for them in a school., Th..."
3,high6268.txt,There is probably no field of human activity i...,B,The passage tells us that _ .,[our values and lifestyles are in no field of ...
4,high6268.txt,There is probably no field of human activity i...,B,"Traditionally,people usually thought that _ .","[men cared very much for clothes, women were c..."


In [14]:
import random

# How many entries to take from each dataset (change this value as needed).
N = 100

# Seed the stdlib RNG. Reproducibility no longer depends on it, but the call
# is kept for consistency.
random.seed(42)

def select_samples_by_question_count(df, source_name, n_samples):
    """
    Keep the rows belonging to the ``n_samples`` most-questioned examples.

    The number of rows sharing an ``example_id`` is used as that example's
    question count.

    Args:
        df: DataFrame with an ``example_id`` column.
        source_name: Dataset label; accepted but not used by this function.
        n_samples: How many distinct ``example_id`` values to keep.

    Returns:
        A tuple ``(rows, ids)`` where ``rows`` is the subset of ``df`` whose
        ``example_id`` was selected (original row order preserved) and ``ids``
        is the list of selected ids, largest question count first.
    """
    # Rows per example_id, reduced to the n_samples largest groups.
    busiest = (
        df.groupby('example_id')
        .size()
        .reset_index(name='question_count')
        .nlargest(n_samples, 'question_count')
    )
    selected_ids = busiest['example_id'].tolist()

    # Boolean mask over the original frame: True for rows of a selected example.
    keep_mask = df['example_id'].isin(selected_ids)
    return df[keep_mask], selected_ids

def select_dream_samples_by_question_count(dream_data, n_samples):
    """
    Pick the ``n_samples`` DREAM entries that carry the most questions.

    Args:
        dream_data: Sequence of DREAM entries; element ``[1]`` of each entry is
            its question list, whose length is the ranking key.
        n_samples: Number of entries to keep.

    Returns:
        A tuple ``(samples, indices)``: the chosen entries and their positions
        in ``dream_data``, both ordered by descending question count.
    """
    # sorted() is stable even with reverse=True, so entries with the same
    # number of questions keep their original relative order.
    ranked_positions = sorted(
        range(len(dream_data)),
        key=lambda pos: len(dream_data[pos][1]),
        reverse=True,
    )
    selected_indices = ranked_positions[:n_samples]

    return [dream_data[pos] for pos in selected_indices], selected_indices

def select_reclor_samples(reclor_data, n_samples):
    """
    Take the leading ``n_samples`` entries of the ReClor dataset.

    No ranking is applied (each ReClor entry is treated as carrying a single
    question), so the selection is simply a prefix slice.

    Args:
        reclor_data: Sequence of ReClor entries.
        n_samples: Maximum number of entries to take.

    Returns:
        A tuple ``(samples, indices)`` where ``indices`` are the positions of
        ``samples`` within ``reclor_data``. Both always have the same length,
        including when the dataset holds fewer than ``n_samples`` entries.
    """
    selected = reclor_data[:n_samples]
    # Derive the indices from what was actually selected: range(n_samples)
    # would report positions that do not exist when the dataset is shorter
    # than n_samples.
    return selected, list(range(len(selected)))

# Draw N entries from every dataset using its dedicated selector.
race_samples, race_ids = select_samples_by_question_count(
    df=race_df, source_name='race', n_samples=N
)
dream_samples, dream_indices = select_dream_samples_by_question_count(
    dream_data=dream_data, n_samples=N
)
reclor_samples, reclor_indices = select_reclor_samples(
    reclor_data=reclor_data, n_samples=N
)

# Report how many entries each selector returned.
print(
    f"Selected {len(race_ids)} unique articles from RACE (with most questions)",
    f"Selected {len(dream_indices)} samples from DREAM (with most questions)",
    f"Selected {len(reclor_indices)} samples from ReClor",
    sep="\n",
)

Selected 100 unique articles from RACE (with most questions)
Selected 100 samples from DREAM (with most questions)
Selected 100 samples from ReClor


In [15]:
def transform_to_format(df, source_name, start_id=1):
    """
    Build unified records from a DataFrame, one record per ``example_id``.

    Args:
        df: DataFrame with ``example_id``, ``article``, ``question``,
            ``answer`` and ``options`` columns.
        source_name: Value stored in each record's ``source`` field.
        start_id: Id given to the first record; later records count up by one.

    Returns:
        A tuple ``(results, next_id)``. Each result is a dict with ``id``,
        ``source``, ``content`` (the ``article`` of the group's first row) and
        ``questions`` (dicts with ``content``, ``correct`` and ``options``).
        ``next_id`` is the first id not used. Ids follow the iteration order of
        ``df.groupby('example_id')``.
    """
    # Answer letters become 0-based indices; any other value is kept as-is.
    letter_to_index = {'A': 0, 'B': 1, 'C': 2, 'D': 3}

    def as_plain_list(options):
        # Objects exposing .tolist() are converted through it; anything else
        # goes through list().
        if hasattr(options, 'tolist'):
            return options.tolist()
        return list(options)

    results = []
    for offset, (_, rows) in enumerate(df.groupby('example_id')):
        # One question dict per row of the group, in row order.
        questions = [
            {
                'content': question,
                'correct': letter_to_index.get(answer, answer),
                'options': as_plain_list(options),
            }
            for question, answer, options in zip(
                rows['question'], rows['answer'], rows['options']
            )
        ]
        results.append({
            'id': start_id + offset,
            'source': source_name,
            'content': rows['article'].iloc[0],
            'questions': questions,
        })

    return results, start_id + len(results)

def transform_dream_to_format(dream_samples, source_name, start_id=1):
    """
    Build unified records from DREAM entries.

    Each entry is indexed positionally: ``[0]`` is the dialogue (list of
    strings), ``[1]`` the list of question dicts with ``question``, ``choice``
    and ``answer`` keys, and ``[2]`` the original id (not used here).

    Args:
        dream_samples: Iterable of DREAM entries.
        source_name: Value stored in each record's ``source`` field.
        start_id: Id given to the first record; later records count up by one.

    Returns:
        A tuple ``(results, next_id)`` where ``next_id`` is the first id not
        used.

    Raises:
        ValueError: If a question's ``answer`` is not among its ``choice`` list.
    """
    def build_question(raw):
        # The answer is stored as text; record its position in the choices.
        answer_position = raw['choice'].index(raw['answer'])
        return {
            'content': raw['question'],
            'correct': answer_position,
            'options': raw['choice'],
        }

    results = []
    for record_id, sample in enumerate(dream_samples, start=start_id):
        dialogue_lines, raw_questions = sample[0], sample[1]
        # The dialogue turns are joined into one newline-separated string.
        dialogue_text = '\n'.join(dialogue_lines)
        results.append({
            'id': record_id,
            'source': source_name,
            'content': dialogue_text,
            'questions': [build_question(raw) for raw in raw_questions],
        })

    return results, start_id + len(results)

def transform_reclor_to_format(reclor_samples, source_name, start_id=1):
    """
    Build unified records from ReClor entries.

    Every entry is a dict read through its ``context``, ``question``,
    ``answers`` and ``label`` keys and yields a record with exactly one
    question.

    Args:
        reclor_samples: Iterable of ReClor entry dicts.
        source_name: Value stored in each record's ``source`` field.
        start_id: Id given to the first record; later records count up by one.

    Returns:
        A tuple ``(results, next_id)`` where ``next_id`` is the first id not
        used.
    """
    results = []
    for record_id, sample in enumerate(reclor_samples, start=start_id):
        # The label is passed through unchanged (expected to already be a
        # 0-based option index).
        single_question = {
            'content': sample['question'],
            'correct': sample['label'],
            'options': sample['answers'],
        }
        results.append({
            'id': record_id,
            'source': source_name,
            'content': sample['context'],
            'questions': [single_question],
        })

    return results, start_id + len(results)

# Convert every selection to the unified format. Each call returns the next
# free id, which is handed to the following call so that ids stay unique
# across sources.
race_results, next_id = transform_to_format(race_samples, source_name='race', start_id=1)
# race_m_results, next_id = transform_to_format(race_m_samples, 'race-m', start_id=next_id)
dream_results, next_id = transform_dream_to_format(dream_samples, source_name='dream', start_id=next_id)
reclor_results, next_id = transform_reclor_to_format(reclor_samples, source_name='reclor', start_id=next_id)

# Single list holding all records: RACE, then DREAM, then ReClor.
all_results = [*race_results, *dream_results, *reclor_results]  # + race_m_results

# Per-source and overall record counts.
print(
    f"Total samples: {len(all_results)}",
    f"race samples: {len(race_results)}",
    # f"RACE-M samples: {len(race_m_results)}",
    f"DREAM samples: {len(dream_results)}",
    f"ReClor samples: {len(reclor_results)}",
    sep="\n",
)

Total samples: 300
race samples: 100
DREAM samples: 100
ReClor samples: 100


In [16]:
# Spot-check the unified format by pretty-printing the first record.
import json

print("Sample output:")
first_record = all_results[0]
# ensure_ascii=False keeps non-ASCII characters readable instead of \u escapes.
print(json.dumps(first_record, indent=2, ensure_ascii=False))

Sample output:
{
  "id": 1,
  "source": "race",
  "content": "Studies show that you may be lied to every day anywhere from 10 to 200 times. We say, \"Nice song.\" \"Honey, you don't look fat in that, no.\" But another study showed that strangers lied three times within the first 10 minutes of meeting each other. We lie more to strangers than we lie to coworkers. Men lie eight times more about themselves than they do other people. Women lie more to protect other people. If you're married, you're going to lie to your wife/ husband in one out of every 10 communications. If you're unmarried, that number drops to three. But look, if at some point you got lied to, it's because you agreed to get lied to. Truth about lying: lying's a cooperative act. Not all lies are harmful. Sometimes we're willing to lie for the sake of social dignity  , maybe to keep a private secret.\nLying is complex. It's woven into the fabric of our daily and business lives. We're deeply disturbed by the truth. We expla

In [17]:
# Pretty-print the first record of each listed source to verify its format.
# A source with no records prints only its heading.
for heading, source_key in (
    ("\nSample DREAM output:", 'dream'),
    ("\nSample ReClor output:", 'reclor'),
):
    print(heading)
    first_match = next(
        (record for record in all_results if record['source'] == source_key),
        None,
    )
    if first_match is not None:
        print(json.dumps(first_match, indent=2, ensure_ascii=False))


Sample DREAM output:
{
  "id": 101,
  "source": "dream",
  "content": "Man: I'm home!\nWoman: Hi sweetie. Welcome home. Are you ready to eat?\nMan: Well, ... uh ...\nWoman: Uh, we're having fish and seasoned rice and carrots for the main course.\nMan: Really? Um .. Yeah.\nWoman: What? What?\nMan: Oh, that sounds great.\nWoman: Are you sure?\nMan: Sure.\nWoman: Hm. Well, before that, I have a delicious seafood salad with shrimp.\nMan: Oh.\nWoman: And, for desert, I made apple pie.\nMan: Oooh.\nWoman: Honey. It took me three hours to make.\nMan: Oh ... well ... yeah ... um ... yeah.\nWoman: Hmm. What's wrong? Don't you like ... don't you like what I made?\nMan: Well, to be honest, I just had a hamburger, fries, and a chocolate shake.\nWoman: You're kidding. Really?\nMan: Yeah. I'm sorry. I didn't know that you were making ...\nWoman: Honey. I told you this morning that I was making you something nice.\nMan: Yeah, but I forgot. You ... you know how work is. Wait, wow, WAIT!! What are you

In [18]:
# One line per unified record: id, source, number of questions, content length.
print("\n=== Summary Statistics ===")
summary_lines = (
    f"ID: {result['id']}, Source: {result['source']}, Questions: {len(result['questions'])}, Content length: {len(result['content'])} chars"
    for result in all_results
)
# The generator is consumed lazily, so each line is printed as it is built.
for line in summary_lines:
    print(line)


=== Summary Statistics ===
ID: 1, Source: race, Questions: 5, Content length: 2462 chars
ID: 2, Source: race, Questions: 6, Content length: 3408 chars
ID: 3, Source: race, Questions: 5, Content length: 1168 chars
ID: 4, Source: race, Questions: 6, Content length: 4244 chars
ID: 5, Source: race, Questions: 5, Content length: 1664 chars
ID: 6, Source: race, Questions: 5, Content length: 2101 chars
ID: 7, Source: race, Questions: 5, Content length: 1696 chars
ID: 8, Source: race, Questions: 5, Content length: 1548 chars
ID: 9, Source: race, Questions: 5, Content length: 1656 chars
ID: 10, Source: race, Questions: 5, Content length: 1490 chars
ID: 11, Source: race, Questions: 5, Content length: 4164 chars
ID: 12, Source: race, Questions: 5, Content length: 1052 chars
ID: 13, Source: race, Questions: 5, Content length: 2375 chars
ID: 14, Source: race, Questions: 5, Content length: 2140 chars
ID: 15, Source: race, Questions: 5, Content length: 1468 chars
ID: 16, Source: race, Questions: 5, 

In [19]:
# Save results to JSON file
import json
import os

output_path = './datasets/unified/data.json'

# open(..., 'w') does not create missing parent folders, so make sure the
# target directory exists first; otherwise this cell raises FileNotFoundError
# when the output folder has not been created yet.
output_dir = os.path.dirname(output_path)
if output_dir:  # empty when output_path has no directory component
    os.makedirs(output_dir, exist_ok=True)

# ensure_ascii=False writes non-ASCII characters verbatim (the file is UTF-8).
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(all_results, f, indent=2, ensure_ascii=False)

print(f"Saved {len(all_results)} samples to {output_path}")

Saved 300 samples to ./datasets/unified/data.json
