In [None]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import pickle

In [None]:
# Prompt layout shared by every dataset in this notebook: three labelled
# sections, each followed by the value substituted for its placeholder.
text_template = """### CONTEXT ###
{context}

### QUESTION ###
{question}

### ANSWER ###
{answer}
"""


def get_text(context, question, answer):
    """Fill ``text_template`` with one context/question/answer triple.

    Args:
        context: Value placed under the ``### CONTEXT ###`` header.
        question: Value placed under the ``### QUESTION ###`` header.
        answer: Value placed under the ``### ANSWER ###`` header.

    Returns:
        The formatted prompt string (ends with a newline).
    """
    fields = dict(context=context, question=question, answer=answer)
    return text_template.format_map(fields)

In [None]:
# Fetch the Dolly 15k dataset from the Hugging Face hub.
dolly_ds = load_dataset('databricks/databricks-dolly-15k')

# Keep only the rows of the train split whose category is 'closed_qa',
# then move to pandas for the column-wise work below.
dolly_cqa = dolly_ds['train'].filter(lambda example: example['category'] == 'closed_qa')
dolly_cqa_df = dolly_cqa.to_pandas()


def _dolly_row_to_text(row):
    """Render one Dolly row with the shared prompt template."""
    return get_text(
        context=row['context'],
        question=row['instruction'],
        answer=row['response'],
    )


# One formatted prompt per row, stored in a new 'text' column.
dolly_cqa_df['text'] = dolly_cqa_df.apply(_dolly_row_to_text, axis=1)

# Persist the result without the pandas index column.
dolly_cqa_df.to_csv('./data/dolly_cqa.csv', index=False)

In [None]:
# Bare last expression: renders the closed-QA frame as this cell's output.
dolly_cqa_df

In [None]:
def build_race_df(datasets, subsets, split, difficulties, sample_size, random_state=None):
    """Load several RACE-style datasets, sample each, and merge them into one frame.

    For every ``(dataset, subset)`` pair the requested split is loaded,
    converted to pandas, tagged with the difficulty label found at the same
    position in ``difficulties``, and down-sampled to ``sample_size`` rows.
    The samples are concatenated, answer letters are mapped to option
    indices, and a ``text`` column is rendered with ``get_text``.

    Args:
        datasets: Dataset identifiers passed to ``load_dataset``.
        subsets: One configuration name per dataset; ``None`` loads the
            dataset without a configuration name.
        split: Split name passed to ``load_dataset``.
        difficulties: Labels written to the ``difficulty`` column, indexed
            in parallel with ``datasets``.
        sample_size: Number of rows drawn from each dataset.
        random_state: Optional seed forwarded to ``DataFrame.sample``. The
            default ``None`` keeps the sampling unseeded.

    Returns:
        A DataFrame with a fresh ``0..n-1`` index containing the sampled rows
        of all datasets plus the ``difficulty`` and ``text`` columns.

    Raises:
        ValueError: If no dataset is given (nothing to concatenate), or,
            per pandas, if ``sample_size`` exceeds the rows of a dataset.
    """
    expected_columns = ['example_id', 'article', 'answer', 'question', 'options']
    letter_to_index = {'A': '0', 'B': '1', 'C': '2', 'D': '3'}

    sampled_frames = []
    for i, (ds, ss) in enumerate(zip(datasets, subsets)):
        if ss is not None:
            race = load_dataset(ds, ss, split=split)
        else:
            race = load_dataset(ds, split=split)

        df = race.to_pandas()
        # Only relabel when the column *names* differ. Comparing as sets keeps
        # a frame that merely orders the expected columns differently from
        # being relabelled positionally (which would mislabel its columns).
        if set(df.columns) != set(expected_columns):
            # Positional rename: assumes this column order for sources that
            # do not use the expected names -- TODO confirm against the source.
            df.columns = ['options', 'question', 'article', 'example_id', 'answer']
        df['difficulty'] = difficulties[i]
        sampled_frames.append(df.sample(sample_size, random_state=random_state))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop.
    race_df = pd.concat(sampled_frames)

    # Letter answers become string indices; other values pass through unchanged.
    race_df['answer'] = race_df['answer'].replace(letter_to_index)

    race_df = race_df.reset_index(drop=True)
    race_df['text'] = race_df.apply(
        lambda x: get_text(
            context=x['article'],
            question=x['question'],
            answer=x['options'][int(x['answer'])],
        ),
        axis=1,
    )

    # NOTE(review): 'options' is left exactly as produced by to_pandas();
    # presumably array-like -- convert to lists downstream if that matters.
    return race_df

In [None]:
# The three splits draw from the same sources; only the split name and the
# per-dataset sample size change between calls.
race_sources = dict(
    datasets=['race', 'race', 'metaeval/race-c'],
    subsets=['middle', 'high', None],
    difficulties=['M', 'H', 'C'],
)

race_df_train = build_race_df(split='train', sample_size=12702, **race_sources)
race_df_validation = build_race_df(split='validation', sample_size=712, **race_sources)
race_df_test = build_race_df(split='test', sample_size=708, **race_sources)

# Write each split to its own CSV, omitting the pandas index column.
race_df_train.to_csv('./data/race_train.csv', index=False)
race_df_validation.to_csv('./data/race_validation.csv', index=False)
race_df_test.to_csv('./data/race_test.csv', index=False)

In [None]:
# Bare last expression: renders the sampled training frame as this cell's output.
race_df_train