In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset
import pickle

In [2]:
# Prompt layout shared by every dataset in this notebook: three labelled
# sections (context, question, answer), each header on its own line.
text_template = (
    "### CONTEXT ###\n"
    "{context}\n"
    "\n"
    "### QUESTION ###\n"
    "{question}\n"
    "\n"
    "### ANSWER ###\n"
    "{answer}\n"
)


def get_text(context, question, answer):
    """Render one example into the CONTEXT / QUESTION / ANSWER prompt layout.

    Args:
        context: Value substituted under the ``### CONTEXT ###`` header.
        question: Value substituted under the ``### QUESTION ###`` header.
        answer: Value substituted under the ``### ANSWER ###`` header.

    Returns:
        ``text_template`` with the three placeholders filled in.
    """
    fields = {"context": context, "question": question, "answer": answer}
    return text_template.format(**fields)

In [3]:
# Load the Dolly 15k instruction dataset.
dolly_ds = load_dataset('databricks/databricks-dolly-15k')

# Keep only the rows whose category is 'closed_qa' and move them into pandas.
dolly_cqa = dolly_ds['train'].filter(lambda row: row['category'] == 'closed_qa')
dolly_cqa_df = dolly_cqa.to_pandas()

# Render every (context, instruction, response) triple through the shared
# prompt template; the instruction plays the role of the question.
dolly_cqa_df['text'] = [
    get_text(context=ctx, question=instruction, answer=response)
    for ctx, instruction, response in zip(
        dolly_cqa_df['context'],
        dolly_cqa_df['instruction'],
        dolly_cqa_df['response'],
    )
]

# Persist the formatted examples without the pandas index column.
dolly_cqa_df.to_csv('./data/dolly_cqa.csv', index=False)

Found cached dataset json (C:/Users/vilce/.cache/huggingface/datasets/databricks___json/databricks--databricks-dolly-15k-7427aa6e57c34282/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at C:\Users\vilce\.cache\huggingface\datasets\databricks___json\databricks--databricks-dolly-15k-7427aa6e57c34282\0.0.0\e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4\cache-10ee0e6a8f61a002.arrow


In [4]:
# Display the Dolly closed-QA DataFrame, including the rendered `text` column.
dolly_cqa_df

Unnamed: 0,instruction,context,response,category,text
0,When did Virgin Australia start operating?,"Virgin Australia, the trading name of Virgin A...",Virgin Australia commenced services on 31 Augu...,closed_qa,"### CONTEXT ###\nVirgin Australia, the trading..."
1,When was Tomoaki Komorida born?,Komorida was born in Kumamoto Prefecture on Ju...,"Tomoaki Komorida was born on July 10,1981.",closed_qa,### CONTEXT ###\nKomorida was born in Kumamoto...
2,"Given a reference text about Lollapalooza, whe...",Lollapalooza /ˌlɒləpəˈluːzə/ (Lolla) is an ann...,Lollapalooze is an annual musical festival hel...,closed_qa,### CONTEXT ###\nLollapalooza /ˌlɒləpəˈluːzə/ ...
3,Who was Kyle Van Zyl playing against when he s...,Van Zyl joined the Eastern Province Kings Acad...,Kyle Van Zyl was playing against Boland U21 wh...,closed_qa,### CONTEXT ###\nVan Zyl joined the Eastern Pr...
4,Who was the first to record the Needles and Pi...,"""Needles and Pins"" is a rock song credited to ...",The first artist to record Needles and Pins so...,closed_qa,"### CONTEXT ###\n""Needles and Pins"" is a rock ..."
...,...,...,...,...,...
1768,How many types of cat does the world cat feder...,"As of 2023, The International Cat Association ...",69,closed_qa,"### CONTEXT ###\nAs of 2023, The International..."
1769,What is the history of coffee?,The history of coffee dates back to centuries ...,Much of the early history of coffee was tied t...,closed_qa,### CONTEXT ###\nThe history of coffee dates b...
1770,What are common florals found in Zigalga Natio...,Zigalga National Park (Russian: Национальный п...,Zigalga National Park has the majority of its ...,closed_qa,### CONTEXT ###\nZigalga National Park (Russia...
1771,What is linux Bootloader,"A bootloader, also spelled as boot loader or c...",A bootloader is a program written in machine c...,closed_qa,"### CONTEXT ###\nA bootloader, also spelled as..."


In [5]:
def build_race_df(datasets, subsets, split, difficulties, sample_size, random_state=None):
    """Build one sampled, prompt-formatted DataFrame from several RACE-style datasets.

    Each dataset is loaded with ``load_dataset``, converted to pandas, tagged
    with its difficulty label and down-sampled to ``sample_size`` rows; the
    pieces are then concatenated. Letter answers ``A``-``D`` are mapped to the
    strings ``'0'``-``'3'`` and a ``text`` column is rendered with ``get_text``
    from the article, the question and the option selected by the answer index.

    Args:
        datasets: Dataset names passed to ``load_dataset``.
        subsets: Config name for each dataset, or ``None`` to load the dataset
            without one. Paired positionally with ``datasets``.
        split: Split name passed to ``load_dataset`` (e.g. ``'train'``).
        difficulties: Value stored in the ``difficulty`` column for each
            dataset, indexed positionally like ``datasets``.
        sample_size: Number of rows drawn from each dataset.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            draw can be reproduced. ``None`` (the default) keeps the previous
            unseeded behaviour.

    Returns:
        A DataFrame with a fresh 0..n-1 index containing the sampled rows of
        every dataset plus the ``difficulty`` and ``text`` columns.

    Raises:
        ValueError: If no dataset/subset pair is given, or (raised by pandas)
            if ``sample_size`` is larger than a dataset.
    """
    sources = list(zip(datasets, subsets))
    if not sources:
        # Previously this fell through to an obscure KeyError on 'answer'.
        raise ValueError('build_race_df needs at least one dataset/subset pair')

    race_datasets = []
    for ds, ss in sources:
        if ss is not None:
            race = load_dataset(ds, ss, split=split)
        else:
            race = load_dataset(ds, split=split)
        race_datasets.append(race)

    columns = ['example_id', 'article', 'answer', 'question', 'options']
    frames = []
    for i, ds in enumerate(race_datasets):
        df = ds.to_pandas()
        if list(df.columns) != columns:
            # NOTE(review): positional rename -- assumes any frame that does not
            # already match `columns` stores its fields in exactly this order.
            # Confirm against the schema of each non-standard dataset.
            df.columns = ['options', 'question', 'article', 'example_id', 'answer']
        df['difficulty'] = difficulties[i]
        frames.append(df.sample(sample_size, random_state=random_state))

    # DataFrame.append was removed in pandas 2.0; concatenate once instead of
    # growing the frame inside the loop. ignore_index gives the 0..n-1 index.
    race_df = pd.concat(frames, ignore_index=True)

    # Answers become string indices into the `options` sequence.
    mapping = {'A': '0', 'B': '1', 'C': '2', 'D': '3'}
    race_df['answer'] = race_df['answer'].replace(mapping)

    race_df['text'] = race_df.apply(
        lambda x: get_text(
            context=x['article'],
            question=x['question'],
            answer=x['options'][int(x['answer'])],
        ),
        axis=1,
    )
    return race_df

In [6]:
# Every split is drawn from the same three sources, each tagged with its own
# difficulty label: race/middle -> 'M', race/high -> 'H', metaeval/race-c -> 'C'.
race_sources = dict(
    datasets=['race', 'race', 'metaeval/race-c'],
    subsets=['middle', 'high', None],
    difficulties=['M', 'H', 'C'],
)

# NOTE(review): the per-split sample sizes are hard-coded; presumably each is
# the row count of the smallest source for that split -- TODO confirm.
race_df_train = build_race_df(split='train', sample_size=12702, **race_sources)
race_df_validation = build_race_df(split='validation', sample_size=712, **race_sources)
race_df_test = build_race_df(split='test', sample_size=708, **race_sources)

# Write each split to its own CSV, without the pandas index column.
race_exports = {
    './data/race_train.csv': race_df_train,
    './data/race_validation.csv': race_df_validation,
    './data/race_test.csv': race_df_test,
}
for csv_path, split_df in race_exports.items():
    split_df.to_csv(csv_path, index=False)

Found cached dataset race (C:/Users/vilce/.cache/huggingface/datasets/race/middle/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)
Found cached dataset race (C:/Users/vilce/.cache/huggingface/datasets/race/high/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)
Found cached dataset json (C:/Users/vilce/.cache/huggingface/datasets/metaeval___json/metaeval--race-c-287d12d931602f45/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4)
  race_df= race_df.append(df)
  race_df= race_df.append(df)
  race_df= race_df.append(df)
Found cached dataset race (C:/Users/vilce/.cache/huggingface/datasets/race/middle/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)
Found cached dataset race (C:/Users/vilce/.cache/huggingface/datasets/race/high/0.1.0/5839ff74a429622f5f20cca69c5fcf0e87ac6d5fd2777c42b948000684829f7b)
Found cached dataset json (C:/Users/vilce/.cache/huggingface/datasets/metaeval___json/metaeval--race-c-287

In [7]:
# Display the training-split DataFrame returned by build_race_df.
race_df_train

Unnamed: 0,example_id,article,answer,question,options,difficulty,text
0,middle6454.txt,"Hans said to his friend Kurt, ""I'm going to ta...",3,Hans went to London by _ .,"[car, sea, air, both sea and land]",M,"### CONTEXT ###\nHans said to his friend Kurt,..."
1,middle234.txt,Bob is six years old.He is old enough to go to...,0,What do you think of the boy?,"[He is not clever., He is helpful., He is poli...",M,### CONTEXT ###\nBob is six years old.He is ol...
2,middle3148.txt,A teacher stood in front of his history class ...,1,Why did some students stay in their seats?,"[Because they were afraid to leave., Because t...",M,### CONTEXT ###\nA teacher stood in front of h...
3,middle1661.txt,A famous building in New York City is turning ...,0,Which of the following statements can we infer...,"[The station won't be changed., People will sp...",M,### CONTEXT ###\nA famous building in New York...
4,middle4207.txt,Chen Kai is a Chinese boy. He is a student. He...,0,Chen Kai is _ .,"[a student, a teacher, fourteen, from Canada]",M,### CONTEXT ###\nChen Kai is a Chinese boy. He...
...,...,...,...,...,...,...,...
38101,3154.txt,Sometimes you'll hear people say that you can'...,2,What is the passage mainly about?,"[How to prepare for your success., How to face...",C,### CONTEXT ###\nSometimes you'll hear people ...
38102,1616.txt,"A pretty, well-dressed young lady stopped a ta...",0,The young lady was,"[clever at making excuse., not late at all., 4...",C,"### CONTEXT ###\nA pretty, well-dressed young ..."
38103,850.txt,"A fluid is a substance, such as a liquid or ga...",3,"According to paragraph 2, all of the following...",[the breaking apart of water molecules by ultr...,C,"### CONTEXT ###\nA fluid is a substance, such ..."
38104,827.txt,The National Trust in Britain plays an increas...,1,The National Trust is _ .,"[a rich government department, a charity suppo...",C,### CONTEXT ###\nThe National Trust in Britain...
