In [1]:
import numpy as np
import pandas as pd
import string
from datasets import load_dataset
import nltk

In [53]:
def preprocess_dataset(dataset):
    """Convert a dataset split to a DataFrame and drop numeric-only questions.

    The letter in the ``answer`` column is replaced by its position in the
    ``options`` list (A -> 0, B -> 1, C -> 2, D -> 3); any other value becomes
    NaN. Rows whose options are *all* purely numeric, once surrounding
    whitespace and ASCII punctuation have been removed, are discarded and the
    number of discarded rows is printed.

    Parameters
    ----------
    dataset
        Object exposing ``to_pandas()`` that returns a DataFrame with
        ``answer`` and ``options`` columns.

    Returns
    -------
    pandas.DataFrame
        The converted frame without the numeric-only rows. Index labels of the
        surviving rows are left as they were.
    """
    # Materialise the split as a pandas DataFrame.
    df = dataset.to_pandas()

    # Map letter answers to options list indices.
    df['answer'] = df['answer'].map({'A': 0, 'B': 1, 'C': 2, 'D': 3})

    # Build the punctuation-removal table once instead of once per option.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def is_valid_options(options_list):
        # True as soon as one option is something other than a bare number.
        return any(
            not option.strip().translate(strip_punctuation).isnumeric()
            for option in options_list
        )

    # Force a boolean mask so it is always usable for boolean indexing,
    # including when the frame has no rows.
    is_valid_series = df['options'].map(is_valid_options).astype(bool)

    # Count rejected rows directly: value_counts()[False] raises KeyError
    # when no row is rejected because the False label is then absent.
    num_dropped = int((~is_valid_series).sum())
    print(f'Dropped {num_dropped} rows with only numeric answers')
    return df[is_valid_series]


In [76]:
# Show full column contents (no width truncation) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

def filt_df(df, min_questions_per_context=2):
    """Drop unwanted questions, then drop contexts left with too few questions.

    Filters are applied in this order:

    1. questions containing an underscore (phrase-completion blanks) are
       removed;
    2. questions containing the text "According to the passage" are removed;
    3. only questions with more than 5 whitespace-separated words (commas
       removed first) are kept;
    4. only rows whose ``example_id`` still occurs at least
       ``min_questions_per_context`` times after steps 1-3 are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with string ``question`` and ``example_id`` columns.
    min_questions_per_context : int, default 2
        Minimum number of surviving questions an ``example_id`` needs for its
        rows to be kept. Pass 1 to disable this step.

    Returns
    -------
    pandas.DataFrame
        The remaining rows, with their original index labels.
    """
    # Exclude phrase-completion questions (literal underscore match;
    # missing questions count as "no match" so the mask stays boolean).
    has_blank = df['question'].str.contains('_', regex=False, na=False)
    filtered = df[~has_blank]

    # Exclude "According to the passage" questions. The negation is required:
    # without it this step keeps *only* those questions.
    is_passage_question = filtered['question'].str.contains(
        'According to the passage', regex=False, na=False
    )
    filtered = filtered[~is_passage_question]

    # Keep questions with more than 5 words.
    word_counts = (
        filtered['question'].str.replace(',', '', regex=False).str.split().str.len()
    )
    filtered = filtered[word_counts > 5]

    # Exclude contexts with fewer than min_questions_per_context questions.
    # The selection must be assigned: a bare `vc[vc.Count > 1]` expression
    # discards its result and leaves every context in place.
    questions_per_context = filtered['example_id'].value_counts()
    kept_ids = questions_per_context.index[
        questions_per_context >= min_questions_per_context
    ]
    return filtered[filtered['example_id'].isin(kept_ids)]

In [None]:
# Optional: uncomment to display full column text instead of truncated cells.
#pd.set_option('display.max_colwidth', None)

# Load the 'race' dataset with the 'all' configuration; the result is indexed
# by split name ('train' here, 'validation' in a later cell).
race = load_dataset('race', 'all')
# pandas copy of the training split (not referenced again in the visible cells).
race_train = race['train'].to_pandas()
# Bare expression: the notebook renders the frame as this cell's output.
race_train

In [77]:
# Preprocess, then filter, the validation and training splits.
# Each preprocess_dataset call prints how many numeric-only rows it dropped.
valid_df = filt_df(preprocess_dataset(race['validation']))
train_df = filt_df(preprocess_dataset(race['train']))

Dropped 67 rows with only numeric answers
Dropped 1304 rows with only numeric answers
