## DuoRC

In [None]:
from datasets import Dataset, DatasetDict, load_dataset
from tqdm import tqdm
import pandas as pd
# Load the 'SelfRC' configuration of the 'ibm/duorc' dataset; the cells below
# index it by split name ('train', 'validation', 'test').
dataset = load_dataset('ibm/duorc', 'SelfRC')

In [None]:
def process_split(split_name):
    """Filter one DuoRC split and convert its answers to SQuAD-style dicts.

    Rows flagged ``no_answer`` are always kept. Answerable rows are kept only
    when every answer string occurs verbatim in the row's ``plot``; all other
    rows are dropped.

    Args:
        split_name: Key of the split inside the module-level ``dataset``
            (e.g. ``'train'``).

    Returns:
        A ``pandas.DataFrame`` with the kept rows. Its ``answers`` column
        (placed last) holds ``{'text': [...], 'answer_start': [...]}`` dicts,
        where ``answer_start`` is the index of the first occurrence of each
        answer in ``plot``. Both lists are empty for ``no_answer`` rows.

    Note:
        This notebook defines another ``process_split`` further down (TechQA
        section) which shadows this one once executed; re-run this cell before
        calling it for DuoRC.
    """
    split = dataset[split_name]

    # Wrap the dataset itself (not the enumerate object) so tqdm can read its
    # length and display a total / ETA.
    keep_indices = [
        idx
        for idx, row in enumerate(tqdm(split))
        if row['no_answer'] or all(ans in row['plot'] for ans in row['answers'])
    ]
    dataset_df = pd.DataFrame(split.select(keep_indices))

    answer_dicts = []
    # itertuples() has no length, so pass the total explicitly.
    for row in tqdm(dataset_df.itertuples(index=False), total=len(dataset_df)):
        if row.no_answer:
            answer_dicts.append({'text': [], 'answer_start': []})
        else:
            # Every answer was verified to be in the plot above, so find()
            # never returns -1 here.
            answer_dicts.append({
                'text': row.answers,
                'answer_start': [row.plot.find(ans) for ans in row.answers],
            })

    # Drop-then-assign (instead of overwriting) keeps 'answers' as the last
    # column, matching the original column order, without mutating in place.
    return dataset_df.drop(columns=['answers']).assign(answers=answer_dicts)

In [None]:
# Clean every DuoRC split and wrap each resulting DataFrame in a Dataset.
train_hf, validation_hf, test_hf = (
    Dataset.from_pandas(process_split(name))
    for name in ('train', 'validation', 'test')
)

In [None]:
# Bundle the three processed splits under their split names.
processed_dataset = DatasetDict({
    'train': train_hf,
    'validation': validation_hf,
    'test': test_hf,
})

In [None]:
# Rename the columns to 'context' / 'id' and drop the ones no longer needed.
# This cell rebinds `processed_dataset`, so it can only be run once per build
# (a second run would not find the 'plot' column).
# NOTE(review): 'id' is taken from 'plot_id' while 'question_id' is dropped;
# if several questions share one plot, 'id' will not be unique per example.
# Confirm this is intended.
processed_dataset = (
    processed_dataset
    .rename_columns({"plot": "context", "plot_id": "id"})
    .remove_columns(['no_answer', 'question_id'])
)

In [None]:
# Upload the processed DuoRC splits to the Hub as a private dataset repo.
processed_dataset.push_to_hub('Saptarshi7/duorc_processed', private=True)

## TechQA

In [None]:
from transformers import AutoTokenizer
from datasets import Dataset, DatasetDict, load_dataset
from tqdm import tqdm
import pandas as pd
# Load the SQuAD-style TechQA dataset; later cells filter its splits in place.
techqa = load_dataset('Saptarshi7/techqa-squad-style')

In [None]:
# Remove the validation example(s) whose question is an empty string.
# The filtered split is written back into `techqa`, replacing the original.
techqa['validation'] = techqa['validation'].filter(lambda x: x['question'] != '')

In [None]:
# Tokenizer used by process_split below to detect examples that fail to tokenize.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

In [None]:
def process_split(split_name):
    """Clean one TechQA split and store the result back in ``techqa``.

    Two filters are applied in order:

    1. Rows for which the module-level ``tokenizer`` raises an error on the
       (question, context) pair are dropped.
    2. Rows with ``is_impossible`` set are always kept; the remaining rows are
       kept only when every string in ``answers['text']`` occurs verbatim in
       ``context``.

    Args:
        split_name: Key of the split inside the module-level ``techqa``
            (e.g. ``'train'``).

    Returns:
        None. ``techqa[split_name]`` is replaced by the filtered split.
    """
    split = techqa[split_name]

    # Removing those questions for which the tokenizer raises an error.
    # tqdm wraps the dataset itself (not the enumerate object) so it can show
    # a total / ETA.
    tokenizable_indices = []
    for idx, row in enumerate(tqdm(split)):
        try:
            # Only the success/failure of the call matters; the encoding is
            # discarded.
            tokenizer(row['question'], row['context'], truncation="only_second",
                      max_length=512, stride=50, padding="max_length")
        except Exception:
            # `Exception` rather than a bare `except:` so that Ctrl-C
            # (KeyboardInterrupt) still stops this long-running loop.
            continue
        tokenizable_indices.append(idx)
    split = split.select(tokenizable_indices)

    # Removing those questions for which the answers are not found in the context
    answerable_indices = [
        idx
        for idx, row in enumerate(tqdm(split))
        if row['is_impossible']
        or all(ans in row['context'] for ans in row['answers']['text'])
    ]

    techqa[split_name] = split.select(answerable_indices)

In [None]:
# Clean the TechQA splits; each call rewrites techqa[split_name].
for split_name in ('train', 'validation'):
    process_split(split_name)

In [None]:
# Upload the cleaned TechQA splits to the Hub as a private dataset repo.
techqa.push_to_hub('Saptarshi7/techqa_cleaned_for_bert', private=True)

In [1]:
from datasets import load_dataset
# Reload the uploaded dataset to check what was pushed.
s = load_dataset('Saptarshi7/techqa_cleaned_for_bert')

Downloading readme:   0%|          | 0.00/684 [00:00<?, ?B/s]

Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 11.7M/11.7M [00:01<00:00, 8.65MB/s]
Downloading data: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10.7M/10.7M [00:00<00:00, 17.9MB/s]


Generating train split:   0%|          | 0/599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/297 [00:00<?, ? examples/s]

In [2]:
# Show the split names, features and row counts of the reloaded dataset.
s

DatasetDict({
    train: Dataset({
        features: ['document_id', 'context', 'question', 'is_impossible', 'id', 'answers'],
        num_rows: 599
    })
    validation: Dataset({
        features: ['document_id', 'context', 'question', 'is_impossible', 'id', 'answers'],
        num_rows: 297
    })
})