In [1]:
import os
import sys
import json
import tqdm
from transformers import AutoTokenizer

In [2]:
# Directory (relative to the working directory) that holds the *_raw.jsonl split files.
data_path = "processed"

In [3]:
def _read_lines(filename):
    """Return every line of ``data_path/filename``, trailing newlines included."""
    # JSON text is UTF-8; decode explicitly instead of relying on the
    # platform's default encoding, which differs between systems.
    with open(os.path.join(data_path, filename), encoding="utf-8") as f:
        return f.readlines()


# Each list holds the undecoded lines of one split; they are parsed as JSON later.
train_raw_data = _read_lines("train_raw.jsonl")
val_raw_data = _read_lines("val_raw.jsonl")
test_raw_data = _read_lines("test_raw.jsonl")

In [4]:
# Pretrained "roberta-base" tokenizer; process_data uses it to count the tokens of each pair.
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [5]:
def process_data(data, max_length=512):
    """Expand raw dialogs into (source, target) pairs.

    Every element of ``data`` is decoded with ``json.loads`` into a list of
    turns. For each turn after the first, one example is produced: its
    ``source`` is all preceding turns of that dialog joined with ``"</s>"``
    and its ``target`` is the turn itself.

    A pair is kept only if ``tokenizer.encode(source + target)`` yields at
    most ``max_length`` tokens. At the first pair that exceeds the limit the
    remaining turns of that dialog are dropped, since the history only grows.

    Args:
        data: Iterable of JSON strings, each encoding one dialog as a list of
            turn strings. Blank / whitespace-only lines are skipped.
        max_length: Maximum token count allowed for ``source + target``.
            Defaults to 512, the limit the original code hard-coded.

    Returns:
        A list of ``{"source": str, "target": str}`` dicts, in input order.

    Note:
        Relies on the module-level ``tokenizer`` for token counting.
    """
    all_dialog_pairs = []

    for line in tqdm.tqdm(data):
        # Tolerate empty lines (e.g. a stray newline in the file) instead of
        # letting json.loads fail on them.
        if not line.strip():
            continue

        history_turns = []

        for turn in json.loads(line):
            # The first turn has no history, so it only seeds the context.
            if history_turns:
                source = "</s>".join(history_turns)

                # Length is measured on the plain concatenation source + turn.
                if len(tokenizer.encode(source + turn)) > max_length:
                    # Stop here: the rest of this dialog is discarded.
                    break

                all_dialog_pairs.append({"source": source, "target": turn})

            # The current turn becomes part of the history for the next one.
            history_turns.append(turn)

    return all_dialog_pairs

In [6]:
# Turn each raw split into its list of source/target pairs (train, then val, then test).
train_data, val_data, test_data = map(
    process_data,
    (train_raw_data, val_raw_data, test_raw_data),
)

  9%|▉         | 1053/11118 [00:01<00:18, 554.02it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (536 > 512). Running this sequence through the model will result in indexing errors
 10%|▉         | 1109/11118 [00:02<00:19, 518.17it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (514 > 512). Running this sequence through the model will result in indexing errors
 24%|██▍       | 2709/11118 [00:04<00:14, 593.21it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (528 > 512). Running this sequence through the model will result in indexing errors
 25%|██▍       | 2769/11118 [00:05<00:16, 518.48it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (558 > 512). Running this sequence through the model will result in indexing errors
 40%|███▉      | 4400/11118 [00:08<00:12, 544.52it/s]Token i

In [7]:
def _dump_jsonl(path, records):
    """Write ``records`` to ``path`` as JSON Lines: one JSON document per line."""
    with open(path, "w") as out:
        for record in records:
            out.write(json.dumps(record))
            out.write("\n")


# Persist each processed split into the current working directory.
_dump_jsonl("train.jsonl", train_data)
_dump_jsonl("val.jsonl", val_data)
_dump_jsonl("test.jsonl", test_data)