In [1]:
import os

from typing import Any, List, Dict, Mapping, Tuple, Union, Optional
import copy
from dataclasses import dataclass, field
import json
import pathlib
from typing import Dict, Optional, Sequence
import pickle
import tqdm

import numpy as np
import torch
import torch.nn.functional as F
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
import transformers
from transformers import Trainer
from transformers.trainer_pt_utils import LabelSmoother

from transformers import AutoTokenizer, AutoModelForCausalLM

In [2]:
# Hugging Face checkpoint whose tokenizer is used to encode the fine-tuning data.
tokenizer_checkpoint = "mosaicml/mpt-7b"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_checkpoint)

Downloading (…)okenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

In [3]:
# Load the prepared fine-tuning records: one JSON object per line (JSON Lines).
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default, and blank lines (e.g. a trailing newline at end of file) are
# skipped instead of making json.loads raise.
with open("../finetune_data_prepared.jsonl", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]

In [4]:
# Longest tokenized example (prompt + completion + EOS) that is kept.
# Longer examples are dropped entirely rather than truncated.
MAX_SEQ_LEN = 2048

# One dict per kept example, each holding 1-D tensors of equal length:
# input_ids, labels and attention_mask.
all_examples = []

for item in tqdm.tqdm(data):
    prompt = item["prompt"]
    answer = item["completion"]

    # Prompt and completion are tokenized separately so the boundary between
    # them is known exactly; an EOS token is appended to the completion.
    prompt_ids = tokenizer.encode(prompt, add_special_tokens=False)
    answer_ids = tokenizer.encode(answer, add_special_tokens=False) + [tokenizer.eos_token_id]

    # Check the length on the plain Python lists so no tensor is allocated for
    # an example that is about to be discarded.
    total_len = len(prompt_ids) + len(answer_ids)
    if total_len > MAX_SEQ_LEN:
        # tqdm.write prints the message without corrupting the active progress bar.
        tqdm.tqdm.write(f"skipping {total_len}")
        continue

    input_ids = torch.tensor(prompt_ids + answer_ids, dtype=torch.long)

    # Labels mirror input_ids, with every prompt position overwritten by -100.
    # -100 is the default ignore_index of PyTorch's cross-entropy loss, so the
    # prompt is presumably excluded from the training loss -- confirm against
    # the training script that consumes this dataset.
    labels = input_ids.clone()
    labels[: len(prompt_ids)] = -100

    all_examples.append(
        dict(
            input_ids=input_ids,
            labels=labels,
            # No padding is applied here, so every position is marked as attended.
            attention_mask=torch.ones_like(input_ids, dtype=torch.bool),
        )
    )

100%|██████████| 6203/6203 [00:18<00:00, 333.59it/s]


In [5]:
# Serialize the list of tokenized examples to disk with pickle.
dataset_path = pathlib.Path("./mpt_finetune_dataset.pkl")
with dataset_path.open("wb") as sink:
    pickle.dump(all_examples, sink)