# Datasets

In [None]:
#| default_exp datasets

In [None]:
#| export
from __future__ import annotations
import math, random, torch, matplotlib.pyplot as plt, numpy as np, matplotlib as mpl, shutil, os, gzip, pickle, re, copy, time
from pathlib import Path
from functools import partial
import fastcore.all as fc
from glob import glob
import json

from torch import tensor, nn, optim
import torch.nn.functional as F
from datasets import load_dataset
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, default_collate
from torch.nn import init
from torch.nn.utils.rnn import pad_sequence
from typing import List, Optional

from datetime import datetime, timedelta
import calendar
from fastprogress import progress_bar
from einops import rearrange

from toolken.model import *
from toolken.tokenizer import *

### Build dataset

I'm going to work with a single dataset for now — the GSM8K-XL math dataset, used to train the calculator tool. The dataset is provided by the original ToolkenGPT authors in [this GitHub repo](https://github.com/Ber666/ToolkenGPT).

In [None]:
# Location of the tokenizer model file, relative to this notebook.
path = '../model/tokenizer.model'
# Build the tokenizer used by the cells below to encode prompts and decode ids.
tokenizer = Tokenizer(path)

In [None]:
# Load the GSM8K-XL training records and the operator -> index mapping.
# The encoding is pinned to UTF-8 so parsing does not depend on the
# platform's default locale encoding.
with open('../data/gsm8k-xl/train.json', 'r', encoding='utf-8') as f: data = json.load(f)
with open('../data/gsm8k-xl/func_dict.json', 'r', encoding='utf-8') as f: func_dict = json.load(f)

In [None]:
# Show the first training record to inspect its fields.
data[0]

{'text': "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? Let's think step by step. Natalia sold 48/2 = 24 clips in May.\nNatalia sold 48+24 = 72 clips altogether in April and May.\n#### 72",
 'start_token_idx': [60, 80],
 'end_token_idx': [62, 82],
 'tar_eq': ['<divide>(48, 2)=24<eoe>', '<add>(48, 24)=72<eoe>'],
 'tar_number': ['24', '72']}

In [None]:
# Show the mapping from operator token string to operator index.
func_dict

{'<add>': 0, '<subtract>': 1, '<multiply>': 2, '<divide>': 3}

In [None]:
#| export
class PromptDS:
    """Map-style dataset turning annotated prompts into `(input, label)` id tensors.

    Each record in `json` is a dict with the keys read below:
    `'text'`, `'start_token_idx'`, `'end_token_idx'` and `'tar_eq'`
    (three parallel lists, one entry per annotated expression).

    `__getitem__` returns a tuple `(input, label)` of 1-D tensors of equal length:

    - `input` is the tokenized text (encoded with `bos=True, eos=True`);
    - `label` is a copy of `input` in which, for every annotated span,
      position `start` holds the operator's tool id
      (`func_dict[op] + tool_token_offset`) and positions `start+1 .. end-1`
      hold `ignore_index`.

    The source records are never modified.
    """
    # Offset added to an operator index to obtain its token id.
    # NOTE(review): presumably the size of the base vocabulary — confirm against the model.
    tool_token_offset = 32000
    # Marker written over the rest of a result span; -100 is the default
    # `ignore_index` of PyTorch's cross-entropy loss.
    ignore_index = -100
    # Matches the leading `<operator>` of a target equation such as '<add>(48, 24)=72<eoe>'.
    _op_pattern = re.compile(r"(<.*?>)")

    def __init__(self, json, tokenizer, func_dict): fc.store_attr()
    def __len__(self): return len(self.json)
    def __getitem__(self, i):
        item = self.json[i]
        # Tokenize once; the label starts as an independent copy of the input so the
        # record itself is left untouched (no tensors are cached in the JSON dict).
        input_ids = tensor(self.tokenizer.encode(item['text'], bos=True, eos=True))
        labels = input_ids.clone()
        for k, start in enumerate(item['start_token_idx']):
            end = item['end_token_idx'][k]
            op = self._op_pattern.search(item['tar_eq'][k]).group(1)
            # First token of the result becomes the operator token ...
            labels[start] = self.func_dict[op] + self.tool_token_offset
            # ... and the remaining result tokens are masked out.
            labels[start+1:end] = self.ignore_index
        return input_ids, labels

In [None]:
#| export
class DataLoaders:
    """Pair of PyTorch `DataLoader`s exposed as `.train` and `.valid`.

    Args:
        tds: training dataset (shuffled by default).
        vds: validation dataset (iterated in order by default).
        bs: batch size used for both loaders.
        **kwargs: extra `DataLoader` keyword arguments applied to both loaders.
            They take precedence over the defaults used here
            (`collate_fn=default_collate`, `num_workers=4`, and
            `shuffle=True` for the training loader), so e.g.
            `num_workers=0` or a custom `collate_fn` can be passed.
    """
    def __init__(self, tds, vds, bs, **kwargs):
        # Merge instead of passing defaults and kwargs side by side: duplicated
        # keyword arguments would otherwise raise a TypeError.
        opts = {'collate_fn': default_collate, 'num_workers': 4, **kwargs}
        self.train = DataLoader(tds, batch_size=bs, **{'shuffle': True, **opts})
        self.valid = DataLoader(vds, batch_size=bs, **opts)

In [None]:
# Hold out the last 10% of the records for validation; the first 90% are used for training.
n_train = int(0.9*len(data))
tds, vds = (PromptDS(part, tokenizer, func_dict) for part in (data[:n_train], data[n_train:]))
len(tds), len(vds)

(5448, 606)

In [None]:
# Build the loaders with a batch size of 1.
dls = DataLoaders(tds, vds, 1)
# Pull a single training batch to inspect.
inp, label = next(iter(dls.train))
# Show the shapes of the input and label tensors of that batch.
inp.shape, label.shape

(torch.Size([1, 98]), torch.Size([1, 98]))

The dataloader here is going to provide the model with an `input` and a `label`.

The `input`:

In [None]:
# Turn the first (only) sequence of the batch back into text.
a = tokenizer.decode(inp[0].tolist())
print(a)

For the school play, 40 rows of chairs were set up where there were 20 chairs in each row. If only 10 seats were not occupied, how many seats were taken? Let's think step by step. The school prepared 40 x 20 = 800 chairs.
So, 800 - 10 = 790 chairs were taken.
#### 790


The `label` has the same structure as the input, except where the input contains the result of a mathematical expression. There, the first token of the result is replaced by a new token that represents the operator required to calculate that result, and the remaining tokens of the result are set to `-100` (the default `ignore_index` of PyTorch's cross-entropy loss). The model's job is therefore to predict the operator token whenever it reaches the point where the result of a mathematical expression should appear. Once the model is fine-tuned to do this, we'll do some additional processing in the inference method to formulate the arguments to be sent to an external calculator tool. For other tools, the mechanism would be the same.

In [None]:
# Render the label as text, with each operator token shown by its name.
ignored = label == -100      # positions masked in the label
is_tool = label >= 32000     # positions holding an operator (tool) token
# Keep the label id wherever it equals the input id; every other position becomes
# id 0, which shows up as "⁇" in the decoded text once the masked positions are dropped.
visible = torch.where(inp == label, label, torch.zeros_like(label))
lab = tokenizer.decode(visible[~ignored].tolist())
# Invert operator -> index once instead of scanning func_dict for every tool token.
op_names = {idx: name for name, idx in func_dict.items()}
# Substitute the placeholders left to right, in the order the tool tokens occur.
for tool_id in label[is_tool].tolist():
    lab = lab.replace("⁇", op_names[tool_id - 32000], 1)

In [None]:
# Print the rendered label text built in the previous cell.
print(lab)

For the school play, 40 rows of chairs were set up where there were 20 chairs in each row. If only 10 seats were not occupied, how many seats were taken? Let's think step by step. The school prepared 40 x 20 =  <multiply>  chairs.
So, 800 - 10 =  <subtract>  chairs were taken.
#### 790


In [None]:
#| hide
# Run nbdev's export step (presumably writes the `#| export` cells to the `datasets` module named by `default_exp`).
import nbdev; nbdev.nbdev_export()