<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

In [None]:
#| code-fold: show
#| code-summary: "Exported source"
import os

import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import AutoTokenizer

# Set at import time, before any tokenizer is instantiated below.
# NOTE(review): presumably this avoids the Hugging Face tokenizers fork
# warning/deadlock when DataLoader runs with num_workers > 0 — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [None]:
from en_grammar_checker.config import Config

In [1]:
#| echo: false
#| output: asis
show_doc(BertClassificationDataset)

---

[source](https://github.com/rohitMalhotra07/en_grammar_checker/blob/main/en_grammar_checker/datasets.py#L17){target="_blank" style="float:right; font-size:smaller"}

### BertClassificationDataset

>      BertClassificationDataset (cnfg, df:pandas.core.frame.DataFrame,
>                                 is_test:bool=False, input_clm:str='sentence',
>                                 label_clm:str='label')

*An abstract class representing a :class:`Dataset`.

All datasets that represent a map from keys to data samples should subclass
it. All subclasses should overwrite :meth:`__getitem__`, supporting fetching a
data sample for a given key. Subclasses could also optionally overwrite
:meth:`__len__`, which is expected to return the size of the dataset by many
:class:`~torch.utils.data.Sampler` implementations and the default options
of :class:`~torch.utils.data.DataLoader`. Subclasses could also
optionally implement :meth:`__getitems__`, for speedup batched samples
loading. This method accepts list of indices of samples of batch and returns
list of samples.

.. note::
  :class:`~torch.utils.data.DataLoader` by default constructs an index
  sampler that yields integral indices.  To make it work with a map-style
  dataset with non-integral indices/keys, a custom sampler must be provided.*

In [None]:
#| code-fold: show
#| code-summary: "Exported source"
class BertClassificationDataset(Dataset):
    """Map-style dataset that tokenizes one dataframe row per item.

    Each item is ``(input_ids, attention_mask)`` for the row's sentence,
    padded/truncated to ``cnfg.context_length``. When ``is_test`` is False the
    row's label is appended as a third element (an int64 tensor).
    """

    def __init__(
        self,
        cnfg,
        df: pd.DataFrame,
        is_test: bool = False,
        input_clm: str = "sentence",
        label_clm: str = "label",
    ):
        """
        cnfg: instance of Config class; `base_model_name` and `context_length`
            are read from it
        df: dataframe holding the sentences (and the labels unless is_test)
        is_test: True if the dataframe is for inference (no label is returned)
        input_clm: column name for sentences
        label_clm: column name for label (dtype should not be object);
            not read when is_test is True
        """
        # get tokenizer from model name
        self.tokenizer = AutoTokenizer.from_pretrained(cnfg.base_model_name)
        self.df = df
        self.is_test = is_test
        self.cnfg = cnfg
        self.input_clm = input_clm
        self.label_clm = label_clm

    def __len__(self):
        """Number of rows in the wrapped dataframe."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row `idx` (positional) and return its tensors.

        Returns `(input_ids, attention_mask)` when `is_test` is True, otherwise
        `(input_ids, attention_mask, label)`.
        """
        row = self.df.iloc[idx]

        # NOTE(review): recent transformers releases document `encode_plus` as
        # deprecated in favour of calling the tokenizer directly — confirm
        # against the installed version before switching.
        encoded_dict = self.tokenizer.encode_plus(
            row[self.input_clm],  # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=self.cnfg.context_length,  # Pad & truncate all sentences.
            truncation=True,
            padding="max_length",
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors="pt",  # Return pytorch tensors.
        )

        # With return_tensors="pt" the tensors carry a leading batch axis of
        # size 1. Drop only that axis: a bare squeeze() would also remove the
        # sequence axis when context_length == 1, yielding 0-d tensors.
        input_ids = encoded_dict["input_ids"].squeeze(0)
        attention_mask = encoded_dict["attention_mask"].squeeze(0)

        if self.is_test:
            return input_ids, attention_mask

        label = torch.as_tensor(row[self.label_clm], dtype=torch.int64)
        return input_ids, attention_mask, label

In [2]:
#| echo: false
#| output: asis
show_doc(get_train_data_loader)

---

[source](https://github.com/rohitMalhotra07/en_grammar_checker/blob/main/en_grammar_checker/datasets.py#L72){target="_blank" style="float:right; font-size:smaller"}

### get_train_data_loader

>      get_train_data_loader (cnfg, df, input_clm:str='sentence',
>                             label_clm:str='label')

In [None]:
#| code-fold: show
#| code-summary: "Exported source"
def get_train_data_loader(
    cnfg,
    df,
    input_clm: str = "sentence",
    label_clm: str = "label",
):
    """Build the training DataLoader over labelled rows of `df`.

    cnfg: config object; `train_batch_size` and `num_workers` are read here
    df: dataframe holding the sentences and labels
    input_clm: column name for sentences
    label_clm: column name for label

    Returns a DataLoader that draws items in random order.
    """
    train_dataset = BertClassificationDataset(
        cnfg,
        df,
        is_test=False,
        input_clm=input_clm,
        label_clm=label_clm,
    )
    # Random sampling shuffles the row order on every pass over the loader.
    shuffled_sampler = RandomSampler(train_dataset)
    return DataLoader(
        train_dataset,
        sampler=shuffled_sampler,
        batch_size=cnfg.train_batch_size,
        num_workers=cnfg.num_workers,
    )

In [3]:
#| echo: false
#| output: asis
show_doc(get_val_data_loader)

---

[source](https://github.com/rohitMalhotra07/en_grammar_checker/blob/main/en_grammar_checker/datasets.py#L91){target="_blank" style="float:right; font-size:smaller"}

### get_val_data_loader

>      get_val_data_loader (cnfg, df, input_clm:str='sentence',
>                           label_clm:str='label')

In [None]:
#| code-fold: show
#| code-summary: "Exported source"
def get_val_data_loader(
    cnfg,
    df,
    input_clm: str = "sentence",
    label_clm: str = "label",
):
    """Build the validation DataLoader over labelled rows of `df`.

    cnfg: config object; `val_batch_size` and `num_workers` are read here
    df: dataframe holding the sentences and labels
    input_clm: column name for sentences
    label_clm: column name for label

    Returns a DataLoader that yields items in dataframe order.
    """
    val_dataset = BertClassificationDataset(
        cnfg,
        df,
        is_test=False,
        input_clm=input_clm,
        label_clm=label_clm,
    )
    # Sequential sampling keeps batches aligned with the dataframe's row order.
    ordered_sampler = SequentialSampler(val_dataset)
    return DataLoader(
        val_dataset,
        sampler=ordered_sampler,
        batch_size=cnfg.val_batch_size,
        num_workers=cnfg.num_workers,
    )

In [4]:
#| echo: false
#| output: asis
show_doc(get_test_data_loader)

---

[source](https://github.com/rohitMalhotra07/en_grammar_checker/blob/main/en_grammar_checker/datasets.py#L110){target="_blank" style="float:right; font-size:smaller"}

### get_test_data_loader

>      get_test_data_loader (cnfg, df, input_clm:str='sentence', label_clm=None)

In [None]:
#| code-fold: show
#| code-summary: "Exported source"
def get_test_data_loader(
    cnfg,
    df,
    input_clm: str = "sentence",
    label_clm=None,
):
    """Build the inference DataLoader over the rows of `df`.

    cnfg: config object; `test_batch_size` and `num_workers` are read here
    df: dataframe holding the sentences
    input_clm: column name for sentences
    label_clm: forwarded to the dataset, which is built with is_test=True

    Returns a DataLoader that yields items in dataframe order.
    """
    test_dataset = BertClassificationDataset(
        cnfg,
        df,
        is_test=True,
        input_clm=input_clm,
        label_clm=label_clm,
    )
    # Sequential sampling keeps predictions aligned with the dataframe's row order.
    ordered_sampler = SequentialSampler(test_dataset)
    return DataLoader(
        test_dataset,
        sampler=ordered_sampler,
        batch_size=cnfg.test_batch_size,
        num_workers=cnfg.num_workers,
    )

### Testing Datasets

In [None]:
# Project configuration: the cells and helpers in this notebook read the model
# name, context length, batch sizes, worker count and training path from it.
cnfg = Config()

In [None]:
# Load the tab-separated training file; it has no header row, so the four
# column names are supplied explicitly.
df_train = pd.read_csv(
    str(cnfg.train_path),
    sep="\t",
    names=["sentence_source", "label", "label_notes", "sentence"],
    header=None,
)

In [None]:
# Build the training loader using the default "sentence" / "label" columns.
train_dataloader = get_train_data_loader(cnfg, df_train)



In [None]:
# Pull a single batch to inspect: X = input ids, X2 = attention masks, Y = labels.
train_dataloader_iterator = iter(train_dataloader)
X, X2, Y = next(train_dataloader_iterator)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [None]:
# X and X2 should be (batch, context_length); Y should be (batch,).
X.shape, X2.shape, Y.shape

(torch.Size([8, 512]), torch.Size([8, 512]), torch.Size([8]))

In [None]:
# Flattening Y leaves its shape unchanged: the labels are already 1-D.
Y.view(-1).shape

torch.Size([8])

In [None]:
# Token ids for the batch; the trailing zeros are presumably padding — confirm
# against the tokenizer's pad token id.
X

tensor([[   1,  585, 1234,  ...,    0,    0,    0],
        [   1,  273,  481,  ...,    0,    0,    0],
        [   1,  512,  313,  ...,    0,    0,    0],
        ...,
        [   1,  273, 1659,  ...,    0,    0,    0],
        [   1,  918, 3721,  ...,    0,    0,    0],
        [   1, 1887,  261,  ...,    0,    0,    0]])

In [None]:
# Attention masks for the batch; presumably 1 marks real tokens and 0 marks
# padded positions — confirm against the tokenizer documentation.
X2

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])