In [1]:
import torch
import pandas as pd
import os
import json
import numpy as np

In [2]:
class Dataset(torch.utils.data.Dataset):
    """Dataset over three containers that are indexed with the same position.

    Each item is a dict holding the input ids, the label and the attention
    mask found at the requested index.
    """

    def __init__(self, X, labels, attention_masks, BATCH_SIZE_FLAG=32):
        """Keep references to the inputs, labels, masks and the batch size.

        ``BATCH_SIZE_FLAG`` is only stored on the instance; the class itself
        never reads it.
        """
        self.X, self.y = X, labels
        self.attention_masks = attention_masks
        self.BATCH_SIZE_FLAG = BATCH_SIZE_FLAG

    def __len__(self):
        """Return the number of samples, i.e. the first dimension of ``X``."""
        sample_count = self.X.shape[0]
        return sample_count

    def __getitem__(self, index):
        """Return the sample stored at ``index`` as a dict."""
        # The same index is applied to all three containers, so they are
        # expected to be aligned position by position.
        return dict(
            input_ids=self.X[index],
            labels=self.y[index],
            attention_mask=self.attention_masks[index],
        )

In [4]:
def _read_table(filepath):
    """Load ``filepath`` into a DataFrame, picking the reader from the extension.

    ``.json`` / ``.jsonl`` / ``.ndjson`` files are read as JSON lines; anything
    else is read as CSV.
    """
    extension = os.path.splitext(str(filepath))[1].lower()
    if extension in (".json", ".jsonl", ".ndjson"):
        # ``lines=True`` is a ``read_json`` option; ``read_csv`` rejects it.
        return pd.read_json(filepath, lines=True)
    return pd.read_csv(filepath)


def _parse_rationale(rationale):
    """Best-effort conversion of a column of rationale strings into lists."""
    try:
        return rationale.apply(json.loads)
    except (TypeError, ValueError):
        # Not valid JSON (json.JSONDecodeError is a ValueError): fall through
        # to the bracketed, whitespace-separated form used for the wikiattack
        # rationale strings.
        pass
    try:
        return rationale.apply(lambda s: s.strip("[").strip("]").split())
    except AttributeError:
        # Values are not strings, so there is nothing to parse; keep the
        # column unchanged rather than failing the whole load.
        return rationale


def create_dataloader(model, classes, filepath, batch_size=32, max_rows=None, class_specific=None, max_len=512, return_dataset=False, name=None, shuffle=True):
    """Read a data file, tokenize its ``text`` column and wrap it for training.

    Args:
        model: object exposing ``tokenize(text)``, which must return an
            ``(input_ids, attention_mask)`` pair for a single text.
            NOTE(review): all rows presumably need the same length so the
            results can be stacked into one tensor -- confirm against the
            tokenizer.
        classes: passed through to ``create_label_tensor`` (defined outside
            this function).
        filepath: CSV file, or JSON-lines file when the extension is
            ``.json``, ``.jsonl`` or ``.ndjson``. Must contain a ``text``
            column.
        batch_size: batch size of the returned DataLoader; also stored on the
            dataset as ``BATCH_SIZE_FLAG``.
        max_rows: if not None, keep only the first ``max_rows`` rows that
            survive the null filtering.
        class_specific, max_len, name: accepted for interface compatibility;
            not used by this function.
        return_dataset: when True, return the ``Dataset`` instead of a
            DataLoader.
        shuffle: whether the returned DataLoader shuffles (default True).

    Returns:
        A ``Dataset`` if ``return_dataset`` is True, otherwise a
        ``torch.utils.data.DataLoader`` over it.

    Raises:
        KeyError: if the file has no ``text`` column.
        ValueError: if no rows remain after filtering.
    """
    data_df = _read_table(filepath)
    data_df = data_df[data_df['text'].notna()].reset_index(drop=True)

    # The rationale column is optional. When present, rows without a rationale
    # are dropped and the remaining string values are converted to lists.
    if 'rationale' in data_df.columns:
        data_df = data_df[data_df['rationale'].notna()].reset_index(drop=True)
        data_df['rationale'] = _parse_rationale(data_df['rationale'])

    if max_rows is not None:
        # Copy so the column assignments below write to an independent frame
        # instead of a slice of the original.
        data_df = data_df.iloc[:max_rows].copy()

    if data_df.empty:
        raise ValueError(f"No usable rows found in {filepath!r}")

    input_ids, attention_masks = zip(*data_df['text'].map(model.tokenize))
    data_df['input_ids'], data_df['attention_mask'] = input_ids, attention_masks

    # Build the tensors from the tokenizer output itself rather than from the
    # DataFrame columns, so the conversion does not depend on index labels.
    input_id_tensor = torch.tensor(input_ids)
    attention_mask_tensor = torch.tensor(attention_masks)

    labels_tensor = create_label_tensor(data_df, classes)

    dataset_ds = Dataset(input_id_tensor, labels_tensor, attention_mask_tensor,
                         BATCH_SIZE_FLAG=batch_size)
    if return_dataset:
        return dataset_ds
    return torch.utils.data.DataLoader(dataset_ds, batch_size=dataset_ds.BATCH_SIZE_FLAG, shuffle=shuffle)

In [3]:
def prepare_data(model, classes, data_dir, train_path=None, dev_path=None, test_path=None, batch_size=32, max_rows=None, max_len=512, return_dataset=False, name=None):
    """Build the train and dev loaders (or datasets) with ``create_dataloader``.

    ``train_path`` and ``dev_path`` are each forwarded to ``create_dataloader``
    together with the same ``max_rows``, ``batch_size``, ``max_len``,
    ``return_dataset`` and ``name`` settings. ``data_dir`` and ``test_path``
    are accepted but not used here; no test split is built.

    Returns:
        A ``(train, dev)`` tuple of whatever ``create_dataloader`` returns.
    """
    shared_options = dict(
        max_rows=max_rows,
        batch_size=batch_size,
        max_len=max_len,
        return_dataset=return_dataset,
        name=name,
    )
    # Train is built first, then dev, each from its own path.
    train_split, dev_split = [
        create_dataloader(model, classes, split_path, **shared_options)
        for split_path in (train_path, dev_path)
    ]
    return train_split, dev_split

In [None]:
# Scratch call that builds the training Dataset (return_dataset=True).
# NOTE(review): `0` is passed as `model`, but create_dataloader calls
# `model.tokenize`, so this will fail as written -- pass a real model object.
# NOTE(review): `classes`, `train_path`, `max_rows`, `batch_size`, `max_len`
# and `name` are not defined in any of the cells visible here; they must be
# set in an earlier cell before this one can run on a fresh kernel.
create_dataloader(0, classes, train_path, max_rows=max_rows, batch_size=batch_size, max_len=max_len, return_dataset=True, name=name)