In [200]:
import torch
import io

from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.utils import unicode_csv_reader

In [210]:
import io
from typing import List

from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import ngrams_iterator
from torchtext.data.utils import get_tokenizer
from torchtext.utils import unicode_csv_reader


def create_vocab_from_tsv(
    filepath: str, column_indices_to_use: List[int], minimum_word_freq: int = 1, ngrams: int = 1, 
):
    """Creates a PyTorch vocab object from a TSV file.

    The resulting vocab object converts words to indices for assisting in embedding and DL operations.

    Args:
        filepath: The location of the TSV file
        minimum_word_freq: How many times a word must appear to be included
        ngrams: The size of ngrams to use for the vocab
        column_indices_to_use: Which columns from the TSV are part of the actual feature set

    Returns:
        A torchtext vocab object.
    """
    unk_token = "<unk>"
    vocab = build_vocab_from_iterator(
        _tsv_iterator(filepath, ngrams=ngrams, column_indices=column_indices_to_use),
        min_freq=minimum_word_freq,
        specials=[unk_token],
    )
    vocab.set_default_index(vocab[unk_token])
    return vocab


def _tsv_iterator(data_path, ngrams, column_indices):
    # Spacy has novel tokenizer
    tokenizer = get_tokenizer("spacy")
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f, delimiter="\t")
        for row in reader:
            row_iter = [row[i] for i in column_indices]
            tokens = ' '.join(row_iter)
            yield ngrams_iterator(tokenizer(tokens), ngrams)


In [223]:
vocab = create_vocab_from_tsv("../datasets/systematic_review/phase1.train.shuf.tsv", [2])



In [216]:
tokenizer = get_tokenizer("spacy")
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: int(x) + 1


In [220]:
import io
from torchtext.utils import (
    unicode_csv_reader,
)
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from typing import Callable, List
from torch.utils import data
import torch

_default_tokenizer = get_tokenizer("basic_english")
DEFAULT_LABEL_PIPELINE = lambda x: x
DEFAULT_TEXT_PIPELINE = lambda x: _default_tokenizer(x)


def create_torch_dataloader(
    dataset: data.Dataset,
    vocab: Vocab,
    label_pipeline: Callable = DEFAULT_LABEL_PIPELINE,
    text_pipeline: Callable = DEFAULT_TEXT_PIPELINE,
    **kwargs
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _collate_batch(batch):
        label_list, text_list, offsets = [], [], [0]
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            processed_text = torch.tensor(vocab(text_pipeline(_text)), dtype=torch.int64)
            text_list.append(processed_text)
            offsets.append(processed_text.size(0))
        label_list = torch.tensor(label_list, dtype=torch.int64)
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
        text_list = torch.cat(text_list)
        return label_list.to(device), text_list.to(device), offsets.to(device)

    return data.DataLoader(dataset, collate_fn=_collate_batch, **kwargs)


class TSVRawTextIterableDataset(data.IterableDataset):
    def __init__(self, filepath: str, data_columns: List[int]):
        self._number_of_items = _get_tsv_file_length(filepath)
        self._iterator = _create_data_from_tsv(
            filepath, data_column_indices=data_columns
        )
        self._current_position = 0

    def __iter__(self):
        return self

    def __next__(self):
        item = next(self._iterator)
        self._current_position += 1
        return item

    def __len__(self):
        return self._number_of_items


class TSVRawTextMapDataset(data.Dataset):
    def __init__(self, filepath: str, data_columns: List[int]):
        self._records = [
            record
            for record in _create_data_from_tsv(
                filepath, data_column_indices=data_columns
            )
        ]

    def __getitem__(self, index):
        return self._records[index]

    def __len__(self):
        return len(self._records)


def _create_data_from_tsv(data_path, data_column_indices):
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f, delimiter="\t")
        for row in reader:
            data = [row[i] for i in data_column_indices]
            yield int(row[0]), " ".join(data)


def _get_tsv_file_length(data_path):
    with io.open(data_path, encoding="utf8") as f:
        row_count = sum(1 for row in f)

    return row_count


In [225]:
data_iter = TSVRawTextIterableDataset("../datasets/systematic_review/phase1.train.shuf.tsv", [2])
data_map = TSVRawTextMapDataset("../datasets/systematic_review/phase1.train.shuf.tsv", [2])

dl = create_torch_dataloader(data_map, vocab)

In [228]:
for idx, (label, text, offsets) in enumerate(dl):
    print(text)

tensor([3712,    9, 1171, 1001, 2687,    8,  203,  159,   90,    0])
tensor([1144,    8, 4673,    3, 1224,    3,  193,  538,  150,   21,   34,    2,
           0, 1525,    0,    7,    0])
tensor([    4,   349,     3,   276,     1,     8,   159,  3429,  4514,     1,
        21731,     2,     4,    49,     1,    22,    14,   288,     2,     0,
            7, 20050])
tensor([   4,  495,    1,    4, 4426,    1,   33,    9,  147,  530,    3,  115,
           2,    0])
tensor([    0, 21196,     1, 10534,     2,     4,   706,     1,    40,     0])
tensor([   0,    0,   30,    0,   28, 5311,   11, 1075,   32,    2,    0])
tensor([   0,   91,  727,    1,    0,  289,    7, 1475,    1, 5105,    2,    0,
           7,    0])
tensor([  197,  1114,    87,   300,    11,  1096,     4,  2261,     1,    76,
           96,   340,     2,     0,     7,     0,     7,     3, 22779])
tensor([   0,  895,    4,    0, 1131,  167, 2532])
tensor([   4,   68,   15,   77,  227,  449, 1523,    1,  579,    2,    8,   

IndexError: list index out of range

In [184]:
import io
from torchtext.utils import (
    unicode_csv_reader,
)
from torchtext.data.utils import get_tokenizer
from typing import Callable, List
from torch.utils import data
import torch

_default_tokenizer = get_tokenizer("basic_english")
DEFAULT_LABEL_PIPELINE = lambda x: x
DEFAULT_TEXT_PIPELINE = lambda x: _default_tokenizer(x)


def create_torch_dataloader(
    dataset: data.Dataset,
    label_pipeline: Callable = DEFAULT_LABEL_PIPELINE,
    text_pipeline: Callable = DEFAULT_TEXT_PIPELINE,
    **kwargs
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def _collate_batch(batch):
        label_list, text_list, offsets = [], [], [0]
        for (_label, _text) in batch:
            label_list.append(label_pipeline(_label))
            print(_text)
            processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
            text_list.append(processed_text)
            offsets.append(processed_text.size(0))
        label_list = torch.tensor(label_list, dtype=torch.int64)
        offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)
        text_list = torch.cat(text_list)
        return label_list.to(device), text_list.to(device), offsets.to(device)

    return data.DataLoader(dataset, collate_fn=_collate_batch, **kwargs)


class TSVRawTextIterableDataset(data.IterableDataset):
    def __init__(self, filepath: str, data_columns: List[int]):
        self._number_of_items = _get_tsv_file_length(filepath)
        self._iterator = _create_data_from_tsv(
            filepath, data_column_indices=data_columns
        )
        self._current_position = 0

    def __iter__(self):
        return self

    def __next__(self):
        item = next(self._iterator)
        self._current_position += 1
        return item

    def __len__(self):
        return self._number_of_items


class TSVRawTextMapDataset(data.Dataset):
    def __init__(self, filepath: str, data_columns: List[int]):
        self._number_of_items = _get_tsv_file_length(filepath)
        self._records = [
            record
            for record in _create_data_from_tsv(
                filepath, data_column_indices=data_columns
            )
        ]

    def __getitem__(self, index):
        return self._records[index]

    def __len__(self):
        return self._number_of_items


def _create_data_from_tsv(data_path, data_column_indices):
    with io.open(data_path, encoding="utf8") as f:
        reader = unicode_csv_reader(f, delimiter="\t")
        for row in reader:
            data = [row[i] for i in data_column_indices]
            yield int(row[0]), " ".join(data)


def _get_tsv_file_length(data_path):
    with io.open(data_path, encoding="utf8") as f:
        row_count = sum(1 for row in f)

    return row_count


In [185]:
dl = create_torch_dataloader(data_map)

In [186]:
for idx, (label, text, offsets) in enumerate(dl):
    print(offsets)

Misoprostol for treating postpartum haemorrhage: A randomized controlled trial [ISRCTN72263357]


ValueError: too many dimensions 'str'

In [None]:
text