In [1]:
import pandas as pd
import numpy as np
from typing import Dict
from allennlp.data import DatasetReader, Instance, Vocabulary
from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.fields import TextField, LabelField
from allennlp.data.token_indexers import SingleIdTokenIndexer
from allennlp.data.tokenizers import WhitespaceTokenizer

In [2]:
class ULFReader(DatasetReader):
    """Dataset reader that converts the rows of a JSON table into instances.

    The file is parsed with ``pd.read_json`` and its column labels are cast to
    strings. Column ``'1'`` supplies the text that gets tokenized; columns
    ``'0'``, ``'2'`` and ``'3'`` are wrapped in label fields named ``ID``,
    ``ULF`` and ``ULF_AMR`` respectively.
    """

    def __init__(self,
                 max_instances = 100000,
                 tokenizer = None,
                 token_indexers = None,
                 max_tokens= None):
        """Configure the reader.

        Args:
            max_instances: Forwarded unchanged to ``DatasetReader.__init__``.
            tokenizer: Object exposing ``tokenize(text)``; a
                ``WhitespaceTokenizer`` is created when this is falsy.
            token_indexers: Indexers handed to every ``TextField``; when falsy,
                a single ``SingleIdTokenIndexer`` under the key ``'tokens'``
                is used.
            max_tokens: When truthy, token lists are cut to this many entries.
        """
        super().__init__(max_instances=max_instances)
        self.max_tokens = max_tokens
        self.tokenizer = tokenizer if tokenizer else WhitespaceTokenizer()
        self.token_indexers = (
            token_indexers if token_indexers else {'tokens': SingleIdTokenIndexer()}
        )

    def text_to_instance(self, tokens, row):
        """Bundle the tokenized text and three label columns into an ``Instance``.

        Args:
            tokens: Already-tokenized text for the ``'text'`` field.
            row: Mapping-like record indexed by the string keys ``'0'``,
                ``'2'`` and ``'3'``.

        Returns:
            An ``Instance`` with the fields ``text``, ``ID``, ``ULF`` and
            ``ULF_AMR``.
        """
        return Instance({
            'text': TextField(tokens, self.token_indexers),
            # The value in column '0' is stringified before being used as a label.
            'ID': LabelField(str(row['0'])),
            'ULF': LabelField(row['2']),
            'ULF_AMR': LabelField(row['3']),
        })

    def _read(self, file_path: str):
        """Yield one ``Instance`` per row of the JSON file at ``file_path``."""
        frame = pd.read_json(file_path)
        # Cast the column labels to strings so rows can be indexed with '0'..'3'.
        frame.columns = frame.columns.astype(str)
        for _index, record in frame.iterrows():
            pieces = self.tokenizer.tokenize(record['1'])
            # Truncate only when a (truthy) token limit has been configured.
            kept = pieces[:self.max_tokens] if self.max_tokens else pieces
            yield self.text_to_instance(kept, record)

In [3]:
# Relative path of the JSON dataset that is handed to the data loader below.
data_path = "ulf-1.0-stog.json"

In [4]:
# Create the reader and a loader that groups its instances into batches of 10.
reader = ULFReader()
dataloader = MultiProcessDataLoader(
    reader=reader,
    data_path=data_path,
    batch_size=10,
)

# Build the vocabulary from the loader's instances, then index the loader with it.
instances = dataloader.iter_instances()
vocab = Vocabulary.from_instances(instances)
dataloader.index_with(vocab)

# Print the first batch only, then stop iterating.
for batch in dataloader:
    print(batch)
    break

loading instances: 0it [00:00, ?it/s]

building vocab: 0it [00:00, ?it/s]

{'text': {'tokens': {'tokens': tensor([[ 283,  180,  771,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0],
        [1345,   11,  284, 1346,    7,  140,   90,   37, 1347, 1348,  251,  772,
            0,    0,    0,    0,    0],
        [  81,  773,   40, 1349,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0],
        [1350,    2, 1351,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0],
        [ 147,   27,   70,   47,  774,    4, 1352,   95,   47,   51,   36,  405,
           70,   47,  774,    4, 1353],
        [   7,   71,   63,  141,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0],
        [   7,  775,   40, 1354,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0],
        [ 776, 1355,   11,  406,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0],
 