In [1]:
from typing import List, Dict, Iterable
import csv
import sys
import re

import tqdm
from allennlp.common import Params
from allennlp.common.checks import ConfigurationError
from allennlp.data import DatasetReader, Instance
from allennlp.data.fields import TextField, LabelField, ListField
from allennlp.data.tokenizers import Tokenizer, SpacyTokenizer, WhitespaceTokenizer
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from nltk.corpus import stopwords
from allennlp.data import Vocabulary
from allennlp.data.batch import Batch
from allennlp.common.util import ensure_list
from overrides import overrides
import numpy as np
import torch
from allennlp.data.fields import TextField, ArrayField
from nltk.stem import PorterStemmer 
from nltk.stem import WordNetLemmatizer



In [2]:
class ToxicReader(DatasetReader):
    """Dataset reader for toxic-comment CSV files.

    Every CSV row is unpacked as ``<ignored first column>, text, *labels``.
    When labels are present they must be exactly six values, in the order
    ``toxic, severe_toxic, obscene, threat, insult, identity_hate``, each
    convertible with ``int()``.

    Parameters
    ----------
    max_length : int, optional
        If given, the raw text is cut to this many *characters* (string
        slicing) before it is tokenized.
    tokenizer : Tokenizer, optional
        Tokenizer applied to the text; defaults to ``WhitespaceTokenizer()``.
    token_indexers : Dict[str, TokenIndexer], optional
        Indexers attached to the text field; defaults to
        ``{'tokens': SingleIdTokenIndexer()}``.
    fill_in_empty_labels : bool
        When a row carries no labels, attach six zero labels instead of
        leaving the ``labels`` field out of the instance.
    clean_text : bool
        When true, the text is passed through the module-level
        ``clean_text`` function before truncation.
    """

    def __init__(self, max_length: int = None, tokenizer: Tokenizer = None,
                 token_indexers: Dict[str, TokenIndexer] = None,
                 fill_in_empty_labels: bool = False,
                 clean_text: bool = False) -> None:
        super().__init__()
        self._max_sequence_length = max_length
        self.fill_in_empty_labels = fill_in_empty_labels
        self._tokenizer = tokenizer or WhitespaceTokenizer()
        self._token_indexer = token_indexers or {'tokens': SingleIdTokenIndexer()}
        self._clean_text = clean_text

    @overrides
    def _read(self, file_path: str, skip_header: bool = True) -> Iterable[Instance]:
        """Yield one ``Instance`` per non-empty CSV row of ``file_path``.

        Parameters
        ----------
        file_path : str
            Path of the CSV file to read.
        skip_header : bool
            When true, the first row is discarded as a header.
        """
        # newline='' is what the csv module asks for so that line breaks inside
        # quoted fields are parsed by csv itself; the explicit encoding keeps
        # the result independent of the platform's locale default.
        with open(file_path, 'r', newline='', encoding='utf-8') as data_file:
            reader = csv.reader(data_file, quotechar='"', delimiter=',',
                                quoting=csv.QUOTE_ALL, skipinitialspace=True)
            if skip_header:
                # A bare next() would raise StopIteration on an empty file, which
                # inside a generator turns into RuntimeError (PEP 479).
                next(reader, None)

            for row in reader:
                # csv.reader yields [] for blank lines; unpacking those would fail.
                if not row:
                    continue
                _, text, *labels = row
                yield self.text_to_instance(text, labels)

    @overrides
    def text_to_instance(self,
                         text: str,
                         labels: List[str] = None) -> Instance:
        """Build an ``Instance`` with a ``text`` field and, optionally, ``labels``.

        Parameters
        ----------
        text : str
            Raw comment text.
        labels : List[str], optional
            Six label values (see the class docstring for the order). If empty
            or ``None``, the ``labels`` field is only added when
            ``fill_in_empty_labels`` is set, in which case all six are zero.

        Raises
        ------
        ValueError
            If ``labels`` is non-empty but does not hold exactly six values, or
            a value cannot be converted with ``int()``.
        """
        if self._clean_text:
            # NOTE(review): `clean_text` is resolved as a module-level name at call
            # time; no such function is defined in the cells shown here, so this
            # path raises NameError unless it is defined elsewhere -- confirm.
            text = clean_text(text)

        if self._max_sequence_length is not None:
            # Truncation happens on the raw string, i.e. by characters, not tokens.
            text = text[:self._max_sequence_length]

        tokenized_text = self._tokenizer.tokenize(text)
        fields = {'text': TextField(tokenized_text, self._token_indexer)}

        if labels or self.fill_in_empty_labels:
            labels = labels or [0, 0, 0, 0, 0, 0]

            # Unpacking into six names enforces the expected label count.
            toxic, severe_toxic, obscene, threat, insult, identity_hate = labels
            fields['labels'] = ListField([
                LabelField(int(value), skip_indexing=True)
                for value in (toxic, severe_toxic, obscene, threat, insult, identity_hate)
            ])

        return Instance(fields)

In [3]:
# Reader with the default tokenizer and indexer; text is cut to 5000 characters before tokenizing.
reader = ToxicReader(max_length = 5000)

In [4]:
# Read the training CSV through the reader and keep the result so single instances can be picked by position.
instances = ensure_list(reader.read('../data/train.csv'))

In [5]:
# print human readable form 
# Human-readable view of the first instance: its text field, then its labels field.
print(
    instances[0].fields['text'].human_readable_repr(),
    instances[0].fields['labels'].human_readable_repr(),
)

['Explanation', 'Why', 'the', 'edits', 'made', 'under', 'my', 'username', 'Hardcore', 'Metallica', 'Fan', 'were', 'reverted?', 'They', "weren't", 'vandalisms,', 'just', 'closure', 'on', 'some', 'GAs', 'after', 'I', 'voted', 'at', 'New', 'York', 'Dolls', 'FAC.', 'And', 'please', "don't", 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', "I'm", 'retired', 'now.89.205.38.27'] [0, 0, 0, 0, 0, 0]


Up to this point, the fields have not been indexed. The indexers have not run yet because they need a vocabulary to convert tokens into numbers (indices). Let's build a vocabulary now.

In [6]:
from allennlp.data.vocabulary import Vocabulary


In [8]:

# Build the vocabulary from the instances read from the training CSV.
vocab = Vocabulary.from_instances(instances)

building vocab:   0%|          | 0/159571 [00:00<?, ?it/s]

# Indexing

In [13]:
# Index the first instance's fields against the vocabulary built above.
instances[0].index_fields(vocab)

In [15]:
# Display the first instance as a dictionary of tensors (cell output).
instances[0].as_tensor_dict()

{'text': {'tokens': {'tokens': tensor([ 20619,    255,      2,    158,    119,    180,     30,    808,  14913,
            25808,  10615,     79,  16568,    315,   2356,  72513,     50,  10494,
               14,     62,  17355,    154,      7,   3242,     33,    421,   1500,
            33175,  12040,    123,    102,     63,    224,      2,    539,     29,
                2,     65,     41,    167,     72,   5350, 184237])}},
 'labels': tensor([0, 0, 0, 0, 0, 0])}

In [17]:
# Display the padding lengths reported per field for the first instance (cell output).
instances[0].get_padding_lengths()


{'text': {'tokens___tokens': 43}, 'labels': {'num_fields': 6}}

# Using multiple Indexers

In [18]:
from allennlp.data.token_indexers import TokenCharactersIndexer


In [19]:

# Second indexer, used alongside the single-id indexer in the reader created below.
token_character_indexer = TokenCharactersIndexer(min_padding_length=3)


In [20]:
# Same reader settings as before, but each token is now indexed twice:
# once by the single-id indexer and once by the character indexer.
reader = ToxicReader(
    max_length=5000,
    token_indexers={
        'tokens': SingleIdTokenIndexer(),
        'token_characters': token_character_indexer,
    },
)

In [21]:
# Re-read the training CSV with the two-indexer reader; this rebinds `instances`.
instances = ensure_list(reader.read('../data/train.csv'))

In [22]:

# Rebuild the vocabulary from the re-read instances; this rebinds `vocab`.
vocab = Vocabulary.from_instances(instances)

building vocab:   0%|          | 0/159571 [00:00<?, ?it/s]

In [23]:
# Index the first re-read instance against the rebuilt vocabulary.
instances[0].index_fields(vocab)

In [24]:
# Display the tensor dictionary again; the text entry now has one sub-entry per indexer key.
instances[0].as_tensor_dict()

{'text': {'tokens': {'tokens': tensor([ 20619,    255,      2,    158,    119,    180,     30,    808,  14913,
            25808,  10615,     79,  16568,    315,   2356,  72513,     50,  10494,
               14,     62,  17355,    154,      7,   3242,     33,    421,   1500,
            33175,  12040,    123,    102,     63,    224,      2,    539,     29,
                2,     65,     41,    167,     72,   5350, 184237])},
  'token_characters': {'token_characters': tensor([[34, 47, 17, 11,  4,  7,  4,  3,  6,  5,  7,  0,  0,  0,  0,  0],
           [32, 10, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
           [ 3, 10,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
           [ 2, 12,  6,  3,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
           [15,  4, 12,  2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
           [13,  7, 12,  2,  9,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
           [15, 16,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 

In [25]:
# Display the padding lengths again, now including the character-indexer entries (cell output).
instances[0].get_padding_lengths()


{'text': {'tokens___tokens': 43,
  'token_characters___token_characters': 43,
  'token_characters___num_token_characters': 16},
 'labels': {'num_fields': 6}}