# Creating dictionary and data iterators using torchtext and spacy

In [1]:
# Install torchtext and update spaCy (plus its English model).
# NOTE(review): the `spacy==2.0.18` pin below is immediately followed by
# `pip install -U spacy`, and `spacy download en` is run both before and after
# the upgrade. Confirm which versions are intended and drop the redundant steps.
!pip install torchtext
!python -m spacy download en


!pip install spacy==2.0.18
!pip install -U spacy
!python -m spacy validate
!python -m spacy download en_core_web_sm
!python -m spacy download en

Collecting torchtext
[?25l  Downloading https://files.pythonhosted.org/packages/43/94/929d6bd236a4fb5c435982a7eb9730b78dcd8659acf328fd2ef9de85f483/torchtext-0.4.0-py3-none-any.whl (53kB)
[K    100% |████████████████████████████████| 61kB 2.8MB/s ta 0:00:011
[?25hCollecting tqdm (from torchtext)
[?25l  Downloading https://files.pythonhosted.org/packages/b9/08/8505f192efc72bfafec79655e1d8351d219e2b80b0dec4ae71f50934c17a/tqdm-4.38.0-py2.py3-none-any.whl (53kB)
[K    100% |████████████████████████████████| 61kB 29.9MB/s ta 0:00:01
[?25hCollecting torch (from torchtext)
[?25l  Downloading https://files.pythonhosted.org/packages/88/95/90e8c4c31cfc67248bf944ba42029295b77159982f532c5689bcfe4e9108/torch-1.3.1-cp36-cp36m-manylinux1_x86_64.whl (734.6MB)
[K    100% |████████████████████████████████| 734.6MB 14kB/s  eta 0:00:01  9% |███                             | 66.9MB 57.9MB/s eta 0:00:12    20% |██████▋                         | 152.7MB 60.6MB/s eta 0:00:10    22% |███████            

  Running setup.py bdist_wheel for ujson ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/28/77/e4/0311145b9c2e2f01470e744855131f9e34d6919687550f87d1
  Running setup.py bdist_wheel for wrapt ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/48/5d/04/22361a593e70d23b1f7746d932802efe1f0e523376a74f321e
Successfully built dill regex ujson wrapt
[31mtensorflow 1.14.0 has requirement wrapt>=1.11.1, but you'll have wrapt 1.10.11 which is incompatible.[0m
Installing collected packages: plac, cymem, preshed, dill, regex, msgpack, murmurhash, msgpack-numpy, wrapt, thinc, ujson, spacy
  Found existing installation: msgpack 0.6.0
    Uninstalling msgpack-0.6.0:
      Successfully uninstalled msgpack-0.6.0
  Found existing installation: wrapt 1.11.2
    Uninstalling wrapt-1.11.2:
      Successfully uninstalled wrapt-1.11.2
Successfully installed cymem-2.0.3 dill-0.2.9 msgpack-0.5.6 msgpack-numpy-0.4.3.2 murmurhash-1.0.2 plac-0.9.6 preshed-2.0

[31mtensorflow 1.14.0 has requirement wrapt>=1.11.1, but you'll have wrapt 1.10.11 which is incompatible.[0m
Installing collected packages: en-core-web-sm
  Running setup.py install for en-core-web-sm ... [?25ldone
[?25hSuccessfully installed en-core-web-sm-2.2.0
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[31mtensorflow 1.14.0 has requirement wrapt>=1.11.1, but you'll have wrapt 1.10.11 which is incompatible.[0m
[33mYou are using pip version 10.0.1, however version 19.3.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/home/ec2-user/anaconda3/envs/tensorflow_p36/lib/pyth

## Migration to AWS

The idea of this notebook is to use torchtext in a manner that is compatible with the migration to AWS. The following approach aims to avoid relying on installing torchtext in the SageMaker container and on updating spaCy over the network.

In [2]:
%matplotlib inline
import numpy as np 
import pandas as pd 
import torch
import torchtext
from torchtext import data
import spacy
import os
import json


# Load the English spaCy model; the returned pipeline object is discarded here.
# NOTE(review): presumably a smoke test that the 'en' model is installed — confirm.
spacy.load('en')

# TODO: We are not removing stopwords because some comments consist only
# of stopwords. In the future we should impose a minimum length and remove stopwords.
#_stopwords = spacy.lang.en.stop_words.STOP_WORDS 

# NOTE(review): this is set after torch has already been imported — confirm
# that the thread limit actually takes effect at this point.
os.environ['OMP_NUM_THREADS'] = '4'
# Seed PyTorch's RNG and request deterministic cuDNN kernels.
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
# Creating data fields for text and labels.
# Takes some time but we only do it once

# Comment text: lower-cased and tokenized with spaCy; lengths are not returned.
TEXT = data.Field(lower=True, include_lengths=False, tokenize='spacy')

# Labels: one non-sequential float value per example, used as-is (no vocabulary,
# no padding / unknown tokens).
LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   pad_token=None,
                   unk_token=None,
                   dtype=torch.float)

# Maps each JSON key to (attribute name on the example, field that processes it).
dataFields = {"comment_text": ("comment_text", TEXT),
              'toxic': ("toxic", LABEL),
              'severe_toxic': ("severe_toxic", LABEL),
              'threat': ("threat", LABEL),
              'obscene': ("obscene", LABEL),
              'insult': ("insult", LABEL),
              'identity_hate': ("identity_hate", LABEL)}

# skip_header must be False for JSON input: every line is a record (there is no
# header row), so skip_header=True would silently discard the first training
# example. This also matches how the test set is loaded further down.
dataset = data.TabularDataset(path='./data/preprocessed_train.json',
                              format='json',
                              fields=dataFields,
                              skip_header=False)

In [18]:
# Split data into training (90%) and validation parts (10%)

import random
SEED = 3  # NOTE: replaces the SEED = 1234 defined in the setup cell from here on

# random.seed() returns None, so it must not be passed as `random_state`
# directly. Seed the global RNG first, then hand its state to split() so the
# reproducibility of the split is explicit rather than a side effect.
random.seed(SEED)
train_data, val_data = dataset.split(split_ratio=0.9,
                                     random_state=random.getstate())

In [19]:
# Build the vocabulary from the training split only, capped at MAX_VOCAB_SIZE
# entries, and attach pretrained GloVe vectors to it.

MAX_VOCAB_SIZE = 20_000  # TODO: Try larger vocab

TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    # Initializer applied to tokens without a pretrained vector.
    unk_init=torch.Tensor.normal_,
)


# Save vocab vectors and dictionary to later load into PyTorch models

In [20]:
import json

# Make sure the export directory exists so a fresh run does not fail with
# FileNotFoundError on the first write.
os.makedirs('./data_to_s3', exist_ok=True)

# Export the vocabulary's vector matrix as a plain nested list in JSON.
with open(os.path.join('./data_to_s3', 'untrained_vocab_vectors_list.json'), 'w') as f:
    json.dump(TEXT.vocab.vectors.tolist(), f)

In [21]:
# Plain-dict copy of the vocabulary's token -> index mapping.
word_dict = dict(TEXT.vocab.stoi)

# Reverse lookup: index -> token.
inverse_word_dict = {index: token for token, index in word_dict.items()}

In [22]:
# Sanity check: index 0 and the '<unk>' token should map to each other.
inverse_word_dict[0], word_dict['<unk>']

('<unk>', 0)

In [23]:
import pickle

# Directory holding every artifact that is exported by this notebook.
data_dir = './data_to_s3'

# Persist the token -> index mapping.
word_dict_path = os.path.join(data_dir, 'word_dict.pkl')
with open(word_dict_path, "wb") as out_file:
    pickle.dump(word_dict, out_file)

In [24]:

data_dir = './data_to_s3'

# Persist the index -> token mapping next to the forward dictionary.
inverse_word_dict_path = os.path.join(data_dir, 'inverse_word_dict.pkl')
with open(inverse_word_dict_path, "wb") as out_file:
    pickle.dump(inverse_word_dict, out_file)

In [25]:
# Persist the whole TEXT field object (including its built vocabulary).
text_field_path = os.path.join(data_dir, 'TEXT_data_field.pkl')
with open(text_field_path, "wb") as out_file:
    pickle.dump(TEXT, out_file)

In [26]:
BATCH_SIZE = 256

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Examples are ordered by the number of tokens in their comment text.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda example: len(example.comment_text),
    sort_within_batch=True,
    device=device,
)

# Save train and validation iterators

In [27]:
def save_iterator(iterator, data_prefix, data_dir = data_dir, with_labels = True):
    """Dump every batch produced by ``iterator`` to JSON files in ``data_dir``.

    Args:
        iterator: Iterable of batches. Each batch must expose a
            ``comment_text`` tensor and, when ``with_labels`` is true, one
            tensor attribute per label name listed below.
        data_prefix: File-name prefix, e.g. ``'train'`` produces
            ``train_text_list.json`` (and ``train_labels_list.json``).
        data_dir: Output directory. The default is the value the global
            ``data_dir`` had when this function was defined.
        with_labels: When true, the label tensors are written as well.

    Writes:
        ``<data_prefix>_labels_list.json`` (only when ``with_labels``): one
        entry per batch, with the stacked label tensors transposed so the
        label axis comes second.
        ``<data_prefix>_text_list.json``: one entry per batch, holding
        ``batch.comment_text`` converted to nested lists.
    """
    label_names = ('toxic', 'severe_toxic',
                   'obscene', 'threat', 'insult',
                   'identity_hate')

    def _dump(payload, suffix):
        # Serialize one collected list into <data_dir>/<data_prefix><suffix>.
        with open(os.path.join(data_dir, data_prefix + suffix), 'w') as handle:
            json.dump(payload, handle)

    token_batches = []
    label_batches = []
    for batch in iterator:
        token_batches.append(batch.comment_text.tolist())
        if with_labels:
            # Stack labels along a new leading axis, then swap the first two axes.
            stacked = torch.stack([getattr(batch, name) for name in label_names])
            label_batches.append(stacked.transpose(0, 1).tolist())

    # Labels are written before the text.
    if with_labels:
        _dump(label_batches, '_labels_list.json')
    _dump(token_batches, '_text_list.json')

In [28]:
# Export the training batches first, then the validation batches.
for split_prefix, split_iterator in (('train', train_iterator),
                                     ('val', valid_iterator)):
    save_iterator(split_iterator, split_prefix)

## Testing 

In [29]:
# Only the text column is read for the test set.
dataFields = {"comment_text": ("comment_text", TEXT)}

testDataset = data.TabularDataset(
    path='./data/preprocessed_test.json',
    format='json',
    fields=dataFields,
    skip_header=False,
)

In [30]:
# Iterate over the test set in its original order: no sorting, shuffling or repeating.
test_iterator = data.Iterator(
    testDataset,
    batch_size=64,
    device=device,
    sort=False,
    sort_within_batch=False,
    repeat=False,
    shuffle=False,
)

In [31]:
# Export the test batches; only the text file is written.
save_iterator(test_iterator, data_prefix='test', with_labels=False)