## Migration to AWS

The idea of this notebook is to use torchtext in a manner that is compatible with the migration to AWS. The following approach aims to 
avoid having to rely on the installation of torchtext, and on updates of spaCy, on the network. 

In [1]:

%matplotlib inline
import numpy as np 
import pandas as pd 
import torch
import torchtext
from torchtext import data
import spacy
import os
import re

# Load the spaCy English model; the returned pipeline object is discarded.
# NOTE(review): presumably this call is what makes `spacy.lang.en` available
# for the attribute access on the next line -- confirm before removing it.
spacy.load('en')
# spaCy's built-in English stop-word set.
_stopwords = spacy.lang.en.stop_words.STOP_WORDS

# Request 4 OpenMP threads.
# NOTE(review): numpy and torch are already imported at this point, so this
# variable may be set too late to be picked up -- confirm.
os.environ['OMP_NUM_THREADS'] = '4'


SEED = 1234

# Seed PyTorch's RNG and ask cuDNN for deterministic algorithms.
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


# Text field: comments are lower-cased and tokenized with spaCy; batches
# carry only the token ids (include_lengths=False), not the sequence lengths.
TEXT = data.Field(lower=True, include_lengths=False, tokenize='spacy')

# Label field: one non-sequential value per example, used as-is (no
# vocabulary, no pad/unk tokens) and converted to float tensors.
LABEL = data.Field(sequential=False,
                   use_vocab=False,
                   pad_token=None,
                   unk_token=None,
                   dtype=torch.float)

# JSON key -> (attribute name on each Example, Field that processes it).
dataFields = {"comment_text": ("comment_text", TEXT),
              'toxic': ("toxic", LABEL),
              'severe_toxic': ("severe_toxic", LABEL),
              'threat': ("threat", LABEL),
              'obscene': ("obscene", LABEL),
              'insult': ("insult", LABEL),
              'identity_hate': ("identity_hate", LABEL)}

# skip_header must be False for JSON input: a JSON-lines file has no header
# row, and skip_header=True makes TabularDataset discard the first line of
# the file -- i.e. the first training example.
dataset = data.TabularDataset(path='./data/train.json',
                              format='json',
                              fields=dataFields,
                              skip_header=False)

In [2]:
import random

# NOTE: this rebinds SEED, which was 1234 when torch was seeded earlier.
SEED = 3

# Seed Python's RNG and pass its state to split() explicitly.
# random.seed() returns None, so writing `random_state=random.seed(SEED)`
# only produced a reproducible split as a side effect of the call; passing
# random.getstate() makes the intent explicit and yields the same split.
random.seed(SEED)
train_data, val_data = dataset.split(split_ratio=0.9,
                                     random_state=random.getstate())

In [3]:
# Upper bound on the number of tokens kept in the vocabulary.
MAX_VOCAB_SIZE = 20_000

# Build the vocabulary from the training split only, attach the
# "glove.6B.100d" vectors, and use normal_ to initialise the vectors of
# tokens that have no pre-trained entry.
TEXT.build_vocab(
    train_data,
    max_size=MAX_VOCAB_SIZE,
    vectors="glove.6B.100d",
    unk_init=torch.Tensor.normal_,
)

In [14]:
import pickle

# Persist the fitted TEXT field (including its vocabulary) so it can be
# reloaded without rebuilding it. The `with` block guarantees the file is
# flushed and closed; the bare open() used previously leaked the handle.
os.makedirs('./custom_embeddings', exist_ok=True)
with open('./custom_embeddings/train_data_field', 'wb') as field_file:
    pickle.dump(TEXT, field_file)

In [5]:
# Display the vocabulary object attached to TEXT by build_vocab.
TEXT.vocab

<torchtext.vocab.Vocab at 0x14d3b5e10>

In [6]:
# Number of examples per batch for both the training and validation iterators.
BATCH_SIZE = 256

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


def _comment_length(example):
    """Return the number of tokens in an example's comment (bucketing key)."""
    return len(example.comment_text)


# Bucket examples of similar length together and sort each batch by length.
train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_data, val_data),
    batch_size=BATCH_SIZE,
    sort_key=_comment_length,
    sort_within_batch=True,
    device=device,
)

# Translating to data loader

In [7]:
import json

In [8]:
def save_iterator(iterator, data_prefix, data_dir = './data', with_labels = True):
    """Dump every batch produced by ``iterator`` to JSON files in ``data_dir``.

    Two files may be written:

    * ``<data_prefix>_text_list.json`` -- always written; a list with one
      entry per batch, each entry being ``batch.comment_text.tolist()``.
    * ``<data_prefix>_labels_list.json`` -- written only when ``with_labels``
      is true; a list with one entry per batch, each entry holding one row
      per example with the six label values in the order of ``yFields``.

    Args:
        iterator: iterable of batches exposing a ``comment_text`` tensor and,
            when ``with_labels`` is true, one tensor attribute per label name.
        data_prefix: file-name prefix (e.g. ``'train'``).
        data_dir: output directory; created if it does not exist.
        with_labels: whether the batches carry labels that should be saved.

    NOTE(review): the text tensors are stored exactly as the iterator yields
    them; with a Field that is not batch-first this is presumably
    (seq_len, batch) while the label rows are (batch, 6) -- confirm that the
    consumer of these files expects that layout.
    """
    yFields = ['toxic','severe_toxic',
               'obscene','threat','insult',
               'identity_hate']

    # Single pass over the iterator; the text is collected in both modes.
    text_list = []
    labels_list = []
    for batch in iterator:
        text_list.append(batch.comment_text.tolist())
        if with_labels:
            # Stacking along dim=1 gives one row per example, which is what
            # stacking along dim 0 followed by transpose(0, 1) produced.
            labels = torch.stack([getattr(batch, y) for y in yFields], dim=1)
            labels_list.append(labels.tolist())

    # Make sure the target directory exists before writing.
    os.makedirs(data_dir, exist_ok=True)

    if with_labels:
        with open(os.path.join(data_dir,
            data_prefix+'_labels_list.json'), 'w') as f:
            json.dump(labels_list, f)

    with open(os.path.join(data_dir,
        data_prefix+'_text_list.json'), 'w') as f:
        json.dump(text_list, f)

In [9]:
# Write the text and label JSON files for the training and validation splits.
for split_iterator, split_prefix in ((train_iterator, 'train'),
                                     (valid_iterator, 'val')):
    save_iterator(split_iterator, split_prefix)

## Testing 

In [10]:
from sklearn import metrics

#roc_auc(np.vstack(preds_list), np.vstack(labels_list))

In [11]:
# The test file is loaded with the comment text only, processed by the same
# TEXT field (and therefore the same vocabulary) as the training data.
dataFields = dict(comment_text=("comment_text", TEXT))

testDataset = data.TabularDataset(
    path='./data/test.json',
    format='json',
    fields=dataFields,
    skip_header=False,
)

In [12]:
# Plain iterator over the test set: no sorting and no shuffling, so batches
# follow the order of the examples in testDataset; a single pass only.
test_iterator = torchtext.data.Iterator(
    testDataset,
    batch_size=64,
    device=device,
    sort=False,
    sort_within_batch=False,
    repeat=False,
    shuffle=False,
)

In [13]:
# The test dataset was loaded with the comment_text field only, so write the
# text file alone (no labels file).
save_iterator(test_iterator, 'test', with_labels = False)