## Migration to AWS

The idea of this notebook is to use torchtext in a manner that is compatible with the migration to AWS. The following approach aims to 
avoid having to rely on installing torchtext and updating spaCy on the network. 

In [1]:

%matplotlib inline
import numpy as np 
import pandas as pd 
import torch

import os




In [8]:
import pickle



In [9]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load field files produced by a trusted pipeline.
# The context manager closes the file handle as soon as the field is loaded.
with open('./custom_embeddings/train_data_field', 'rb') as field_file:
    TEXT = pickle.load(field_file)

In [10]:
# Display the vocabulary object attached to the unpickled field.
TEXT.vocab

<torchtext.vocab.Vocab at 0x12ba5b860>

# Translating to data loader

In [11]:
import json

In [12]:
def _get_data(data_prefix, data_dir='./data', with_labels= True):
    #print("Get train data loader.")

    with open(os.path.join(data_dir,
              data_prefix+'_text_list.json'), 'r') as f:
        train_X = json.load(f)
    train_X = [torch.tensor(t) for t in train_X] #.long()
    
    if with_labels:
        with open(os.path.join(data_dir,
                  data_prefix+'_labels_list.json'), 'r') as f:
            train_y = json.load(f)

        train_y = [torch.tensor(t) for t in train_y] # .float().squeeze()
        return train_X , train_y
    
    else: 
        return train_X

In [13]:
class Data_iterator:
    """Re-iterable source of ``(text, label)`` pairs for one labelled split."""

    def __init__(self, data_prefix, data_dir='./data'):
        # _get_data returns a (texts, labels) pair when labels are requested,
        # which is its default.
        texts, labels = _get_data(data_prefix, data_dir)
        self.X = texts
        self.y = labels

    def __iter__(self):
        # A zip object is already an iterator; a fresh one is built per call.
        return zip(self.X, self.y)

In [14]:
class Test_iterator:
    """Re-iterable source of text tensors for an unlabelled split."""

    def __init__(self, data_prefix, data_dir='./data'):
        # Only the text list is loaded; no labels file is read.
        self.X = _get_data(data_prefix, data_dir, with_labels= False)

    def __iter__(self):
        yield from self.X

In [15]:
# Build the iterables for the 'train' and 'val' splits (read from the default ./data directory).
iterator_train = Data_iterator('train')
iterator_val = Data_iterator('val')

## Build model

In [23]:
import torch.nn as nn
import torch.nn.functional as F
class CNN(nn.Module):
    """Convolutional text classifier.

    An embedding layer feeds several parallel 2-D convolutions (one per
    entry of ``filter_sizes``, each spanning the full embedding width). Each
    convolution output is max-pooled over the sentence dimension, the pooled
    features are concatenated, passed through dropout and projected by a
    linear layer to ``output_dim`` logits.

    Args:
        vocab_size: number of rows of the embedding table.
        embedding_dim: width of each embedding vector.
        n_filters: output channels of every convolution.
        filter_sizes: iterable of kernel heights (number of tokens covered).
        output_dim: number of output logits.
        dropout: dropout probability applied to the concatenated features.
        pad_idx: index of the padding token; its embedding row is
            zero-initialised and receives no gradient updates.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()
        
        # pad_idx was previously accepted but ignored; wiring it in keeps the
        # padding vector from being trained.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        
        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim)) 
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        """Return logits of shape [batch size, output_dim] for ``text``.

        ``text`` is a 2-D index tensor laid out as [sent len, batch size].
        The sentence length must be at least the largest filter size,
        otherwise the convolution raises.
        """
        
        #text = [sent len, batch size]
        
        text = text.permute(1, 0)
                
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        # Add the single input channel expected by Conv2d.
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conv_n = [batch size, n_filters, sent len - filter_sizes[n] + 1]
        
        # Max over the whole remaining sentence dimension.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)


In [24]:
# Hyper-parameters passed to the CNN constructor below.
INPUT_DIM = len(TEXT.vocab) # 20002
EMBEDDING_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2,3,4]
# Number of output logits; six label columns are written to the submission later in the notebook.
OUTPUT_DIM = 6
DROPOUT = 0.5
# Vocabulary index of the field's padding token.
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token] # 1

model = CNN(INPUT_DIM, EMBEDDING_DIM, N_FILTERS, 
            FILTER_SIZES ,OUTPUT_DIM, DROPOUT, PAD_IDX)

In [25]:

def count_parameters(model):
    """Return the total element count of the model's parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 2,092,306 trainable parameters


In [27]:
from torchsummaryX import summary

# Take the text tensor of the first training batch as the sample input for
# the summary; next(iter(...)) fetches it directly instead of a loop + break.
aux = next(iter(iterator_train))[0]

summary(model, aux )

                      Kernel Shape       Output Shape   Params Mult-Adds
Layer                                                                   
0_embedding           [100, 20002]     [256, 13, 100]  2.0002M   2.0002M
1_convs.Conv2d_0  [1, 100, 2, 100]  [256, 100, 12, 1]    20.1k    240.0k
2_convs.Conv2d_1  [1, 100, 3, 100]  [256, 100, 11, 1]    30.1k    330.0k
3_convs.Conv2d_2  [1, 100, 4, 100]  [256, 100, 10, 1]    40.1k    400.0k
4_dropout                        -         [256, 300]        -         -
5_fc                      [300, 6]           [256, 6]   1.806k      1.8k
---------------------------------------------------------------------------
                         Totals
Total params          2.092306M
Trainable params      2.092306M
Non-trainable params        0.0
Mult-Adds                2.972M


Unnamed: 0_level_0,Kernel Shape,Output Shape,Params,Mult-Adds
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0_embedding,"[100, 20002]","[256, 13, 100]",2000200.0,2000200.0
1_convs.Conv2d_0,"[1, 100, 2, 100]","[256, 100, 12, 1]",20100.0,240000.0
2_convs.Conv2d_1,"[1, 100, 3, 100]","[256, 100, 11, 1]",30100.0,330000.0
3_convs.Conv2d_2,"[1, 100, 4, 100]","[256, 100, 10, 1]",40100.0,400000.0
4_dropout,-,"[256, 300]",,
5_fc,"[300, 6]","[256, 6]",1806.0,1800.0


In [28]:
# Embedding vectors stored on the vocabulary of the loaded field.
pretrained_embeddings = TEXT.vocab.vectors

# Print the shape of the vectors; they are copied into the model's embedding layer in a later cell.
print(pretrained_embeddings.shape)

torch.Size([20002, 100])


In [29]:
# Overwrite the embedding layer's weights in place with the vectors taken from the vocabulary.
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[-0.1117, -0.4966,  0.1631,  ...,  1.2647, -0.2753, -0.1325],
        [-0.8555, -0.7208,  1.3755,  ...,  0.0825, -1.1314,  0.3997],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-1.0361, -0.3528, -0.4494,  ..., -0.3391, -0.0521, -0.2626],
        [-0.8892,  0.3043,  0.9224,  ..., -0.2417, -0.1520,  0.0683],
        [ 0.2916,  0.0795, -0.0095,  ...,  0.3854,  0.3772, -1.5852]])

In [30]:
# Vocabulary index of the field's unknown-word token.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]

# Reset the embedding rows of the unknown and padding tokens to all zeros.
model.embedding.weight.data[UNK_IDX] = torch.zeros(EMBEDDING_DIM)
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-1.0361, -0.3528, -0.4494,  ..., -0.3391, -0.0521, -0.2626],
        [-0.8892,  0.3043,  0.9224,  ..., -0.2417, -0.1520,  0.0683],
        [ 0.2916,  0.0795, -0.0095,  ...,  0.3854,  0.3772, -1.5852]])


In [31]:
# NOTE(review): repeats the print at the end of the previous cell; the weights are not modified in between.
print(model.embedding.weight.data)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.0382, -0.2449,  0.7281,  ..., -0.1459,  0.8278,  0.2706],
        ...,
        [-1.0361, -0.3528, -0.4494,  ..., -0.3391, -0.0521, -0.2626],
        [-0.8892,  0.3043,  0.9224,  ..., -0.2417, -0.1520,  0.0683],
        [ 0.2916,  0.0795, -0.0095,  ...,  0.3854,  0.3772, -1.5852]])


## Train our model

In [32]:
import torch.optim as optim

# Adam over all model parameters, using the optimizer's default settings.
optimizer = optim.Adam(model.parameters())

In [34]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The loss takes raw logits; the sigmoid is applied inside the criterion.
criterion = nn.BCEWithLogitsLoss()

# Move the model and the loss module to the selected device.
model = model.to(device)
criterion = criterion.to(device)

In [35]:
import numpy
from sklearn.metrics import roc_auc_score
def roc_auc(preds, y):
    """
    Return sklearn's ``roc_auc_score(y, preds)`` for the given labels and scores.

    Despite the local name ``acc``, the returned value is a ROC AUC, not an
    accuracy. Side effect: ``y`` and ``preds`` are also stored in the
    module-level globals ``var_y`` and ``var_preds``.
    """
    # Disabled earlier pre-processing (sigmoid / numpy conversion), kept commented out:
    #round predictions to the closest integer
    #rounded_preds = torch.sigmoid(preds)
    
    #assert preds.size()==y.size()
    
    #reds=rounded_preds.detach().numpy()

    #y=y.numpy()
    
    # NOTE(review): these globals look like debugging leftovers that expose the
    # last inputs for inspection; nothing else in the visible notebook reads them.
    global var_y
    global var_preds
    var_y = y
    var_preds = preds

    acc = roc_auc_score(y, preds)

    
    return acc

In [36]:


def train(model, iterator, optimizer, criterion):
    """Run one training epoch.

    Args:
        model: module called as ``model(batch_X)``; put into train mode here.
        iterator: iterable yielding ``(batch_X, batch_y)`` tensor pairs.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, batch_y)`` on the
            raw model outputs.

    Returns:
        ``(mean_loss, auc)`` -- the loss averaged over batches and the value
        of ``roc_auc`` computed on the sigmoid of all predictions of the
        epoch stacked together.

    Raises:
        ZeroDivisionError: if ``iterator`` yields no batches.
    """
    model.train()

    # Run every batch on the device the model lives on, so the loop also works
    # after the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    preds_list = []
    labels_list = []

    iterations = 0
    for batch_X, batch_y in iterator:
        iterations += 1

        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()

        predictions = model(batch_X).squeeze(1)

        loss = criterion(predictions, batch_y)

        loss.backward()

        optimizer.step()

        # .cpu() is required before .numpy() whenever the tensors are on a GPU.
        preds_list.append(torch.sigmoid(predictions).detach().cpu().numpy())
        labels_list.append(batch_y.detach().cpu().numpy())

        epoch_loss += loss.item()

    return epoch_loss / iterations, roc_auc(np.vstack(preds_list), np.vstack(labels_list))

In [37]:


def evaluate(model, iterator, criterion):
    """Evaluate the model on one pass over ``iterator`` without gradient tracking.

    Args:
        model: module called as ``model(batch_X)``; put into eval mode here.
        iterator: iterable yielding ``(batch_X, batch_y)`` tensor pairs.
        criterion: loss called as ``criterion(predictions, batch_y)`` on the
            raw model outputs.

    Returns:
        ``(mean_loss, auc)`` -- the loss averaged over batches and the value
        of ``roc_auc`` computed on the sigmoid of all predictions stacked
        together.

    Raises:
        ZeroDivisionError: if ``iterator`` yields no batches.
    """
    model.eval()

    # Run every batch on the device the model lives on, so the loop also works
    # after the model has been moved to a GPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    preds_list = []
    labels_list = []

    iterations = 0
    with torch.no_grad():
        for batch_X, batch_y in iterator:
            iterations += 1

            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)

            predictions = model(batch_X).squeeze(1)

            loss = criterion(predictions, batch_y)

            epoch_loss += loss.item()

            # .cpu() is required before .numpy() whenever the tensors are on a GPU.
            preds_list.append(torch.sigmoid(predictions).detach().cpu().numpy())
            labels_list.append(batch_y.detach().cpu().numpy())

    return epoch_loss / iterations, roc_auc(np.vstack(preds_list), np.vstack(labels_list))



In [38]:
import time

def epoch_time(start_time, end_time):
    """Split the interval between two timestamps (in seconds) into whole minutes and seconds.

    Returns an ``(minutes, seconds)`` tuple of ints; both parts are truncated
    with ``int``.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds

In [39]:
# The embedding weights are trained together with the rest of the network.
model.embedding.weight.requires_grad = True


N_EPOCHS = 4

best_valid_loss = float('inf')


for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    # train()/evaluate() return (mean loss, ROC AUC). The *_acc variable names
    # are kept as they were, but the value is a ROC AUC, not an accuracy.
    train_loss, train_acc = train(model, iterator_train, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, iterator_val, criterion)

    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Checkpoint only when the validation loss improves.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut2-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train ROC AUC: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. ROC AUC: {valid_acc*100:.2f}%')



jaja
juju
Epoch: 01 | Epoch Time: 2m 56s
	Train Loss: 0.081 | Train Acc: 88.96%
	 Val. Loss: 0.051 |  Val. Acc: 96.63%
jaja
juju
Epoch: 02 | Epoch Time: 2m 54s
	Train Loss: 0.055 | Train Acc: 96.34%
	 Val. Loss: 0.048 |  Val. Acc: 97.57%
jaja
juju
Epoch: 03 | Epoch Time: 2m 54s
	Train Loss: 0.049 | Train Acc: 97.35%
	 Val. Loss: 0.047 |  Val. Acc: 98.00%
jaja
juju
Epoch: 04 | Epoch Time: 2m 55s
	Train Loss: 0.045 | Train Acc: 98.08%
	 Val. Loss: 0.048 |  Val. Acc: 98.02%


In [40]:
# Store an independent CPU copy of the trained embeddings: the pickled field
# then does not share storage with the live model and can be unpickled on a
# machine without a GPU.
TEXT.vocab.vectors = model.embedding.weight.detach().cpu().clone()

In [41]:
import pickle

# Write through a context manager so the file is flushed and closed
# deterministically once the field has been serialised.
with open('./custom_embeddings/train_data_field_trained', 'wb') as field_file:
    pickle.dump(TEXT, field_file)

## Save word embedding

In [42]:
from tqdm import tqdm
word_list = [] 
def write_embeddings(path, embeddings, vocab):
    """Write one ``<word> <v1> <v2> ...`` text line per embedding row to ``path``.

    Row ``n`` of ``embeddings`` is paired with ``vocab.itos[n]``. A word is
    skipped when its encoded byte length differs from its character count.
    Every word that is written is also appended to the module-level
    ``word_list``.
    """
    with open(path, 'w') as out_file:
        for index, row in enumerate(tqdm(embeddings)):
            token = vocab.itos[index]
            # Only keep words whose encoding is one byte per character
            # (i.e. skip words with unicode symbols).
            if len(token) == len(token.encode()):
                word_list.append(token)
                values = ' '.join(map(str, row.tolist()))
                out_file.write(f'{token} {values}\n')

In [43]:
# Export the model's current embedding weights, one vocabulary word per line, to a text file.
write_embeddings('embeddings_conv.txt', 
                 model.embedding.weight.data, 
                 TEXT.vocab)

100%|██████████| 20002/20002 [00:01<00:00, 10388.21it/s]


## Testing 

In [44]:
from sklearn import metrics

#roc_auc(np.vstack(preds_list), np.vstack(labels_list))

In [46]:
# Iterable over the 'test' split; Test_iterator loads texts only, no labels.
iterator_test = Test_iterator('test')

In [47]:
myPreds = []
model.eval()
# Feed inputs on the device the model lives on, so this cell works on CPU and GPU alike.
test_device = next(model.parameters()).device
with torch.no_grad():
    for batch_X in iterator_test:
        predictions = model(batch_X.to(test_device)).squeeze(1)
        # Move to the CPU before converting: .numpy() fails on CUDA tensors.
        # The per-batch torch.cuda.empty_cache() calls were dropped; under
        # no_grad no graph is retained, so they only added overhead.
        myPreds.append(torch.sigmoid(predictions).cpu().numpy())
# Stack the per-batch arrays into a single (n_rows, n_labels) array.
myPreds = np.vstack(myPreds)

In [48]:
# Load the test CSV and attach one prediction column per label.
# Column i of myPreds is written to the i-th label name in the list below.
testDF= pd.read_csv("./data/test.csv")
for i, col in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
    testDF[col] = myPreds[:, i]

In [49]:
# Display the shape of the stacked predictions array.
myPreds.shape

(153164, 6)

In [50]:
# Write the submission file without the comment text column and without the DataFrame index.
testDF.drop("comment_text", axis=1).to_csv("submission_convolutional_no_torchtext_2.csv", index=False)