In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os, sys
import json
import numpy as np
import pandas as pd
import numpy.random as nr

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR, StepLR
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

from torchvision import datasets, transforms, models

import lib.pytorch_trainer as ptt

use_gpu = torch.cuda.is_available()
print('GPU available:', use_gpu)

GPU available: False


## Dataset IMDB

### Lendo do disco

O dataset é composto de 25 mil amostras de treinamento e 25 mil amostras de teste.
Cada amostra possui um texto de tamanho que varia entre 11 e 2494 palavras. 
Cada amostra tem um rótulo igual a 1 para denominar sentimento positivo e 0 para sentimento negativo.

In [2]:
word_index = json.load(open('/data/datasets/IMDB/imdb_word_index.json'))
data = np.load('/data/datasets/IMDB/imdb.npz')
x_test, x_train, y_train, y_test = data['x_test'], data['x_train'], data['y_train'], data['y_test']

n_words = len(word_index)
n_train = x_train.shape[0]
n_test  = x_test.shape[0]

word_list = [None for i in range(n_words+1)]
for k, v in word_index.items():
    word_list[v] = k

n_words, n_train, n_test

(88584, 25000, 25000)

In [3]:
def print_stats(x_train, x_test, word_list=None):
    print('Train word index limits:', min([min(s) for s in x_train]), max([max(s) for s in x_train]))
    print('Test word index limits:', min([min(s) for s in x_test]), max([max(s) for s in x_test]))
    print('\nTrain sequence length limits:', min([len(x) for x in x_train]), max([len(x) for x in x_train]))
    print('Test sequence length limits:', min([len(x) for x in x_test]), max([len(x) for x in x_test]))
    if word_list:
        print('\nMost frequent words:', word_list[1:11])
    
print_stats(x_train, x_test, word_list)

Train word index limits: 1 88584
Test word index limits: 1 88581

Train sequence length limits: 10 2493
Test sequence length limits: 6 2314

Most frequent words: ['the', 'and', 'a', 'of', 'to', 'is', 'br', 'in', 'it', 'i']


### Limitando o vocabulário

Retiramos das sequências as palavras com índice maior que o valor especificado em `voc_size`.

In [4]:
voc_size = 10000

xtra = [[w for w in x if (w < voc_size)] for x in x_train]
xval = [[w for w in x if (w < voc_size)] for x in x_test]
print_stats(xtra, xval)

Train word index limits: 1 9999
Test word index limits: 1 9999

Train sequence length limits: 9 2194
Test sequence length limits: 6 2198


### Obtendo sequências de mesmo comprimento

Fazemos com que todas as sequências tenham o mesmo comprimento, especificado em `seq_len`. As sequências mais longas que `seq_len` são truncadas e as menores, completadas com zeros.

In [5]:
def pad_sequences(sequences, max_len):
    def fill_seq(seq, n, seq_len, fill_value):
        zseq = [fill_value for i in range(seq_len)]
        zseq[:n] = seq
        return zseq
    
    new_seq = []
    for seq in sequences:
        n = len(seq)
        if n > max_len:
            new_seq.append(seq[:max_len])
        else:
            new_seq.append(fill_seq(seq, n, max_len, 0))
    return new_seq
    

In [6]:
seq_len = 500
xtra = pad_sequences(xtra, seq_len)
xval = pad_sequences(xval, seq_len)
print_stats(xtra, xval)

Train word index limits: 0 9999
Test word index limits: 0 9999

Train sequence length limits: 500 500
Test sequence length limits: 500 500


In [7]:
xtra = np.array(xtra, np.int)
xval = np.array(xval, np.int)
ytra = np.array(y_train, np.int)
yval = np.array(y_test, np.int)
xtra.shape, xtra.max(), xval.shape, xval.max()

((25000, 500), 9999, (25000, 500), 9999)

## A rede neural

In [8]:
embedding_dim = 100

class MyNet(nn.Module):
    
    def __init__(self, seq_len=seq_len, voc_size=voc_size, embed_dim=embedding_dim, 
                 n_conv_filters=128, conv_kernel_size=5):
        super().__init__()
        
        k = conv_kernel_size - 1
        n = (((seq_len - k) // 2 - k) // 2 - k) // 2
        self.flat_size = n * n_conv_filters
        
        self.emb = nn.Embedding(voc_size, embed_dim)
        nn.init.xavier_uniform(self.emb.weight)

        self.conv1 = nn.Conv1d(embed_dim, n_conv_filters, conv_kernel_size)
        self.conv2 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        self.conv3 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        
        self.fc1 = nn.Linear(self.flat_size, 128)
        self.fc2 = nn.Linear(128, 2)

    def forward(self, x):
        x = self.emb(x)
        
        x = x.transpose(1, 2)
        
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5)
    
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2)

        x = F.relu(self.conv3(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5)
        
        x = x.view(-1, self.flat_size)
        
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5)
        
        x = self.fc2(x)
        
        return x


In [9]:
net = MyNet()

xx = Variable(torch.from_numpy(xtra[:10]))

aa = net(xx)


In [10]:
aa, ytra.shape

(Variable containing:
 1.00000e-02 *
   1.7076 -6.4162
   1.7229 -6.4496
   1.7073 -6.4071
   1.7154 -6.4046
   1.7056 -6.3919
   1.7080 -6.4193
   1.7061 -6.3997
   1.7131 -6.4028
   1.7084 -6.4331
   1.7085 -6.4274
 [torch.FloatTensor of size 10x2], (25000,))

## Treinamento

In [11]:
trainIt = True
resetIt = False

# Callbacks
# ---------
accuracy_cb = ptt.AccuracyMetric()
chkpt_cb = ptt.ModelCheckpoint('../../models/sentimento_1', reset=resetIt, verbose=1)
print_cb = ptt.PrintCallback()
plot_cb = ptt.PlotCallback()

# Model, optimizer and learning rate scheduler
# --------------------------------------------
model = MyNet()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=5, gamma=0.75)

# Network trainer
# ---------------
training_parameters = {
    'model':         model, 
    'criterion':     nn.CrossEntropyLoss(),
    'optimizer':     optimizer, 
    'lr_scheduler':  scheduler, 
    'callbacks':     [accuracy_cb, print_cb],
}
trainer = ptt.DeepNetTrainer(**training_parameters)


In [12]:
if trainIt:
    M = 1000
    Xtrain = torch.from_numpy(xtra)[:M]
    ytrain = torch.from_numpy(ytra)[:M]
    Xvalid = torch.from_numpy(xval)[:M]
    yvalid = torch.from_numpy(yval)[:M]
    
    trainer.fit(10, Xtrain, ytrain, valid_data=(Xvalid, yvalid))
else:
    print('\nTraining disabled.\nThis model was trained for {} epochs.'.format(trainer.last_epoch))

Start training for 10 epochs
  1:  21.2s   T: 0.16025 0.99000   V: 0.00002 1.00000 best
  2:  21.6s   T: 0.00000 1.00000   V: 0.00000 1.00000 best
Stop training at epoch: 2/10


## Avaliação