 # Table of Contents
<div class="toc" style="margin-top: 1em;"><ul class="toc-item" id="toc-level0"><li><span><a href="http://localhost:8888/notebooks/rnap/PyTorch/livros_classif.ipynb#Classificação-de-Textos" data-toc-modified-id="Classificação-de-Textos-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Classificação de Textos</a></span><ul class="toc-item"><li><span><a href="http://localhost:8888/notebooks/rnap/PyTorch/livros_classif.ipynb#Preâmbulo" data-toc-modified-id="Preâmbulo-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Preâmbulo</a></span></li><li><span><a href="http://localhost:8888/notebooks/rnap/PyTorch/livros_classif.ipynb#O-Dataset" data-toc-modified-id="O-Dataset-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>O Dataset</a></span><ul class="toc-item"><li><span><a href="http://localhost:8888/notebooks/rnap/PyTorch/livros_classif.ipynb#Lendo-do-disco" data-toc-modified-id="Lendo-do-disco-1.2.1"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Lendo do disco</a></span></li></ul></li><li><span><a href="http://localhost:8888/notebooks/rnap/PyTorch/livros_classif.ipynb#A-Rede-Neural" data-toc-modified-id="A-Rede-Neural-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>A Rede Neural</a></span><ul class="toc-item"><li><span><a href="http://localhost:8888/notebooks/rnap/PyTorch/livros_classif.ipynb#Preparando-a-matriz-de-embeddings" data-toc-modified-id="Preparando-a-matriz-de-embeddings-1.3.1"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Preparando a matriz de embeddings</a></span></li><li><span><a href="http://localhost:8888/notebooks/rnap/PyTorch/livros_classif.ipynb#Construindo-a-rede" data-toc-modified-id="Construindo-a-rede-1.3.2"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Construindo a rede</a></span></li></ul></li><li><span><a href="http://localhost:8888/notebooks/rnap/PyTorch/livros_classif.ipynb#Treinando" data-toc-modified-id="Treinando-1.4"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Treinando</a></span></li><li><span><a href="http://localhost:8888/notebooks/rnap/PyTorch/livros_classif.ipynb#Avaliando" data-toc-modified-id="Avaliando-1.5"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Avaliando</a></span></li></ul></li></ul></div>

# Classificação de Textos

## Preâmbulo

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os,sys
import numpy as np
import pandas as pd
import numpy.random as nr

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR, StepLR
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

from torchvision import datasets, transforms, models

import lib.pytorch_trainer as ptt

use_gpu = torch.cuda.is_available()
print('GPU available:', use_gpu)

GPU available: True


## O Dataset

### Lendo do disco

In [2]:
dd = np.load('/data/datasets/livros/livros_sequences_50.npz')
Xtra, ytra = dd['Xtra'], dd['ytra']
Xval, yval = dd['Xval'], dd['yval']
i2w = dd['i2w']

num_words = len(i2w)
seq_size = Xtra.shape[1]
n_labels = max(ytra) + 1
embedding_dim = 300

Xtra, ytra = Xtra.astype(np.int), ytra.astype(np.int)
Xval, yval = Xval.astype(np.int), yval.astype(np.int)

print('Vocabulary: {} words'.format(len(i2w)))
print('Training dataset:', Xtra.shape, ytra.shape)
print('Validation dataset:', Xval.shape, yval.shape)

Xtra.dtype, ytra.dtype


Vocabulary: 20000 words
Training dataset: (20553, 50) (20553,)
Validation dataset: (5139, 50) (5139,)


(dtype('int64'), dtype('int64'))

## A Rede Neural

### Preparando a matriz de embeddings

Neste notebook o embedding é treinado a partir de pesos aleatórios.


In [3]:
xx = Variable(torch.from_numpy(Xtra[:5].astype(int)))

emb = nn.Embedding(num_words, embedding_dim)
a = emb(xx)
a.data.shape, xx.data.shape

(torch.Size([5, 50, 300]), torch.Size([5, 50]))

### Construindo a rede

In [4]:

class MyNet(nn.Module):
    
    def __init__(self, seq_len=seq_size, voc_size=num_words, embed_dim=embedding_dim, 
                 n_conv_filters=128, conv_kernel_size=5):
        super().__init__()
        
        k = conv_kernel_size - 1
        n = (((seq_len - k) // 2 - k) // 2 - k) // 2
        self.flat_size = n * n_conv_filters
        
        self.emb = nn.Embedding(voc_size, embed_dim)
        nn.init.xavier_uniform(self.emb.weight)

        self.conv1 = nn.Conv1d(embed_dim, n_conv_filters, conv_kernel_size)
        self.conv2 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        self.conv3 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        
        self.fc1 = nn.Linear(self.flat_size, 128)
        self.fc2 = nn.Linear(128, 3)

    def forward(self, x):
        x = self.emb(x)
        
        x = x.transpose(1, 2)
        
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5)
    
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2)

        x = F.relu(self.conv3(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5)
        
        x = x.view(-1, self.flat_size)
        
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5)
        
        x = self.fc2(x)
        
        return x


## Treinando

In [5]:
trainIt = True
resetIt = True

# Callbacks
# ---------
accuracy_cb = ptt.AccuracyMetric()
chkpt_cb = ptt.ModelCheckpoint('../../models/livros_classif_50_1', reset=resetIt, verbose=1)
print_cb = ptt.PrintCallback()
plot_cb = ptt.PlotCallback()

# Model, optimizer and learning rate scheduler
# --------------------------------------------
model = MyNet()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=5, gamma=0.75)

# Network trainer
# ---------------
training_parameters = {
    'model':         model, 
    'criterion':     nn.CrossEntropyLoss(),
    'optimizer':     optimizer, 
    'lr_scheduler':  scheduler, 
    'callbacks':     [accuracy_cb, print_cb, chkpt_cb],
}
trainer = ptt.DeepNetTrainer(**training_parameters)


In [6]:
if trainIt:
    Xtrain = torch.from_numpy(Xtra)
    ytrain = torch.from_numpy(ytra)
    Xvalid = torch.from_numpy(Xval)
    yvalid = torch.from_numpy(yval)
    
    trainer.fit(10, Xtrain, ytrain, valid_data=(Xvalid, yvalid))
else:
    print('\nTraining disabled.\nThis model was trained for {} epochs.'.format(trainer.last_epoch))

Start training for 10 epochs
  1:  18.6s   T: 0.54930 0.72393   V: 0.17525 0.93423 best
  2:  18.1s   T: 0.08217 0.97095   V: 0.14175 0.95038 best
  3:  18.0s   T: 0.01389 0.99625   V: 0.28593 0.92314 
  4:  18.0s   T: 0.00559 0.99810   V: 0.23928 0.94571 
  5:  18.0s   T: 0.00134 0.99966   V: 0.29704 0.94337 
  6:  17.9s   T: 0.00029 0.99995   V: 0.36903 0.93948 
  7:  18.0s   T: 0.00005 1.00000   V: 0.31879 0.94863 
  8:  18.0s   T: 0.00000 1.00000   V: 0.33632 0.94688 
  9:  18.0s   T: 0.00000 1.00000   V: 0.34664 0.94785 
 10:  18.0s   T: 0.00000 1.00000   V: 0.36333 0.94843 
Stop training at epoch: 10/10


## Avaliando

In [7]:
rmetrics = trainer.evaluate(Xtrain, ytrain, metrics=[accuracy_cb])
print('Model training set accuracy after training: {:.5f}'.format(rmetrics['acc']))
print()
rmetrics = trainer.evaluate(Xvalid, yvalid, metrics=[accuracy_cb])
print('Model validation set accuracy after training: {:.5f}'.format(rmetrics['acc']))

evaluate: 2055/2055 ok
Model training set accuracy after training: 1.00000

evaluate: 513/513 ok
Model validation set accuracy after training: 0.94843
