# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Classificação-de-Textos" data-toc-modified-id="Classificação-de-Textos-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Classificação de Textos</a></div><div class="lev2 toc-item"><a href="#Preâmbulo" data-toc-modified-id="Preâmbulo-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Preâmbulo</a></div><div class="lev2 toc-item"><a href="#O-Dataset" data-toc-modified-id="O-Dataset-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>O Dataset</a></div><div class="lev3 toc-item"><a href="#Lendo-do-disco" data-toc-modified-id="Lendo-do-disco-121"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Lendo do disco</a></div><div class="lev2 toc-item"><a href="#A-Rede-Neural" data-toc-modified-id="A-Rede-Neural-13"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>A Rede Neural</a></div><div class="lev3 toc-item"><a href="#Preparando-a-matriz-de-embeddings" data-toc-modified-id="Preparando-a-matriz-de-embeddings-131"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Preparando a matriz de embeddings</a></div><div class="lev3 toc-item"><a href="#Construindo-a-rede" data-toc-modified-id="Construindo-a-rede-132"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Construindo a rede</a></div><div class="lev2 toc-item"><a href="#Treinando" data-toc-modified-id="Treinando-14"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Treinando</a></div><div class="lev2 toc-item"><a href="#Avaliando" data-toc-modified-id="Avaliando-15"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Avaliando</a></div>

# Classificação de Textos

## Preâmbulo

In [15]:
%matplotlib inline
import matplotlib.pyplot as plt
import os,sys
import numpy as np
import pandas as pd
import numpy.random as nr

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR, StepLR
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

from torchvision import datasets, transforms, models

import lib.pytorch_trainer as ptt

use_gpu = torch.cuda.is_available()
print('GPU available:', use_gpu)

GPU available: True


## O Dataset

### Lendo do disco

In [16]:
dd = np.load('/data/datasets/livros/livros_sequences_50.npz')
Xtra, ytra = dd['Xtra'], dd['ytra']
Xval, yval = dd['Xval'], dd['yval']
i2w = dd['i2w']

num_words = len(i2w)
seq_size = Xtra.shape[1]
n_labels = max(ytra) + 1
embedding_dim = 300

Xtra, ytra = Xtra.astype(np.int), ytra.astype(np.int)
Xval, yval = Xval.astype(np.int), yval.astype(np.int)

print('Vocabulary: {} words'.format(len(i2w)))
print('Training dataset:', Xtra.shape, ytra.shape)
print('Validation dataset:', Xval.shape, yval.shape)

Xtra.dtype, ytra.dtype


Vocabulary: 20000 words
Training dataset: (20553, 50) (20553,)
Validation dataset: (5139, 50) (5139,)


(dtype('int64'), dtype('int64'))

## A Rede Neural

### Preparando a matriz de embeddings

Neste notebook o embedding é treinado a partir de pesos aleatórios.


In [17]:
xx = Variable(torch.from_numpy(Xtra[:5].astype(int)))

emb = nn.Embedding(num_words, embedding_dim)
a = emb(xx)
a.data.shape, xx.data.shape

(torch.Size([5, 50, 300]), torch.Size([5, 50]))

### Construindo a rede

In [18]:

class MyNet(nn.Module):
    
    def __init__(self, seq_len=seq_size, voc_size=num_words, embed_dim=embedding_dim, 
                 n_conv_filters=128, conv_kernel_size=5):
        super().__init__()
        
        k = conv_kernel_size - 1
        n = (((seq_len - k) // 2 - k) // 2 - k) // 2
        self.flat_size = n * n_conv_filters
        
        self.emb = nn.Embedding(voc_size, embed_dim)
        nn.init.xavier_uniform(self.emb.weight)

        self.conv1 = nn.Conv1d(embed_dim, n_conv_filters, conv_kernel_size)
        self.conv2 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        self.conv3 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        
        self.fc1 = nn.Linear(self.flat_size, 128)
        self.fc2 = nn.Linear(128, 3)

    def forward(self, x):
        x = self.emb(x)
        
        x = x.transpose(1, 2)
        
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5)
    
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2)

        x = F.relu(self.conv3(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5)
        
        x = x.view(-1, self.flat_size)
        
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5)
        
        x = self.fc2(x)
        
        return x


## Treinando

In [19]:
trainIt = True
resetIt = True

# Callbacks
# ---------
accuracy_cb = ptt.AccuracyMetric()
chkpt_cb = ptt.ModelCheckpoint('../../models/livros_classif_50_1', reset=resetIt, verbose=1)
print_cb = ptt.PrintCallback()
plot_cb = ptt.PlotCallback()

# Model, optimizer and learning rate scheduler
# --------------------------------------------
model = MyNet()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
scheduler = StepLR(optimizer, step_size=5, gamma=0.75)

# Network trainer
# ---------------
training_parameters = {
    'model':         model, 
    'criterion':     nn.CrossEntropyLoss(),
    'optimizer':     optimizer, 
    'lr_scheduler':  scheduler, 
    'callbacks':     [accuracy_cb, print_cb],
}
trainer = ptt.DeepNetTrainer(**training_parameters)


In [20]:
if trainIt:
    Xtrain = torch.from_numpy(Xtra)
    ytrain = torch.from_numpy(ytra)
    Xvalid = torch.from_numpy(Xval)
    yvalid = torch.from_numpy(yval)
    
    trainer.fit(10, Xtrain, ytrain, valid_data=(Xvalid, yvalid))
else:
    print('\nTraining disabled.\nThis model was trained for {} epochs.'.format(trainer.last_epoch))

Start training for 10 epochs
  1:  17.9s   T: 0.36057 0.82966   V: 0.14837 0.94882 best
  2:  17.9s   T: 0.06131 0.98010   V: 0.15311 0.94746 
  3:  17.9s   T: 0.02229 0.99275   V: 0.18231 0.95077 
  4:  18.0s   T: 0.01449 0.99557   V: 0.20554 0.94785 
Stop training at epoch: 4/10


## Avaliando

In [21]:
rmetrics = trainer.evaluate(Xtrain, ytrain, metrics=[accuracy_cb])
print('Model training set accuracy after training: {:.5f}'.format(rmetrics['acc']))
print()
rmetrics = trainer.evaluate(Xvalid, yvalid, metrics=[accuracy_cb])
print('Model validation set accuracy after training: {:.5f}'.format(rmetrics['acc']))

evaluate: 2055/2055 ok
Model training set accuracy after training: 0.99504

evaluate: 513/513 ok
Model validation set accuracy after training: 0.93676
