# Classificação de Textos

## Preâmbulo

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os,sys
import numpy as np
import pandas as pd
import numpy.random as nr

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR, StepLR
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

from torchvision import datasets, transforms, models

import lib.pytorch_trainer as ptt

# Record whether PyTorch can see a CUDA device and report it.
use_gpu = torch.cuda.is_available()
print('GPU available:', use_gpu)

GPU available: False


## O Dataset

### Lendo do disco

In [2]:
# Load the pre-tokenised dataset: integer word-index sequences, their labels,
# and the index-to-word vocabulary table.
dd = np.load('/data/datasets/livros/livros_sequences_50.npz')
Xtra, ytra = dd['Xtra'], dd['ytra']
Xval, yval = dd['Xval'], dd['yval']
i2w = dd['i2w']

# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; use the explicit
# 64-bit integer type instead.
Xtra, ytra = Xtra.astype(np.int64), ytra.astype(np.int64)
Xval, yval = Xval.astype(np.int64), yval.astype(np.int64)

num_words = len(i2w)
seq_size = Xtra.shape[1]
# Vectorised max instead of the Python builtin (which iterates element by
# element); int() turns the NumPy scalar into a plain Python int.
# Assumes labels are encoded as 0..K-1.
n_labels = int(ytra.max()) + 1
embedding_dim = 300

print('Vocabulary: {} words'.format(len(i2w)))
print('Training dataset:', Xtra.shape, ytra.shape)
print('Validation dataset:', Xval.shape, yval.shape)

Xtra.dtype, ytra.dtype

Vocabulary: 20000 words
Training dataset: (13764, 50) (13764,)
Validation dataset: (3442, 50) (3442,)


(dtype('int64'), dtype('int64'))

## A Rede Neural

### Preparando a matriz de embeddings

Neste notebook o embedding é treinado a partir de pesos aleatórios.


In [3]:
# Push the first five training sequences through a freshly initialised
# embedding layer to check the resulting shapes.
probe_ids = Xtra[:5].astype(int)
xx = Variable(torch.from_numpy(probe_ids))

emb = nn.Embedding(num_words, embedding_dim)
a = emb(xx)

# Displayed as (embedded shape, input shape).
a.data.shape, xx.data.shape

(torch.Size([5, 50, 300]), torch.Size([5, 50]))

### Construindo a rede

In [4]:
class MyNet(nn.Module):
    """1-D convolutional text classifier over a trainable word embedding.

    Pipeline: embedding -> 3 x (Conv1d + ReLU + max-pool by 2) -> flatten ->
    fully connected (128) -> fully connected (n_classes). Dropout (p=0.5) is
    applied after the first and third pooling stages and after the first
    fully connected layer. The network returns raw logits (no softmax).

    Args:
        seq_len: length of every input sequence (number of tokens).
        voc_size: number of rows in the embedding table.
        embed_dim: size of each embedding vector.
        n_conv_filters: number of output channels of every convolution.
        conv_kernel_size: kernel width of every convolution.
        n_classes: number of output logits (defaults to 3, the previously
            hard-coded value).

    Note:
        The defaults for ``seq_len``, ``voc_size`` and ``embed_dim`` are read
        from notebook globals when the class is defined, not when it is
        instantiated.

    Raises:
        ValueError: if ``seq_len`` is too short to survive the three
            convolution/pooling stages.
    """

    def __init__(self, seq_len=seq_size, voc_size=num_words, embed_dim=embedding_dim,
                 n_conv_filters=128, conv_kernel_size=5, n_classes=3):
        super().__init__()

        # Each un-padded convolution shortens the sequence by (kernel - 1);
        # each max-pool with window 2 halves it (floor division).
        k = conv_kernel_size - 1
        n = (((seq_len - k) // 2 - k) // 2 - k) // 2
        if n < 1:
            raise ValueError(
                'seq_len={} is too short for three conv(kernel={})/pool(2) stages'
                .format(seq_len, conv_kernel_size))
        self.flat_size = n * n_conv_filters

        self.emb = nn.Embedding(voc_size, embed_dim)
        # In-place initialiser; the un-suffixed `xavier_uniform` is deprecated.
        nn.init.xavier_uniform_(self.emb.weight)

        self.conv1 = nn.Conv1d(embed_dim, n_conv_filters, conv_kernel_size)
        self.conv2 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        self.conv3 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)

        self.fc1 = nn.Linear(self.flat_size, 128)
        self.fc2 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Map a batch of word-index sequences to class logits.

        Args:
            x: integer tensor of word indices, one row per sequence.

        Returns:
            Tensor of logits with ``n_classes`` columns.
        """
        x = self.emb(x)

        # Conv1d expects channels on dim 1, so swap the sequence and
        # embedding axes.
        x = x.transpose(1, 2)

        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, 2)
        # The functional dropout does not track train()/eval() on its own;
        # pass the module flag so dropout is disabled during evaluation.
        x = F.dropout(x, 0.5, training=self.training)

        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2)

        x = F.relu(self.conv3(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5, training=self.training)

        x = x.view(-1, self.flat_size)

        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5, training=self.training)

        x = self.fc2(x)

        return x

## Treinando

In [5]:
# Switches: run training in this session / discard any saved checkpoint.
trainIt = False
resetIt = False

# Callbacks. `plot_cb` is created here but is not part of the list handed to
# the trainer below.
accuracy_cb = ptt.AccuracyMetric()
chkpt_cb = ptt.ModelCheckpoint('../../models/livros_classif_50_1', reset=resetIt, verbose=1)
print_cb = ptt.PrintCallback()
plot_cb = ptt.PlotCallback()

# Network, Adam optimiser, and a step scheduler that multiplies the learning
# rate by 0.75 every 5 scheduler steps.
model = MyNet()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=5, gamma=0.75)

# Gather everything the trainer needs and build it.
training_parameters = dict(
    model=model,
    criterion=nn.CrossEntropyLoss(),
    optimizer=optimizer,
    lr_scheduler=scheduler,
    callbacks=[accuracy_cb, print_cb, chkpt_cb],
)
trainer = ptt.DeepNetTrainer(**training_parameters)

In [6]:
# Wrap the NumPy arrays as torch tensors for the trainer.
Xtrain, ytrain = torch.from_numpy(Xtra), torch.from_numpy(ytra)
Xvalid, yvalid = torch.from_numpy(Xval), torch.from_numpy(yval)

if not trainIt:
    # NOTE(review): this path differs from the one given to ModelCheckpoint
    # ('../../models/livros_classif_50_1') - confirm both point to the same file.
    trainer.load_state('/data/models/livros_classif_50_1')
    print('\nTraining disabled.\nThis model was trained for {} epochs.'.format(trainer.last_epoch))
else:
    trainer.fit(10, Xtrain, ytrain, valid_data=(Xvalid, yvalid))


Training disabled.
This model was trained for 2 epochs.


## Treinamento em GPU
```
Start training for 10 epochs
  1:   9.7s   T: 0.55538 0.72087   V: 0.20912 0.92392 best
  2:   9.1s   T: 0.09597 0.96584   V: 0.15606 0.94668 best
  3:   9.1s   T: 0.01565 0.99557   V: 0.23586 0.93851 
  4:   9.2s   T: 0.00339 0.99917   V: 0.25678 0.94376 
  5:   9.1s   T: 0.00111 0.99976   V: 0.30262 0.94376 
  6:   9.1s   T: 0.00040 0.99995   V: 0.31673 0.94221 
  7:   9.3s   T: 0.00002 1.00000   V: 0.33130 0.94279 
  8:   9.2s   T: 0.00000 1.00000   V: 0.34154 0.94279 
  9:   9.2s   T: 0.00000 1.00000   V: 0.35736 0.94318 
 10:   9.3s   T: 0.00000 1.00000   V: 0.37403 0.94221 
Stop training at epoch: 10/10
Best model was saved at epoch 2 with loss 0.15606: ../../models/livros_classif_50_1
```

## Avaliando

In [7]:
# Score the model on both splits with the accuracy metric. A blank line
# separates the two reports.
eval_runs = [
    (Xtrain, ytrain, 'Model training set accuracy after training: {:.5f}'),
    (Xvalid, yvalid, 'Model validation set accuracy after training: {:.5f}'),
]
for run_idx, (inputs, targets, template) in enumerate(eval_runs):
    if run_idx > 0:
        print()
    rmetrics = trainer.evaluate(inputs, targets, metrics=[accuracy_cb])
    print(template.format(rmetrics['acc']))

evaluate: 1376/1376 ok
Model training set accuracy after training: 0.28749

evaluate: 344/344 ok
Model validation set accuracy after training: 0.29692
