# Classificação de Textos

## Preâmbulo

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os,sys
import numpy as np
import pandas as pd
import numpy.random as nr

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR, StepLR
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

from torchvision import datasets, transforms, models

import lib.pytorch_trainer as ptt

use_gpu = torch.cuda.is_available()
print('GPU available:', use_gpu)

GPU available: True


In [2]:
np.random.seed(20170601)

## O Dataset

### Lendo do disco

In [3]:
dd = np.load('/data/datasets/livros/livros_sequences_50.npz')
Xtra, ytra = dd['Xtra'], dd['ytra']
Xval, yval = dd['Xval'], dd['yval']
i2w = dd['i2w']

num_words = len(i2w)
seq_size = Xtra.shape[1]
n_labels = max(ytra) + 1
embedding_dim = 300

Xtra, ytra = Xtra.astype(np.int), ytra.astype(np.int)
Xval, yval = Xval.astype(np.int), yval.astype(np.int)

print('Vocabulary: {} words'.format(len(i2w)))
print('Training dataset:', Xtra.shape, ytra.shape)
print('Validation dataset:', Xval.shape, yval.shape)

Xtra.dtype, ytra.dtype


Vocabulary: 20000 words
Training dataset: (20553, 50) (20553,)
Validation dataset: (5139, 50) (5139,)


(dtype('int64'), dtype('int64'))

## A Rede Neural

### Carregando os vetores word2vec para português

In [4]:
w2v_model_fn = '/data/datasets/word2vec_pt_br.npz'

if not os.path.isfile(w2v_model_fn):
    wvec_words = '../../datasets/word2vec_portuguese.tsv'

    words = []
    vectors = []
    word_index = {}
    for line in open(wvec_pt):
        line = line.rstrip()
        if line[0] != ' ':
            i, w, vec0 = line.split(maxsplit=2)
            assert int(i) == len(words)
            assert vec0[0] == '[', vec0
            words.append(w)
            vv = [float(x) for x in vec0[1:].split()]
        elif line[-1] == ']':
            vv += [float(x) for x in line[:-1].split()]
            vectors.append(vv)
        else:
            vv += [float(x) for x in line.split()]
            
    words = np.array(words)
    vectors = np.array(vects)
    np.savez_compressed(w2v_model_fn, words=words, vectors=vectors)

else:
    dd = np.load(w2v_model_fn)
    words = dd['words']
    vectors = dd['vectors']

embeddings_index = {}
for i, w in enumerate(words):
    embeddings_index[w] = vectors[i]
    
embedding_dim = vectors.shape[1]

### Preparando a matriz de embeddings

Neste notebook o embedding é treinado a partir de pesos aleatórios.


In [5]:
# prepare embedding matrix
num_words = len(i2w)
embedding_matrix = np.zeros((num_words, embedding_dim))

notfound = []
for i in range(1, num_words):
    word = i2w[i]
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        notfound.append(word)

print('Embedding Matrix: {}.\nWords not found:  {}'.format(embedding_matrix.shape, len(notfound)))
print('\nW2V embedding mean: {:.5f}, variance: {:.5f}, sum: {:.5f}'
      .format(embedding_matrix.mean(), embedding_matrix.var(), embedding_matrix.sum()))

Embedding Matrix: (20000, 300).
Words not found:  6546

W2V embedding mean: 0.00524, variance: 0.54893, sum: 31429.54515


In [6]:
emb = nn.Embedding(num_words, embedding_dim)
print('Embedding (default init) mean: {:.5f}, variance: {:.5f}, sum: {:.5f}'
      .format(emb.weight.data.mean(), emb.weight.data.var(), emb.weight.data.sum()))
nn.init.xavier_uniform(emb.weight)
print('Embedding (xavier init)  mean: {:.5f}, variance: {:.5f}, sum: {:.5f}'
      .format(emb.weight.data.mean(), emb.weight.data.var(), emb.weight.data.sum()))

Embedding (default init) mean: -0.00029, variance: 0.99980, sum: -1765.43946
Embedding (xavier init)  mean: -0.00001, variance: 0.00010, sum: -30.16257


### Construindo a rede

In [7]:
class MyNet(nn.Module):
    def __init__(self, seq_len=seq_size, embedding_matrix=embedding_matrix, 
                 n_conv_filters=128, conv_kernel_size=5):
        super().__init__()
        
        voc_size, embed_dim = embedding_matrix.shape
        
        k = conv_kernel_size - 1
        n = (((seq_len - k) // 2 - k) // 2 - k) // 2
        self.flat_size = n * n_conv_filters
                
        self.emb = nn.Embedding(voc_size, embed_dim)
        
        dd = self.emb.state_dict()
        dd['weight'] = torch.from_numpy(embedding_matrix / 10000.0)
        self.emb.load_state_dict(dd)

        self.conv1 = nn.Conv1d(embed_dim, n_conv_filters, conv_kernel_size)
        self.conv2 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        self.conv3 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        
        self.fc1 = nn.Linear(self.flat_size, 128)
        self.fc2 = nn.Linear(128, 3)

    def forward(self, x):
        x = self.emb(x)
        x = x.transpose(1, 2)
        
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5)
    
        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2)

        x = F.relu(self.conv3(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5)
        
        x = x.view(-1, self.flat_size)
        
        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5)
        
        x = self.fc2(x)
        
        return x

## Treinando

In [8]:
trainIt = False
resetIt = False

# Callbacks
# ---------
accuracy_cb = ptt.AccuracyMetric()
chkpt_cb = ptt.ModelCheckpoint('../../models/livros_classif_w2v', reset=resetIt, verbose=1)
print_cb = ptt.PrintCallback()
plot_cb = ptt.PlotCallback()

# Model, optimizer and learning rate scheduler
# --------------------------------------------
model = MyNet()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=5, gamma=0.75)

# Network trainer
# ---------------
training_parameters = {
    'model':         model, 
    'criterion':     nn.CrossEntropyLoss(),
    'optimizer':     optimizer, 
    'lr_scheduler':  scheduler, 
    'callbacks':     [accuracy_cb, print_cb, chkpt_cb],
}
trainer = ptt.DeepNetTrainer(**training_parameters)

In [9]:
Xtrain = torch.from_numpy(Xtra)
ytrain = torch.from_numpy(ytra)
Xvalid = torch.from_numpy(Xval)
yvalid = torch.from_numpy(yval)

if trainIt:
    trainer.fit(5, Xtrain, ytrain, valid_data=(Xvalid, yvalid))
else:
    trainer.load_state('/data/models/livros_classif_w2v')
#     trainer.load_state('../../models/livros_classif_w2v')
    print('\nTraining disabled.\nThis model was trained for {} epochs.'.format(trainer.last_epoch))


Training disabled.
This model was trained for 2 epochs.


### Treinando em CPU (AWS c4x.2large, _compute optimized 8 cores_ ):
`
   1: 220.1s   T: 0.50395 0.76563   V: 0.18099 0.93267 best
...
`
### Treinando em GPU (GTX1080, 8GB):
`
Start training for 5 epochs
  1:   9.7s   T: 0.54306 0.74909   V: 0.27705 0.88908 best
  2:   9.2s   T: 0.10373 0.96477   V: 0.18397 0.93656 best
  3:   9.4s   T: 0.01779 0.99440   V: 0.27974 0.93306 
  4:   9.4s   T: 0.00610 0.99796   V: 0.29690 0.94221 
  5:   9.1s   T: 0.00138 0.99971   V: 0.28753 0.94668 
Stop training at epoch: 5/5
Best model was saved at epoch 2 with loss 0.18397: ../../models/livros_classif_w2v
`

## Avaliando

In [10]:
rmetrics = trainer.evaluate(Xtrain, ytrain, metrics=[accuracy_cb])
print('Model training set accuracy after training: {:.5f}'.format(rmetrics['acc']))
print()
rmetrics = trainer.evaluate(Xvalid, yvalid, metrics=[accuracy_cb])
print('Model validation set accuracy after training: {:.5f}'.format(rmetrics['acc']))

evaluate: 2055/2055 ok
Model training set accuracy after training: 0.99265

evaluate: 513/513 ok
Model validation set accuracy after training: 0.93150
