# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Classificação-de-Textos" data-toc-modified-id="Classificação-de-Textos-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Classificação de Textos</a></div><div class="lev2 toc-item"><a href="#Preâmbulo" data-toc-modified-id="Preâmbulo-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Preâmbulo</a></div><div class="lev2 toc-item"><a href="#O-Dataset" data-toc-modified-id="O-Dataset-12"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>O Dataset</a></div><div class="lev3 toc-item"><a href="#Lendo-do-disco" data-toc-modified-id="Lendo-do-disco-121"><span class="toc-item-num">1.2.1&nbsp;&nbsp;</span>Lendo do disco</a></div><div class="lev2 toc-item"><a href="#A-Rede-Neural" data-toc-modified-id="A-Rede-Neural-13"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>A Rede Neural</a></div><div class="lev3 toc-item"><a href="#Carregando-os-vetores-word2vec-para-português" data-toc-modified-id="Carregando-os-vetores-word2vec-para-português-131"><span class="toc-item-num">1.3.1&nbsp;&nbsp;</span>Carregando os vetores word2vec para português</a></div><div class="lev3 toc-item"><a href="#Preparando-a-matriz-de-embeddings" data-toc-modified-id="Preparando-a-matriz-de-embeddings-132"><span class="toc-item-num">1.3.2&nbsp;&nbsp;</span>Preparando a matriz de embeddings</a></div><div class="lev3 toc-item"><a href="#Construindo-a-rede" data-toc-modified-id="Construindo-a-rede-133"><span class="toc-item-num">1.3.3&nbsp;&nbsp;</span>Construindo a rede</a></div><div class="lev2 toc-item"><a href="#Treinando" data-toc-modified-id="Treinando-14"><span class="toc-item-num">1.4&nbsp;&nbsp;</span>Treinando</a></div><div class="lev2 toc-item"><a href="#Avaliando" data-toc-modified-id="Avaliando-15"><span class="toc-item-num">1.5&nbsp;&nbsp;</span>Avaliando</a></div>

# Classificação de Textos

## Preâmbulo

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import os,sys
import numpy as np
import pandas as pd
import numpy.random as nr

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import MultiStepLR, StepLR
from torch.utils.data import DataLoader, TensorDataset
from torch.autograd import Variable

from torchvision import datasets, transforms, models

import lib.pytorch_trainer as ptt

# Record whether a CUDA device is visible to PyTorch and report it.
use_gpu = torch.cuda.is_available()
print('GPU available:', use_gpu)

GPU available: True


## O Dataset

### Lendo do disco

In [2]:
# Load the pre-built sequence dataset: train/validation splits plus the
# index-to-word table (`i2w`).
dd = np.load('/data/datasets/livros/livros_sequences_50.npz')
Xtra, ytra = dd['Xtra'], dd['ytra']
Xval, yval = dd['Xval'], dd['yval']
i2w = dd['i2w']

num_words = len(i2w)
seq_size = Xtra.shape[1]
# Labels are assumed to be 0-based consecutive integers, so the number of
# classes is the largest label plus one. `ndarray.max()` is vectorised and
# `int()` yields a plain Python int instead of a NumPy scalar.
n_labels = int(ytra.max()) + 1
embedding_dim = 300

# `np.int` was removed in NumPy 1.24; use the explicit 64-bit type, which is
# also the dtype `torch.from_numpy` maps to LongTensor (needed for embedding
# indices and CrossEntropyLoss targets).
Xtra, ytra = Xtra.astype(np.int64), ytra.astype(np.int64)
Xval, yval = Xval.astype(np.int64), yval.astype(np.int64)

print('Vocabulary: {} words'.format(len(i2w)))
print('Training dataset:', Xtra.shape, ytra.shape)
print('Validation dataset:', Xval.shape, yval.shape)

Xtra.dtype, ytra.dtype


Vocabulary: 20000 words
Training dataset: (20553, 50) (20553,)
Validation dataset: (5139, 50) (5139,)


(dtype('int64'), dtype('int64'))

## A Rede Neural

### Carregando os vetores word2vec para português

In [3]:
# Cached, compressed copy of the Portuguese word2vec vectors.
w2v_model_fn = '/data/datasets/word2vec_pt_br.npz'

if not os.path.isfile(w2v_model_fn):
    # No cache yet: parse the raw text dump and write the cache.
    wvec_words = '../../datasets/word2vec_portuguese.tsv'

    words = []
    vectors = []
    # Format handled below: a record starts on a line that does not begin
    # with a space ("<index> <word> [v0 v1 ..."), continues on lines that do
    # begin with a space, and ends on a continuation line closing with "]".
    with open(wvec_words) as wvec_file:
        for line in wvec_file:
            line = line.rstrip()
            if not line:
                # Blank line: nothing to parse (and line[0] would fail).
                continue
            if line[0] != ' ':
                i, w, vec0 = line.split(maxsplit=2)
                # Records must be numbered consecutively from zero.
                assert int(i) == len(words)
                assert vec0[0] == '[', vec0
                words.append(w)
                vv = [float(x) for x in vec0[1:].split()]
            elif line[-1] == ']':
                vv += [float(x) for x in line[:-1].split()]
                vectors.append(vv)
            else:
                vv += [float(x) for x in line.split()]

    words = np.array(words)
    vectors = np.array(vectors)
    np.savez_compressed(w2v_model_fn, words=words, vectors=vectors)

else:
    dd = np.load(w2v_model_fn)
    words = dd['words']
    vectors = dd['vectors']

# Map each word to its vector (a row of `vectors`).
embeddings_index = {w: vectors[i] for i, w in enumerate(words)}

# The embedding dimension is dictated by the loaded vectors.
embedding_dim = vectors.shape[1]

### Preparando a matriz de embeddings

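Neste notebook o embedding é inicializado com os vetores word2vec pré-treinados (divididos por 10000 na construção da rede; palavras ausentes do word2vec começam com vetor nulo) e é ajustado durante o treinamento.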


In [4]:
nr.seed(20170601)

# Build the (vocabulary x embedding_dim) matrix: one row per vocabulary index,
# filled with the word2vec vector when the word is known, left at zero otherwise.
num_words = len(i2w)
embedding_matrix = np.zeros((num_words, embedding_dim))

# Index 0 is skipped and keeps its all-zero row
# (presumably the padding index -- TODO confirm).
notfound = []
for row, token in enumerate(i2w[1:num_words], start=1):
    vec = embeddings_index.get(token)
    if vec is None:
        notfound.append(token)
        continue
    embedding_matrix[row] = vec

shape_report = 'Embedding Matrix: {}.\nWords not found:  {}'
print(shape_report.format(embedding_matrix.shape, len(notfound)))

stats_report = '\nW2V embedding mean: {:.5f}, variance: {:.5f}, sum: {:.5f}'
print(stats_report.format(embedding_matrix.mean(),
                          embedding_matrix.var(),
                          embedding_matrix.sum()))

Embedding Matrix: (20000, 300).
Words not found:  6546

W2V embedding mean: 0.00524, variance: 0.54893, sum: 31429.54515


In [5]:
# Compare the statistics of a freshly created embedding layer with the same
# layer after Xavier/Glorot uniform re-initialisation.
emb = nn.Embedding(num_words, embedding_dim)
print('Embedding (default init) mean: {:.5f}, variance: {:.5f}, sum: {:.5f}'
      .format(emb.weight.data.mean(), emb.weight.data.var(), emb.weight.data.sum()))
# `xavier_uniform_` is the in-place initialiser; the un-suffixed
# `xavier_uniform` is a deprecated alias.
nn.init.xavier_uniform_(emb.weight)
print('Embedding (xavier init)  mean: {:.5f}, variance: {:.5f}, sum: {:.5f}'
      .format(emb.weight.data.mean(), emb.weight.data.var(), emb.weight.data.sum()))

Embedding (default init) mean: 0.00002, variance: 1.00010, sum: 92.72173
Embedding (xavier init)  mean: -0.00000, variance: 0.00010, sum: -12.06245


### Construindo a rede

In [6]:

class MyNet(nn.Module):
    """1-D convolutional text classifier on top of a word-embedding layer.

    Architecture: embedding -> 3 x (Conv1d + ReLU + max-pool by 2) -> flatten
    -> Linear(128) + ReLU -> Linear(n_classes). Dropout (p=0.5) is applied
    after the first and third pooling stages and after the first dense layer.

    Args:
        seq_len: length of the input token sequences.
        embedding_matrix: NumPy array of shape (vocabulary size, embedding
            dim) used to initialise the embedding weights (scaled by 1/10000).
        n_conv_filters: number of output channels of every convolution.
        conv_kernel_size: kernel size of every convolution.
        n_classes: size of the output layer (number of classes).

    Raises:
        ValueError: if `seq_len` is too short to survive the three
            convolution/pooling stages.
    """

    def __init__(self, seq_len=seq_size, embedding_matrix=embedding_matrix,
                 n_conv_filters=128, conv_kernel_size=5, n_classes=3):
        super().__init__()

        voc_size, embed_dim = embedding_matrix.shape

        # Each un-padded convolution shortens the sequence by (kernel - 1)
        # and each max-pool of size 2 halves it (floor division).
        k = conv_kernel_size - 1
        n = (((seq_len - k) // 2 - k) // 2 - k) // 2
        if n < 1:
            raise ValueError(
                'seq_len={} is too short for three conv/pool stages with '
                'kernel size {}'.format(seq_len, conv_kernel_size))
        self.flat_size = n * n_conv_filters

        self.emb = nn.Embedding(voc_size, embed_dim)

        # Initialise the embedding from the supplied matrix, scaled down by 1e4.
        dd = self.emb.state_dict()
        dd['weight'] = torch.from_numpy(embedding_matrix / 10000.0)
        self.emb.load_state_dict(dd)

        self.conv1 = nn.Conv1d(embed_dim, n_conv_filters, conv_kernel_size)
        self.conv2 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)
        self.conv3 = nn.Conv1d(n_conv_filters, n_conv_filters, conv_kernel_size)

        self.fc1 = nn.Linear(self.flat_size, 128)
        self.fc2 = nn.Linear(128, n_classes)

    def forward(self, x):
        """Return unnormalised class scores for a batch of index sequences.

        `x` is assumed to be an integer tensor of shape (batch, seq_len)
        -- TODO confirm against the trainer's batching.
        """
        x = self.emb(x)
        # Conv1d expects channels before length: swap the last two axes.
        x = x.transpose(1, 2)

        # F.dropout defaults to training=True; tie it to the module mode so
        # that dropout is disabled under model.eval().
        x = F.relu(self.conv1(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5, training=self.training)

        x = F.relu(self.conv2(x))
        x = F.max_pool1d(x, 2)

        x = F.relu(self.conv3(x))
        x = F.max_pool1d(x, 2)
        x = F.dropout(x, 0.5, training=self.training)

        x = x.view(-1, self.flat_size)

        x = F.relu(self.fc1(x))
        x = F.dropout(x, 0.5, training=self.training)

        x = self.fc2(x)

        return x


## Treinando

In [7]:
# Run switches:
#   trainIt -- when True the training cell calls trainer.fit(); otherwise it
#              only prints a message.
#   resetIt -- forwarded as `reset` to the checkpoint callback below.
trainIt = True
resetIt = False

# Callbacks
# ---------
# NOTE(review): chkpt_cb and plot_cb are created here but are not included in
# the 'callbacks' list handed to the trainer below -- confirm this is intended.
accuracy_cb = ptt.AccuracyMetric()
chkpt_cb = ptt.ModelCheckpoint('../../models/livros_classif_50_1', reset=resetIt, verbose=1)
print_cb = ptt.PrintCallback()
plot_cb = ptt.PlotCallback()

# Model, optimizer and learning rate scheduler
# --------------------------------------------
# MyNet() is built with its default arguments. StepLR multiplies the learning
# rate by `gamma` once every `step_size` scheduler steps.
model = MyNet()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
scheduler = StepLR(optimizer, step_size=5, gamma=0.75)

# Network trainer
# ---------------
# All components are collected in a dict and passed as keyword arguments.
training_parameters = {
    'model':         model, 
    'criterion':     nn.CrossEntropyLoss(),
    'optimizer':     optimizer, 
    'lr_scheduler':  scheduler, 
    'callbacks':     [accuracy_cb, print_cb],
}
trainer = ptt.DeepNetTrainer(**training_parameters)


In [8]:
# Wrap the NumPy arrays as tensors unconditionally: the evaluation cell uses
# these names even when training is disabled.
Xtrain = torch.from_numpy(Xtra)
ytrain = torch.from_numpy(ytra)
Xvalid = torch.from_numpy(Xval)
yvalid = torch.from_numpy(yval)

if trainIt:
    # Train for 10 epochs, validating on the held-out split.
    trainer.fit(10, Xtrain, ytrain, valid_data=(Xvalid, yvalid))
else:
    print('\nTraining disabled.\nThis model was trained for {} epochs.'.format(trainer.last_epoch))

Start training for 10 epochs
  1:  18.7s   T: 0.51409 0.76072   V: 0.19539 0.92878 best
  2:  18.1s   T: 0.08552 0.96954   V: 0.15972 0.94435 best
  3:  18.1s   T: 0.01638 0.99484   V: 0.20268 0.94162 
  4:  18.1s   T: 0.00338 0.99912   V: 0.24776 0.94415 
  5:  18.1s   T: 0.00203 0.99932   V: 0.30243 0.94318 
  6:  18.1s   T: 0.00104 0.99961   V: 0.29067 0.94435 
  7:  18.1s   T: 0.00076 0.99976   V: 0.35808 0.93987 
  8:  18.1s   T: 0.00033 0.99981   V: 0.32042 0.94415 
  9:  18.1s   T: 0.00001 1.00000   V: 0.31239 0.94785 
 10:  18.1s   T: 0.00000 1.00000   V: 0.31742 0.94766 
Stop training at epoch: 10/10


## Avaliando

In [9]:
# Report accuracy on the training split and then on the validation split,
# with a blank line between the two reports.
eval_runs = (
    (Xtrain, ytrain, 'Model training set accuracy after training: {:.5f}'),
    (Xvalid, yvalid, 'Model validation set accuracy after training: {:.5f}'),
)
for run_no, (inputs, targets, report) in enumerate(eval_runs):
    if run_no > 0:
        print()
    rmetrics = trainer.evaluate(inputs, targets, metrics=[accuracy_cb])
    print(report.format(rmetrics['acc']))

evaluate: 2055/2055 ok
Model training set accuracy after training: 1.00000

evaluate: 513/513 ok
Model validation set accuracy after training: 0.94766
