In [1]:
%matplotlib inline

### Atenção: Rode esta linha apenas se estiver usando o Google Colab

In [None]:
# Colab-only setup: installs the PyTorch 0.4.1 wheel matching this interpreter
# and the available accelerator. See http://pytorch.org/
from os.path import exists
# NOTE(review): `wheel.pep425tags` may not exist in recent `wheel` releases —
# confirm before running this cell on an up-to-date environment.
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
# Wheel platform tag built from the interpreter implementation, version and ABI.
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())
# Look up the installed cudart library and rewrite its version suffix as `cuXY`.
cuda_output = !ldconfig -p|grep cudart.so|sed -e 's/.*\.\([0-9]*\)\.\([0-9]*\)$/cu\1\2/'
# Fall back to the CPU wheel when no NVIDIA device node is present.
accelerator = cuda_output[0] if exists('/dev/nvidia0') else 'cpu'

# The wheel URL is assembled from the accelerator and platform tags computed above.
!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.4.1-{platform}-linux_x86_64.whl torchvision
import torch

In [3]:
import torch
from torch import nn
from torch import optim
import torchvision
from matplotlib import pyplot as plt
from torchvision import transforms
from torchvision import datasets
import numpy as np
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/panda/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
# Human-readable category names, indexed by the integer label of an example
# (plot_instance uses this list to turn a label number into a string).
classes = [
'World',
'Sports',
'Business',
'Sci/Tech',
]

### O código das células abaixo contém funções para efetuar a carga dos dados, o treinamento e o teste dos modelos

In [5]:
# Download the dataset archive and extract it.
# The download is skipped when `agnews.zip` is already in the working directory.
import os 
if not os.path.exists('agnews.zip'):
    !wget https://s3-us-west-2.amazonaws.com/wehrmann/agnews.zip
    !unzip agnews.zip

--2019-01-30 21:11:32--  https://s3-us-west-2.amazonaws.com/wehrmann/agnews.zip
Resolving s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)... 52.218.205.8
Connecting to s3-us-west-2.amazonaws.com (s3-us-west-2.amazonaws.com)|52.218.205.8|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13701321 (13M) [application/zip]
Saving to: ‘agnews.zip’


2019-01-30 21:11:37 (3,62 MB/s) - ‘agnews.zip’ saved [13701321/13701321]

Archive:  agnews.zip
   creating: agnews/
  inflating: agnews/vocab.json       
  inflating: agnews/train.csv        
  inflating: agnews/test.csv         


In [6]:
import json
class Vocabulary(object):
    """Two-way mapping between words and integer ids.

    Attributes:
        word2idx: dict mapping each known word to its id.
        idx2word: dict mapping each id back to its word.
        idx: id that the next newly added word will receive.
    """

    def __init__(self):
        self.word2idx, self.idx2word = {}, {}
        self.idx = 0

    def add_word(self, word):
        """Register ``word`` under the next free id; known words are left untouched."""
        if word in self.word2idx:
            return
        new_id = self.idx
        self.word2idx[word] = new_id
        self.idx2word[new_id] = word
        self.idx = new_id + 1

    def __call__(self, word):
        """Return the id of ``word``, or the id of ``'<unk>'`` for unknown words.

        Raises:
            KeyError: if ``word`` is unknown and ``'<unk>'`` was never added.
        """
        key = word if word in self.word2idx else '<unk>'
        return self.word2idx[key]

    def __len__(self):
        """Number of distinct words stored in the vocabulary."""
        return len(self.word2idx)


def load_vocab(src):
    """Load a :class:`Vocabulary` from a JSON file.

    The JSON document must provide the keys ``'word2idx'``, ``'idx2word'``
    and ``'idx'``.

    JSON object keys are always strings, so an ``idx2word`` mapping that was
    keyed by integers when it was saved comes back keyed by ``'0'``, ``'1'``,
    ... . The keys are converted back to ``int`` here so that the loaded
    vocabulary is keyed the same way as one built with ``Vocabulary.add_word``.

    Args:
        src: path of the JSON vocabulary file.

    Returns:
        The populated ``Vocabulary`` instance.
    """
    with open(src) as f:
        d = json.load(f)

    idx2word = d['idx2word']
    if isinstance(idx2word, dict):
        # Undo the int -> str key coercion performed by JSON serialization.
        idx2word = {int(idx): word for idx, word in idx2word.items()}

    vocab = Vocabulary()
    vocab.word2idx = d['word2idx']
    vocab.idx2word = idx2word
    vocab.idx = d['idx']
    return vocab

In [7]:
def load_txt(txt):
    """Read a text file and return its lines as a list of strings.

    Whitespace around the whole file content is stripped before splitting on
    ``'\\n'``, so trailing newlines do not produce empty entries. An empty file
    yields ``['']``.

    The file is read inside a ``with`` block so the handle is closed
    deterministically instead of being left to the garbage collector.

    Args:
        txt: path of the file to read.

    Returns:
        list of str, one entry per line.
    """
    with open(txt) as f:
        return f.read().strip().split('\n')


def get_xy(raw_data):
    """Split raw ``label<TAB>text`` lines into parallel label and text lists.

    Args:
        raw_data: iterable of strings, each holding an integer label, a tab
            character and the text.

    Returns:
        ``(labels, texts)``: a list of ``int`` labels and a list of ``str``
        texts, in the same order as ``raw_data``.

    Raises:
        ValueError: if a line has no tab or its label is not an integer.
    """
    labels = []
    texts = []
    for line in raw_data:
        # Split on the first tab only, so tabs inside the text are preserved.
        y, x = line.split('\t', 1)
        # Built-in int: the `np.int` alias was removed from recent NumPy releases.
        labels.append(int(y))
        texts.append(x)
    return labels, texts


def tokenize_text(text, vocab, to_tensor=True):
    """Turn a raw text into a sequence of vocabulary ids.

    The text is lower-cased, tokenized with ``nltk.tokenize.word_tokenize``
    and every token is looked up by calling ``vocab``. The id of ``'<start>'``
    is prepended and the id of ``'<end>'`` is appended.

    Args:
        text: the raw text (converted with ``str`` before processing).
        vocab: callable mapping a token to its integer id.
        to_tensor: when true, return a ``torch.Tensor`` built from the ids;
            otherwise return the plain list of ids.

    Returns:
        The id sequence, as a tensor or as a list depending on ``to_tensor``.
    """
    words = nltk.tokenize.word_tokenize(str(text).lower())
    ids = [vocab('<start>')] + [vocab(word) for word in words] + [vocab('<end>')]
    return torch.Tensor(ids) if to_tensor else ids

In [8]:
class CSVDataset(torch.utils.data.Dataset):
    """Text-classification dataset read from ``<data_path>/<data_split>.csv``.

    The vocabulary is loaded from ``<data_path>/vocab.json``. Every item is a
    tuple ``(tokens, label, index, text)`` where ``tokens`` is the output of
    ``tokenize_text`` for the raw ``text``.

    Attributes:
        vocab: vocabulary returned by ``load_vocab``.
        raw_data: the raw lines of the split file.
        split: name of the split that was loaded.
        labels / texts: parallel lists produced by ``get_xy``.
        nb_classes: largest label value plus one.
    """

    def __init__(self, data_path, data_split,):
        split_file = os.path.join(data_path, '{}.csv'.format(data_split))
        self.vocab = load_vocab(os.path.join(data_path, 'vocab.json'))
        self.raw_data = load_txt(split_file)
        self.split = data_split

        self.labels, self.texts = get_xy(self.raw_data)
        assert len(self.labels) == len(self.texts)
        self.nb_classes = np.max(self.labels) + 1

    def __getitem__(self, index):
        """Return ``(tokens, label, index, raw_text)`` for the example at ``index``."""
        text, label = self.texts[index], self.labels[index]
        return tokenize_text(text, vocab=self.vocab), label, index, text

    def __len__(self):
        """Number of examples in the split."""
        return len(self.labels)

In [9]:
def collate_fn(data):
    """Assemble a list of dataset items into one padded batch.

    Args:
        data: list of ``(tokens, label, index, raw_text)`` tuples. The list is
            sorted in place, longest token sequence first.

    Returns:
        ``(captions, labels, lengths)``: the zero-padded 2D token tensor and
        the sequence lengths produced by ``pad_default``, and the labels as a
        1D ``LongTensor``. Indices and raw texts are dropped.
    """
    # Longest sequence first; the sort mutates the incoming list.
    data.sort(key=lambda item: len(item[0]), reverse=True)
    token_seqs, label_seq, _ids, _raw = zip(*data)
    labels = torch.Tensor(label_seq).long()
    # Merge the tuple of 1D tensors into a single padded 2D tensor.
    padded, lengths = pad_default(token_seqs)
    return padded, labels, lengths


def pad_default(captions):
    """Right-pad variable-length sequences with zeros into one ``LongTensor``.

    Args:
        captions: sequence of 1D id sequences.

    Returns:
        ``(targets, lengths)``: ``targets`` is a ``LongTensor`` of shape
        ``(len(captions), max_length)`` whose rows hold the sequences followed
        by zeros; ``lengths`` is a NumPy array with each original length.
    """
    lengths = np.array([len(seq) for seq in captions])
    targets = torch.zeros(len(captions), max(lengths)).long()

    for row, (seq, size) in enumerate(zip(captions, lengths)):
        targets[row, :size] = seq[:size]

    return targets, lengths

In [10]:
def get_loaders(
        data_path,
        batch_size,
        workers=2,
        splits=('train', 'val', 'test'),
    ):
    """Build one ``DataLoader`` per requested split.

    Args:
        data_path: directory holding ``vocab.json`` and one ``<split>.csv``
            file per split.
        batch_size: number of examples per batch.
        workers: number of worker processes used by each loader. This value is
            now forwarded to ``DataLoader`` (it used to be accepted but
            ignored). Pass ``workers=0`` to load data in the main process,
            e.g. where worker processes cannot import notebook-defined classes.
        splits: names of the splits to load, in the order the loaders are
            returned. An immutable tuple is used as the default so it cannot
            be modified between calls.

    Returns:
        tuple of ``torch.utils.data.DataLoader``, one per entry of ``splits``.
        Only the ``'train'`` split is shuffled.
    """
    loaders = []
    for split in splits:
        csv_dataset = CSVDataset(
            data_path=data_path,
            data_split=split,
        )

        loader = torch.utils.data.DataLoader(
            dataset=csv_dataset,
            batch_size=batch_size,
            shuffle=(split == 'train'),
            num_workers=workers,
            collate_fn=collate_fn,
        )
        loaders.append(loader)

    return tuple(loaders)

In [11]:
def train_epoch(
        model, 
        device, 
        train_loader, 
        optimizer, 
        criterion, 
        epoch, 
        log_interval
    ):
    """Run one optimisation pass over ``train_loader``.

    For every batch the inputs and labels are moved to ``device``, the model
    is called as ``model(inputs, lengths)``, the loss is back-propagated and
    the optimizer takes one step. Every ``log_interval`` batches (starting
    with the first) a progress line with the current loss is printed.

    Args:
        model: module to train; switched to training mode first.
        device: device the batch tensors are moved to.
        train_loader: iterable of ``(data, target, lengths)`` batches that
            also exposes ``len()`` and a ``dataset`` attribute.
        optimizer: optimizer updating the model parameters.
        criterion: loss called as ``criterion(output, target)``.
        epoch: epoch number, used only in the progress line.
        log_interval: number of batches between two progress lines.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        inputs, labels, lengths = batch
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs, lengths), labels)
        loss.backward()
        optimizer.step()

        if step % log_interval != 0:
            continue
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, step * len(inputs), len(train_loader.dataset),
            100. * step / len(train_loader), loss.item()))


def test(
        model, 
        device, 
        criterion, 
        test_loader
    ):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target, lengths in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data, lengths)
            test_loss += criterion(output, target).item() # sum up batch loss
            pred = output.max(1, keepdim=True)[1] # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    accuracy = 100. * correct / len(test_loader.dataset)
    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        accuracy))
    return accuracy


def train(
        model,
        train_loader,
        test_loader,
        device,
        lr,
        nb_epochs=3,
        log_interval=100,
    ):
    """Train ``model`` with Adam and cross-entropy, evaluating after every epoch.

    Args:
        model: module to train.
        train_loader: loader passed to ``train_epoch``.
        test_loader: loader passed to ``test`` after each epoch.
        device: device used for the criterion and the batches.
        lr: learning rate of the Adam optimizer.
        nb_epochs: number of epochs to run.
        log_interval: number of batches between two training log lines.

    Returns:
        The accuracy returned by ``test`` after the last epoch, or ``None``
        when ``nb_epochs`` is smaller than 1 and no epoch was run.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss().to(device)

    # Bound before the loop so that nb_epochs < 1 returns None instead of
    # failing with UnboundLocalError on the return statement.
    acc = None
    for epoch in range(1, nb_epochs + 1):
        print('\n* * * Training * * *')
        train_epoch(
            model=model, 
            device=device, 
            train_loader=train_loader, 
            optimizer=optimizer, 
            criterion=criterion, 
            epoch=epoch, 
            log_interval=log_interval
        )
        print('\n* * * Evaluating * * *')
        acc = test(model, device, criterion, test_loader)

    return acc


def check_input(model, device):
    """Smoke-test a model with a dummy batch and verify its output shape.

    The model is called with a ``(5, 30)`` ``LongTensor`` of ones placed on
    ``device`` and a list of five lengths equal to 28. The result must have
    shape ``(5, 4)``.

    Args:
        model: callable taking ``(data, lengths)``.
        device: device the dummy batch is moved to.

    Returns:
        The model output for the dummy batch.

    Raises:
        AssertionError: if the output shape is not ``(5, 4)``.
    """
    batch_size, seq_len = 5, 30
    dummy_data = torch.ones(batch_size, seq_len).long().to(device)
    dummy_pred = model(dummy_data, [28] * batch_size)
    assert dummy_pred.shape == (5, 4), '\nOutput expected: (batch_size, 4) \nOutput found   : {}'.format(dummy_pred.shape)
    print('Passed')
    return dummy_pred


### Hyper-parâmetros que você pode definir

In [12]:
# Number of examples per mini-batch (passed to get_loaders).
batch_size = 32
# Device string handed to torch.device(); everything runs on the CPU here.
device_name = 'cpu'
# Number of training epochs passed to train().
nb_epochs = 3
# Number of batches between two training log lines.
log_interval = 100
# Learning rate of the Adam optimizer created in train().
lr = 1e-3

In [13]:
device = torch.device(device_name)

### Conferência dos dados

In [14]:
# Only the 'train' and 'test' splits are requested (the extracted archive lists
# train.csv and test.csv only), so the default 'val' split is left out.
train_loader, test_loader = get_loaders('./agnews/', batch_size=batch_size, splits=['train', 'test'],)

In [15]:
print(
    'Train size: ', 
    len(train_loader.dataset.texts),
    len(train_loader.dataset.labels)
)
print(
    'Test size : ', 
    len(test_loader.dataset.texts),
    len(test_loader.dataset.labels)
)

Train size:  112400 112400
Test size :  7600 7600


In [17]:
def plot_instance(instance_id):
    """Print the raw text, numeric label and label name of one training example.

    Reads the example from the dataset of the global ``train_loader`` and
    resolves the label name through the global ``classes`` list.

    Args:
        instance_id: index of the example in the training set.
    """
    dataset = train_loader.dataset
    print('\nExample: ')
    print(dataset.texts[instance_id])
    print('\nLabel Number: ')
    label = dataset.labels[instance_id]
    print(label)
    print('\nLabel String: ')
    print(classes[label])

In [18]:
plot_instance(0)

plot_instance(5000)

plot_instance(1238)

plot_instance(8723)


Example: 
Computer Q amp;A: Tips on installing Windows XP SP2. It seems that Microsoft #39;s new upgrade to Windows is making some people nervous. That #39;s not surprising, as home and office computing environments are just not as friendly as they used to be due to viruses and spyware. Even the software we ...

Label Number: 
3

Label String: 
Sci/Tech

Example: 
Ace performance by Thailand #39;s Thongchai at Mount Juliet. Thailand #39;s Thongchai Jaidee aced the 165-yard 11th hole in the WGC-American Express Championship second round on Friday. The 34-year-old former paratrooper used an eight 

Label Number: 
1

Label String: 
Sports

Example: 
Office Depot Won #39;t Meet Analysts #39; Earnings Estimates for 3 Qtr. Office Depot Inc., the world #39;s No. 2 office-supplies retailer, said it expects earnings per share for the third quarter to fall below current First Call estimates partly because of the recent Hurricanes in Florida.

Label Number: 
2

Label String: 
Business

Example: 

In [19]:
text, labels, lens = next(iter(train_loader))
print('Instance Example: ', text.shape, labels.shape)

Instance Example:  torch.Size([32, 60]) torch.Size([32])


In [20]:
print(text[0], labels[0], lens[0])

tensor([    1, 86346, 74435, 56783, 90184, 13431, 74783, 30858,   506, 30163,
        29886, 68548, 57510, 70219,  6278, 86346,  2884, 60313, 74435, 11618,
        90184,  2185, 74783, 73634, 30858, 55489, 68436, 74435, 35998, 79618,
        20116,  7629, 77181, 53442, 91059, 36958, 80918, 52458, 18507, 13425,
        67721, 74435, 56783, 18844, 89334,   609, 95288, 64317, 62697,  6278,
        43086, 75453, 42781, 34615, 54359, 63365, 38887,    52,   506,     2]) tensor(2) 60


In [21]:
# Vocabulary size of the training set.
# NOTE(review): the models created later in this notebook are built with
# num_embeddings=97585 rather than this value — confirm the two agree.
nb_words = len(train_loader.dataset.vocab)

In [23]:
def get_txt_vector(texts, lengths):
    """Pick, for every sequence of a batch, the feature vector of its last valid step.

    Args:
        texts: 3D tensor indexed as ``(batch, step, feature)``.
        lengths: number of valid steps of each sequence (list, NumPy array or
            anything ``torch.LongTensor`` accepts); every value must be >= 1.

    Returns:
        2D tensor of shape ``(batch, feature)`` where row ``b`` equals
        ``texts[b, lengths[b] - 1]``.

    The gather index is created on the device of ``texts``. The previous
    version moved it to CUDA whenever CUDA was available, which fails with a
    device mismatch when the model runs on the CPU of a CUDA-capable machine.
    """
    last_step = torch.LongTensor(lengths).view(-1, 1, 1) - 1
    index = last_step.expand(texts.size(0), 1, texts.size(2)).to(texts.device)
    return torch.gather(texts, 1, index).squeeze(1)

## Seu trabalho começa aqui:

## 1. Crie uma rede neural, usando `nn.LSTM()` ou `nn.GRU()` para classificar os textos.


* Utilize uma camada de embedding (`nn.Embedding`) para representar cada token do texto. 
* Utilize uma rede recorrente da sua escolha para processar a sequência de tokens (dimensão temporal). 

In [24]:
# RNN Stuff
from torch.nn.utils.rnn import pack_padded_sequence
from torch.nn.utils.rnn import pad_packed_sequence

class TextLSTM(nn.Module):
    """Embedding -> LSTM -> linear classifier for padded token-id batches.

    Args:
        nb_words: size of the embedding table (vocabulary size).
        embedding_dim: size of each word embedding.
        hidden_size: size of the LSTM hidden state.
        nb_classes: number of output scores per example.

    The defaults reproduce the configuration that used to be hard-coded.
    """

    def __init__(self, nb_words=97585, embedding_dim=100, hidden_size=32, nb_classes=4):
        super(TextLSTM, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=nb_words, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, nb_classes)

    def forward(self, x, lengths):
        """Return class scores of shape ``(batch, nb_classes)``.

        Args:
            x: ``LongTensor`` of token ids, one row per example.
            lengths: valid length of each row; passed to
                ``pack_padded_sequence`` so the LSTM skips padded steps.
        """
        x = self.embedding(x)
        packed = pack_padded_sequence(x, lengths, batch_first=True)
        x, _ = self.rnn(packed)
        hidden, _ = pad_packed_sequence(x, batch_first=True)
        # Keep only the LSTM output at the last valid step of each sequence.
        vector = get_txt_vector(hidden, lengths)
        return self.fc(vector)

In [25]:
model = TextLSTM().to(device)
print(model)

TextLSTM(
  (embedding): Embedding(97585, 100)
  (rnn): LSTM(100, 32, batch_first=True)
  (fc): Linear(in_features=32, out_features=4, bias=True)
)


In [26]:
dummy_pred = check_input(model, device)

Passed


In [27]:
acc = train(model, train_loader, test_loader, device, lr, nb_epochs=2, log_interval=log_interval)
print('Final acc: {:.2f}%'.format(acc))


* * * Training * * *


KeyboardInterrupt: 

## 2. Crie uma rede neural, a mais rápida possível, para classificar textos.

LSTMs e GRUs são lentas, procure usar uma abordagem mais rápida (ex: conv1d, average pooling, self-attention, etc)

In [229]:
class FastNet(nn.Module):
    """Embedding -> Conv1d -> ReLU -> global max pooling -> linear classifier.

    Args:
        nb_words: size of the embedding table (vocabulary size).
        embedding_dim: size of each word embedding.
        nb_filters: number of convolution output channels.
        kernel_size: width of the convolution window, in tokens.
        nb_classes: number of output scores per example.

    The defaults reproduce the configuration that used to be hard-coded.
    """

    def __init__(self, nb_words=97585, embedding_dim=100, nb_filters=32,
                 kernel_size=5, nb_classes=4):
        super(FastNet, self).__init__()
        self.embedding = nn.Embedding(num_embeddings=nb_words, embedding_dim=embedding_dim)

        self.conv_1 = nn.Conv1d(in_channels=embedding_dim, out_channels=nb_filters, kernel_size=kernel_size)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.relu = nn.ReLU(inplace=True)
        self.fc = nn.Linear(nb_filters, nb_classes)

    def forward(self, x, lengths):
        """Return class scores of shape ``(batch, nb_classes)``.

        Args:
            x: ``LongTensor`` of token ids, one row per example; rows must be
                at least ``kernel_size`` tokens long for the convolution.
            lengths: accepted for interface compatibility with the training
                helpers; not used by this model.
        """
        x = self.embedding(x)
        # Conv1d convolves over the last dimension, so move the sequence axis last.
        x = x.permute(0, 2, 1)
        # Use the returned tensor so the activation does not depend on the
        # in-place flag of the ReLU module.
        x = self.relu(self.conv_1(x))
        # Global max pooling over the sequence axis, then drop that axis.
        x = self.pool(x).squeeze(2)
        return self.fc(x)

In [230]:
model = FastNet().to(device)
print(model)

FastNet(
  (embedding): Embedding(97585, 100)
  (conv_1): Conv1d(100, 32, kernel_size=(5,), stride=(1,))
  (pool): AdaptiveMaxPool1d(output_size=1)
  (relu): ReLU(inplace)
  (fc): Linear(in_features=32, out_features=4, bias=True)
)


In [231]:
dummy_pred = check_input(model, device)

Passed


In [232]:
acc = train(model, train_loader, test_loader, device, lr, nb_epochs, log_interval)
print('Final acc: {:.2f}%'.format(acc))


* * * Training * * *


KeyboardInterrupt: 

## 3. Implemente uma rede neural, da sua escolha, para classificar textos. Seu objetivo é conseguir a acurácia mais alta da turma.

Você pode escolher todos os elementos da arquitetura.

In [None]:
class YourBestNet(nn.Module):
    """Placeholder for the free-form model of exercise 3; not implemented yet."""
    def __init__(self):
        super(YourBestNet, self).__init__()                
    
    # NOTE(review): check_input(), train_epoch() and test() call the model as
    # model(data, lengths), but this signature accepts only `x`. `out` is also
    # never assigned, so calling forward as written raises NameError.
    def forward(self, x):
        
        return out

### 3.1 Verifique se a saída do seu modelo está correta

In [None]:
model = YourBestNet().to(device)
print(model)

In [None]:
dummy_pred = check_input(model, device)

### 3.2 Treine seu modelo por algumas épocas

In [None]:
acc = train(model, train_loader, test_loader, device, lr, nb_epochs, log_interval)
print('Final acc: {:.2f}%'.format(acc))