In [1]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

In [2]:
class RNN(nn.Module):
    
    def __init__(self, batch_size, output_size, hidden_size, vocab_size,
                embedding_length, weigths):
        super(RNN, self).__init__()
        """
        Arguments
        ---------
        batch_size : Size of the batch which is same as the batch_size of the data returned by the TorchText BucketIterator
        output_size : 2 = (pos, neg)
        hidden_sie : Size of the hidden_state of the LSTM
        vocab_size : Size of the vocabulary containing unique words
        embedding_length : Embeddding dimension of GloVe word embeddings
        weights : Pre-trained GloVe word_embeddings which we will use to create our word_embedding look-up table 

        """
        
        self.batch_size = batch_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.embedding_length = embedding_length
        
        self.word_embedding = nn.Embedding(vocab_size, embedding_length)
        self.word_embedding.weigths = nn.Parameter(weights, requires_grad=False)
        self.rnn = nn.RNN(embedding_length, hidden_size, 
                          num_layers=2,bidirectional=True)
        self.label = nn.Linear(4*hidden_size,output_size)
        
    def forward(self, input_sentences, batch_size=None):
        """ 
        Parameters
        ----------
        input_sentence: input_sentence of shape = (batch_size, num_sequences)
        batch_size : default = None. Used only for prediction on a single sentence after training (batch_size = 1)

        Returns
        -------
        Output of the linear layer containing logits for pos & neg class which receives its input as the final_hidden_state of RNN.
        logits.size() = (batch_size, output_size)

        """
        
        input = self.word_embedding(input_sentences)
        input = input.permute(1,0,2)
        
        if batch_size is None:
            h_0 = Variable(torch.zeros(4, self.batch_size, self.hidden_size).cuda())
        else:
            h_0 = Variable(torch.zeros(4, batch_size, self.hidden_size).cuda())
        
        output, h_n = self.rnn(input,h_0)
        
        # h_n.size() = (4, batch_size, hidden_size)
        h_n = h_n.permute(1,0,2)
        h_n = h_n.contiguous().view(h_n.size()[0], h_n.size()[1]*h_n.size()[2])
        
        # h_n.size() = (batch_size, 4*hidden_size)
        logits = self.label(h_n)
        
        return logits
        

In [3]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.datasets import fetch_20newsgroups

x = torch.IntTensor([1,3,6])
y = torch.IntTensor([1,1,1])
result = x + y
print(result)


tensor([2, 4, 7], dtype=torch.int32)


In [4]:
categories = ["comp.graphics","sci.space","rec.sport.baseball"]
newsgroups_train = fetch_20newsgroups(subset='train', categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', categories=categories)

print('total texts in train:',len(newsgroups_train.data))
print('total texts in test:',len(newsgroups_test.data))

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


total texts in train: 1774
total texts in test: 1180


In [5]:
vocab = Counter()

for text in newsgroups_train.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1

for text in newsgroups_test.data:
    for word in text.split(' '):
        vocab[word.lower()]+=1

total_words = len(vocab)

def get_word_2_index(vocab):
    word2index = {}
    for i,word in enumerate(vocab):
        word2index[word.lower()] = i

    return word2index

word2index = get_word_2_index(vocab)

word2index

{'from:': 0,
 'jk87377@lehtori.cc.tut.fi': 1,
 '(kouhia': 2,
 'juhana)\nsubject:': 3,
 're:': 4,
 'more': 5,
 'gray': 6,
 'levels': 7,
 'out': 8,
 'of': 9,
 'the': 10,
 'screen\norganization:': 11,
 'tampere': 12,
 'university': 13,
 'technology\nlines:': 14,
 '21\ndistribution:': 15,
 'inet\nnntp-posting-host:': 16,
 'cc.tut.fi\n\nin': 17,
 'article': 18,
 '<1993apr6.011605.909@cis.uab.edu>': 19,
 'sloan@cis.uab.edu\n(kenneth': 20,
 'sloan)': 21,
 'writes:\n>\n>why': 22,
 "didn't": 23,
 'you': 24,
 'create': 25,
 '8': 26,
 'grey-level': 27,
 'images,': 28,
 'and': 29,
 'display': 30,
 'them': 31,
 'for\n>1,2,4,8,16,32,64,128...': 32,
 'time': 33,
 'slices?\n\nby': 34,
 "'8": 35,
 'grey': 36,
 'level': 37,
 "images'": 38,
 'mean': 39,
 'items': 40,
 '1bit': 41,
 'images?\nit': 42,
 'does': 43,
 'work(!),': 44,
 'but': 45,
 'it': 46,
 "doesn't": 47,
 'work': 48,
 'if': 49,
 'have': 50,
 'than': 51,
 '1bit\nin': 52,
 'your': 53,
 'screen': 54,
 'intensity': 55,
 'is': 56,
 'non-linear.\n

In [6]:
def get_batch(df,i,batch_size):
    batches = []
    results = []
    texts = df.data[i*batch_size:i*batch_size+batch_size]
    categories = df.target[i*batch_size:i*batch_size+batch_size]
    for text in texts:
        layer = np.zeros(total_words,dtype=float)
        for word in text.split(' '):
            layer[word2index[word.lower()]] += 1

        batches.append(layer)

    for category in categories:
        index_y = -1
        if category == 0:
            index_y = 0
        elif category == 1:
            index_y = 1
        else:
            index_y = 2
        results.append(index_y)


    return np.array(batches),np.array(results)

In [7]:
# Parameters
learning_rate = 0.01
num_epochs = 10
batch_size = 150
display_step = 1

# Network Parameters
hidden_size = 100      # 1st layer and 2nd layer number of features
input_size = total_words # Words in vocab
num_classes = 3         # Categories: graphics, sci.space and baseball

In [8]:
# input [batch_size, n_labels]
# output [max index for each item in batch, ... ,batch_size-1]
loss = nn.CrossEntropyLoss()
input = Variable(torch.randn(2, 5), requires_grad=True)
print(">>> batch of size 2 and 5 possible classes")
print(input)
target = Variable(torch.LongTensor(2).random_(5))
print(">>> array of size 'batch_size' with the index of the maxium label for each item")
print(target)
output = loss(input, target)
output.backward()

>>> batch of size 2 and 5 possible classes
tensor([[ 1.4892, -0.4042, -0.0267, -1.6635, -1.2048],
        [-0.8111,  0.9500,  0.6461,  1.3036,  1.1003]], requires_grad=True)
>>> array of size 'batch_size' with the index of the maxium label for each item
tensor([0, 0])
