In [1]:
class Config(object):
    embed_size = 300 # Size of the word embeddings
    hidden_layers = 1
    hidden_size = 64
    output_size = 2 # no of output labels
    max_epochs = 15
    hidden_size_linear = 64 
    lr = 0.5 # learning rate
    batch_size = 128
    seq_len = None # Sequence length for RNN
    dropout_keep = 0.8

In [2]:
import torch
from torchtext import data
from torchtext.vocab import Vectors
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from nltk.tokenize import sent_tokenize, word_tokenize

class Yelp_Dataset(object):
    def __init__(self, config):
        self.config = config
        self.train_iterator = None
        self.test_iterator = None
        self.val_iterator = None
        self.vocab = []
        self.word_embeddings = {}
    
    def load_data(self, w2v_file, train_df, test_df):
        tokenizer = lambda sent: [x for x in word_tokenize(sent) if x != " "]
        
        # Creating Field for data
        TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True)
        LABEL = data.Field(sequential=False, use_vocab=False)
        datafields = [("text",TEXT),("label",LABEL)]
        
        # Load data from pd.DataFrame into torchtext.data.Dataset
        
        train_examples = [data.Example.fromlist(i, datafields) for i in train_df.values.tolist()]
        train_data = data.Dataset(train_examples, datafields)
        
        test_examples = [data.Example.fromlist(i, datafields) for i in test_df.values.tolist()]
        test_data = data.Dataset(test_examples, datafields)
        
        
        
        train_data, val_data = train_data.split(split_ratio=0.8)
        
        TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab
        
        self.train_iterator = data.BucketIterator(
            (train_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True)
        
        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False)
        
        

def get_accuracy(model, iterator):
    all_preds = []
    all_y = []
    for idx,batch in enumerate(iterator):
        if torch.cuda.is_available():
            x = batch.text.cuda()
        else:
            x = batch.text
        y_pred = model(x)
        predicted = torch.max(y_pred.cpu().data, 1)[1] + 1
        all_preds.extend(predicted.numpy())
        all_y.extend(batch.label.numpy())
    score = accuracy_score(all_y, np.array(all_preds).flatten())
    return score

In [3]:
import torch
from torch import nn
import numpy as np
from torch.nn import functional as F


class RCNN(nn.Module):
    def __init__(self, config, vocab_size, word_embeddings):
        super(RCNN, self).__init__()
        self.config = config
        
        # Embedding Layer
        self.embeddings = nn.Embedding(vocab_size, self.config.embed_size)
        self.embeddings.weight = nn.Parameter(word_embeddings, requires_grad=False)
        
        # Bi-directional LSTM for RCNN
        self.lstm = nn.LSTM(input_size = self.config.embed_size,
                            hidden_size = self.config.hidden_size,
                            num_layers = self.config.hidden_layers,
                            dropout = self.config.dropout_keep,
                            bidirectional = True)
        
        self.dropout = nn.Dropout(self.config.dropout_keep)
        
        # Linear layer
        self.W = nn.Linear(
            self.config.embed_size + 2*self.config.hidden_size,
            self.config.hidden_size_linear
        )
        
        # Tanh non-linearity
        self.tanh = nn.Tanh()
        
        # Fully-Connected Layer
        self.fc = nn.Linear(
            self.config.hidden_size_linear,
            self.config.output_size
        )
        
        # Softmax
        self.softmax = nn.Softmax()
        
    def forward(self, x):
        
        embedded_sent = self.embeddings(x)
        lstm_out, (h_n,c_n) = self.lstm(embedded_sent)
        input_features = torch.cat([lstm_out,embedded_sent], 2).permute(1,0,2)
        linear_output = self.tanh(
            self.W(input_features)
        )
        linear_output = linear_output.permute(0,2,1) # Reshaping fot max_pool
        max_out_features = F.max_pool1d(linear_output, linear_output.shape[2]).squeeze(2)
        max_out_features = self.dropout(max_out_features)
        final_out = self.fc(max_out_features)
        return self.softmax(final_out)
    
    def add_optimizer(self, optimizer):
        self.optimizer = optimizer
        
    def add_loss_op(self, loss_op):
        self.loss_op = loss_op
    
    def reduce_lr(self):
        print("Reducing LR")
        for g in self.optimizer.param_groups:
            g['lr'] = g['lr'] / 2
                
    def run_epoch(self, train_iterator, val_iterator, epoch):
        train_losses = []
        val_accuracies = []
        losses = []
        
        # Reduce learning rate as number of epochs increase
        if (epoch == int(self.config.max_epochs/3)) or (epoch == int(2*self.config.max_epochs/3)):
            self.reduce_lr()
            
        for i, batch in enumerate(train_iterator):
            self.optimizer.zero_grad()
            if torch.cuda.is_available():
                x = batch.text.cuda()
                y = (batch.label - 1).type(torch.cuda.LongTensor)
            else:
                x = batch.text
                y = (batch.label - 1).type(torch.LongTensor)
            y_pred = self.__call__(x)
            loss = self.loss_op(y_pred, y)
            loss.backward()
            losses.append(loss.data.cpu().numpy())
            self.optimizer.step()
    
            if i % 100 == 0:
                print("Iteration: {}".format(i+1))
                avg_train_loss = np.mean(losses)
                train_losses.append(avg_train_loss)
                print("\tAverage training loss: {:.5f}".format(avg_train_loss))
                losses = []
                
                # Evalute Accuracy on validation set
                val_accuracy = get_accuracy(self, val_iterator)
                print("\tVal Accuracy: {:.4f}".format(val_accuracy))
                self.train()
                
        return train_losses, val_accuracies

In [4]:
import sys
import torch.optim as optim
from torch import nn
import torch
import pandas as pd



config = Config()
train_file = pd.read_csv('../data/yelp_train.csv')
test_file = pd.read_csv('../data/yelp_test.csv')
# for testing 
#train_file = train_file.iloc[:1000,:]
#test_file = test_file.iloc[:1000,:]


# Glove embeddings
w2v_file = '../data/glove.840B.300d.txt'

dataset = Yelp_Dataset(config)
dataset.load_data(w2v_file, train_file, test_file)
print("Data loaded")

# call the model
model = RCNN(config, len(dataset.vocab), dataset.word_embeddings)
# if gpu
if torch.cuda.is_available():
    model.cuda()
# train 
model.train()
optimizer = optim.SGD(model.parameters(), lr=config.lr)
NLLLoss = nn.NLLLoss()
model.add_optimizer(optimizer)
model.add_loss_op(NLLLoss)


train_losses = []
val_accuracies = []

# epochs 15
for i in range(config.max_epochs):
    print ("Epoch: {}".format(i))
    train_loss,val_accuracy = model.run_epoch(dataset.train_iterator, dataset.val_iterator, i)
    train_losses.append(train_loss)
    val_accuracies.append(val_accuracy)

train_acc = get_accuracy(model, dataset.train_iterator)
val_acc = get_accuracy(model, dataset.val_iterator)
test_acc = get_accuracy(model, dataset.test_iterator)

print ('Final Training Accuracy: {:.4f}'.format(train_acc))
print ('Final Validation Accuracy: {:.4f}'.format(val_acc))
print ('Final Test Accuracy: {:.4f}'.format(test_acc))

Data loaded


  "num_layers={}".format(dropout, num_layers))


Epoch: 0




Iter: 1
	Average training loss: -0.24101
	Val Accuracy: 0.2552
Iter: 101
	Average training loss: -0.49943
	Val Accuracy: 0.7842
Iter: 201
	Average training loss: -0.76596
	Val Accuracy: 0.8299
Iter: 301
	Average training loss: -0.81653
	Val Accuracy: 0.8370
Iter: 401
	Average training loss: -0.82595
	Val Accuracy: 0.8459
Iter: 501
	Average training loss: -0.83443
	Val Accuracy: 0.8490
Iter: 601
	Average training loss: -0.84113
	Val Accuracy: 0.8535
Iter: 701
	Average training loss: -0.83926
	Val Accuracy: 0.8561
Epoch: 1
Iter: 1
	Average training loss: -0.83464
	Val Accuracy: 0.8598
Iter: 101
	Average training loss: -0.85123
	Val Accuracy: 0.8597
Iter: 201
	Average training loss: -0.85632
	Val Accuracy: 0.8630
Iter: 301
	Average training loss: -0.85187
	Val Accuracy: 0.8614
Iter: 401
	Average training loss: -0.85711
	Val Accuracy: 0.8630
Iter: 501
	Average training loss: -0.85542
	Val Accuracy: 0.8642
Iter: 601
	Average training loss: -0.86220
	Val Accuracy: 0.8692
Iter: 701
	Average t