In [59]:
import time
import sys
import os
import re
import torch
import torchvision
import pandas as pd
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from PIL import Image
from torchvision import datasets, transforms
from torch.autograd import Variable
import numpy as np
from torch.utils.data import Dataset, DataLoader
import torch.backends.cudnn as cudnn
from torchvision import transforms, utils
from itertools import *

from dataparser import *
from batcher import *
from readEmbeddings import *
from datasets import *
from models import *

import pdb

In [81]:
class sstDataset(Dataset):
    def __init__(self, sstPath, glovePath, transform = None):
        self.data = load_sst_data(sstPath)
        self.paddingElement = ['<s>']
        self.maxSentenceLength = self.maxlength(self.data)
        self.vocab = glove2dict(glovePath)

    def __getitem__(self, index):
        s = self.pad(self.data[index]['sentence_1'].split())
        s = self.embed(s)
        label = int(self.data[index]['label'])
        return (s), label

    def __len__(self):
        return len(self.data)

    def maxlength(self, data):
        maxSentenceLength = max([len(d['sentence_1'].split()) for d in data])
#         maxLength = max(maxLength,maxSentenceLength)
        return maxSentenceLength

    def pad(self, sentence):
        return sentence + (self.maxSentenceLength-len(sentence))*self.paddingElement

    def embed(self, sentence):
        vector = []
        for word in sentence:
            if str(word) in self.vocab:
                vector = np.concatenate((vector, self.vocab[str(word)]), axis=0)
            else:
                vector = np.concatenate((vector, [0]*len(self.vocab['a'])), axis=0)
        return vector

In [39]:
class Partition(Dataset):
    def __init__(self, data, index):
        self.data = data
        self.index = index

    def __len__(self):
        return len(self.index)

    def __getitem__(self, index):
        data_idx = self.index[index]
        return self.data[data_idx]


class DataPartitioner(Dataset):
    def __init__(self, data, sizes, seed=1234):
        self.data = data
        self.partitions = []
        rng = Random()
        rng.seed(seed)
        data_len = len(data)
        indexes = [x for x in range(0, data_len)]
        rng.shuffle(indexes)

        for frac in sizes:
            part_len = int(frac * data_len)
            self.partitions.append(indexes[0:part_len])
            indexes = indexes[part_len:]

    def use(self, partition):
        return Partition(self.data, self.partitions[partition])

def partition_dataset(sstPath, glovePath, batchSize, transformations=None):
    dataset = sstDataset(sstPath, glovePath, transformations)
    size = dist.get_world_size()
    bsz = batchSize
    partition_sizes = [1.0 / size for _ in range(size)]
    partition = DataPartitioner(dataset, partition_sizes)
    partition = partition.use(dist.get_rank())
    train_set = DataLoader(partition, batch_size=bsz, shuffle=True, num_workers=1)
    return train_set, bsz


In [71]:
class ClassLSTM(nn.Module):
    """docstring for ClassLSTM"""
    def __init__(self, input_size, hidden_size, num_layers, batch, bias = True, batch_first = False, dropout = 0, bidirectional = False):
        super(ClassLSTM, self).__init__()
        self.num_directions = 2 if bidirectional else 1
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, bias = True, batch_first = False, dropout = dropout, bidirectional = bidirectional)
        self.h0 = Variable(torch.randn(num_layers * self.num_directions, batch, hidden_size))
        self.c0 = Variable(torch.randn(num_layers * self.num_directions, batch, hidden_size))
    def forward(self, s1):
        output, hn = self.lstm(s1, (self.h0, self.c0))
        return output

class sstNet(nn.Module):
    """docstring for sstNet"""
    def __init__(self, inp_dim, model_dim, num_layers, reverse, bidirectional, dropout, mlp_input_dim, mlp_dim, num_classes, num_mlp_layers, mlp_ln, classifier_dropout_rate, training, batchSize):
        super(sstNet, self).__init__()
        self.encoderSst = ClassLSTM(inp_dim, model_dim, num_layers, batchSize, bidirectional = bidirectional, dropout = dropout)
        self.classifierSst = MLP(mlp_input_dim, mlp_dim, num_classes, num_mlp_layers, mlp_ln, classifier_dropout_rate, training)

    def forward(self, s1):
        oE = self.encoderSst(s1)
        features = oE[-1]
        output = F.log_softmax(self.classifierSst(features))
        return output

    def encode(self, s):
        emb = self.encoderSst(s)
        return emb
    
class BiLSTMSentiment(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, use_gpu, batch_size, dropout=0.5):
        super(BiLSTMSentiment, self).__init__()
        self.hidden_dim = hidden_dim
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.dropout = dropout
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, bidirectional=True)
        self.hidden2label = nn.Linear(hidden_dim*2, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # first is the hidden h
        # second is the cell c
        if self.use_gpu:
            return (Variable(torch.zeros(2, self.batch_size, self.hidden_dim).cuda()),
                    Variable(torch.zeros(2, self.batch_size, self.hidden_dim).cuda()))
        else:
            return (Variable(torch.zeros(2, self.batch_size, self.hidden_dim)),
                    Variable(torch.zeros(2, self.batch_size, self.hidden_dim)))

    def forward(self, sentence):
#         x = self.embeddings(sentence).view(len(sentence), self.batch_size, -1)
        x = sentence.view(len(sentence), self.batch_size, -1)
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        y = self.hidden2label(lstm_out[-1])
        log_probs = F.log_softmax(y)
        return log_probs

In [77]:
def trainEpoch(epoch, break_val, trainLoader, model, optimizer, criterion, inp_dim, batchSize, use_cuda, devLoader, devbatchSize):
    print("Epoch start - ",epoch)
    for batch_idx, (data, target) in enumerate(trainLoader):
        s1 = data.float()
        batch, _ = s1.shape
        if batchSize != batch:
            break
        print("before tr",s1.shape)
        s1 = s1.transpose(0,1).contiguous().view(-1,inp_dim,batch).transpose(1,2)
        print("after tr",s1.shape)
        if(use_cuda):
            s1, target = Variable(s1.cuda()), Variable(target.cuda())
        else:
            s1, target = Variable(s1), Variable(target)
        
#         output = model(s1)
        optimizer.zero_grad()
        # pdb.set_trace()
        output = model(s1)
        model.zero_grad()
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx == break_val:
            return
        if batch_idx % 100 == 0:
            dev_loss = 0
            n_correct = 0
            n_total = 0
            for idx, (dev_data, dev_target) in enumerate(devLoader):
                sd = dev_data.float()
                # pdb.set_trace()
                devbatchSize, _ = sd.shape
                if batchSize != devbatchSize:
                    break
                print("before dev",sd.shape)
                sd = sd.transpose(0,1).contiguous().view(-1,inp_dim,devbatchSize).transpose(1,2)
                print("after dev",sd.shape)
                if(use_cuda):
                    sd, dev_target = Variable(sd.cuda()), Variable(dev_target.cuda())
                else:
                    sd, dev_target = Variable(sd), Variable(dev_target)
                dev_output = model(sd)
                dev_loss += criterion(dev_output, dev_target)
                n_correct += (torch.max(dev_output, 1)[1].view(dev_target.size()) == dev_target).sum()
                n_total += devbatchSize
                # break
            dev_acc = (100. * n_correct.data[0])/n_total

            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tDev Loss: {:.6f}\tDev Acc: {:.6f}'.format(
                epoch, batch_idx * len(data), len(trainLoader.dataset),
                100. * batch_idx / len(trainLoader), loss.data[0], dev_loss.data[0], dev_acc))
            save(model, optimizer, loss, 'sstTrained.pth', dev_loss, dev_acc)
    return loss

In [62]:
def train(numEpochs, trainLoader, model, optimizer, criterion, inp_dim, batchSize, use_cuda, devLoader, devbatchSize):
    for epoch in range(numEpochs):
        loss = trainEpoch(epoch,20000000,trainLoader,model,optimizer,criterion,inp_dim,batchSize, use_cuda, devLoader, devbatchSize)
        dev_loss = 0
        n_correct = 0
        n_total = 0
        for idx, (dev_data, dev_target) in enumerate(devLoader):
            sd = dev_data.float()
            # pdb.set_trace()
            devbatchSize, _ = sd.shape
            sd = sd.transpose(0,1).contiguous().view(-1,inp_dim,devbatchSize).transpose(1,2)
            if(use_cuda):
                sd, dev_target = Variable(sd.cuda()), Variable(dev_target.cuda())
            else:
                sd, dev_target = Variable(sd), Variable(dev_target)
            dev_output = model(sd)
            dev_loss += criterion(dev_output, dev_target)
            n_correct += (torch.max(dev_output, 1)[1].view(dev_target.size()) == dev_target).sum()
            n_total += devbatchSize
        dev_acc = (100. * n_correct.data[0])/n_total
        print('Epoch: {} - Dev Accuracy: {}'.format(epoch, dev_acc))
        save(model, optimizer, loss, 'sstTrainedEpoch.pth', dev_loss, dev_acc)

In [82]:
sstPathTrain = "../Data/SST/trees/train.txt"
sstPathDev = "../Data/SST/trees/dev.txt"
glovePath = '../Data/glove.6B/glove.6B.100d.txt'

batchSize = 64
learningRate = 0.0001
momentum = 0.9
numWorkers = 5

numEpochs = 1

inp_dim = 300
model_dim = 300
num_layers = 2
reverse = False
bidirectional = True
dropout = 0.4

mlp_input_dim = 600
mlp_dim = 300
num_classes = 5
num_mlp_layers = 5
mlp_ln = True
classifier_dropout_rate = 0.4

training = True

use_cuda = torch.cuda.is_available()
# use_cuda = False
if(use_cuda):
    the_gpu.gpu = 0

t1 = time.time()
trainingDataset = sstDataset(sstPathTrain, glovePath)
devDataset = sstDataset(sstPathDev, glovePath, training = False)
print('Time taken - ',time.time()-t1)
devbatchSize = batchSize

trainLoader = DataLoader(trainingDataset, batchSize, shuffle=False, num_workers = numWorkers)
devLoader = DataLoader(devDataset, devbatchSize, shuffle=False, num_workers = numWorkers)

SST eval mode: Preserving only top node label.


UnboundLocalError: local variable 'maxLength' referenced before assignment

In [78]:
# model = sstNet(inp_dim, model_dim, num_layers, reverse, bidirectional, dropout, mlp_input_dim, mlp_dim, num_classes, num_mlp_layers, mlp_ln, classifier_dropout_rate, training, batchSize)
model = BiLSTMSentiment(300, 100, 100, 5, False, batchSize) 
if(use_cuda):
    model.cuda()
if(use_cuda):
    criterion = nn.CrossEntropyLoss().cuda()
else:
    criterion = nn.CrossEntropyLoss()
criterion=nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr = learningRate, weight_decay = 1e-5)

train(numEpochs, trainLoader, model, optimizer, criterion, inp_dim, batchSize, use_cuda, devLoader, batchSize)

('Epoch start - ', 0)
('before tr', torch.Size([64, 4800]))
('after tr', torch.Size([16, 64, 300]))




('before dev', torch.Size([64, 4400]))


RuntimeError: invalid argument 2: size '[-1 x 300 x 64]' is invalid for input with 281600 elements at /Users/soumith/miniconda2/conda-bld/pytorch_1532620281745/work/aten/src/TH/THStorage.cpp:80

In [56]:
fname = "../Data/SST/sentiment_dataset/data/stsa.fine.dev"
with open(fname) as f:
    content = f.readlines()
# you may also want to remove whitespace characters like `\n` at the end of each line
content = [x.strip() for x in content]
# len(content)
# print(content)
for line in content:
    cls = int(line[0])
    if cls < 2:
        cls = 0
    elif cls == 2:
        cls = 1
    else:
        cls = 2
    line = line[2:].split()
    print([cls,line])

[1, ['in', 'his', 'first', 'stab', 'at', 'the', 'form', ',', 'jacquot', 'takes', 'a', 'slightly', 'anarchic', 'approach', 'that', 'works', 'only', 'sporadically', '.']]
[0, ['one', 'long', 'string', 'of', 'cliches', '.']]
[0, ['if', 'you', "'ve", 'ever', 'entertained', 'the', 'notion', 'of', 'doing', 'what', 'the', 'title', 'of', 'this', 'film', 'implies', ',', 'what', 'sex', 'with', 'strangers', 'actually', 'shows', 'may', 'put', 'you', 'off', 'the', 'idea', 'forever', '.']]
[0, ['k-19', 'exploits', 'our', 'substantial', 'collective', 'fear', 'of', 'nuclear', 'holocaust', 'to', 'generate', 'cheap', 'hollywood', 'tension', '.']]
[0, ['it', "'s", 'played', 'in', 'the', 'most', 'straight-faced', 'fashion', ',', 'with', 'little', 'humor', 'to', 'lighten', 'things', 'up', '.']]
[2, ['there', 'is', 'a', 'fabric', 'of', 'complex', 'ideas', 'here', ',', 'and', 'feelings', 'that', 'profoundly', 'deepen', 'them', '.']]
[1, ['it', 'may', 'seem', 'long', 'at', '110', 'minutes', 'if', 'you', "'re"

[1, ['hey', 'arnold', '!']]
[0, ['though', 'perry', 'and', 'hurley', 'make', 'inspiring', 'efforts', 'to', 'breathe', 'life', 'into', 'the', 'disjointed', ',', 'haphazard', 'script', 'by', 'jay', 'scherick', 'and', 'david', 'ronn', ',', 'neither', 'the', 'actors', 'nor', 'director', 'reginald', 'hudlin', 'can', 'make', 'it', 'more', 'than', 'fitfully', 'entertaining', '.']]
[2, ['woody', 'allen', "'s", 'latest', 'is', 'an', 'ambling', ',', 'broad', 'comedy', 'about', 'all', 'there', 'is', 'to', 'love', '--', 'and', 'hate', '--', 'about', 'the', 'movie', 'biz', '.']]
[1, ['it', "'s", 'the', 'perfect', 'kind', 'of', 'film', 'to', 'see', 'when', 'you', 'do', "n't", 'want', 'to', 'use', 'your', 'brain', '.']]
[0, ['there', "'s", 'too', 'much', 'falseness', 'to', 'the', 'second', 'half', ',', 'and', 'what', 'began', 'as', 'an', 'intriguing', 'look', 'at', 'youth', 'fizzles', 'into', 'a', 'dull', ',', 'ridiculous', 'attempt', 'at', 'heart-tugging', '.']]
[2, ['as', 'a', 'first-time', 'direct

In [58]:
class RNN(nn.Module):
    def _init_(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, bidirectional, dropout, pad_idx):
        super()._init_()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, bidirectional=bidirectional, dropout=dropout)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):
        embedded = self.dropout(self.embedding(text))
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths)
        packed_output, (hidden, cell) = self.rnn(packed_embedded)
        output, output_lengths = nn.utils.rnn.pad_packed_sequence(packed_output)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        return self.fc(hidden.squeeze(0))