In [32]:
from collections import defaultdict
import time
import random
import pandas as pd
import numpy as np

from torch.utils.data import Dataset, DataLoader

In [3]:
# Functions to read in the corpus
w2i = defaultdict(lambda: len(w2i))
t2i = defaultdict(lambda: len(t2i))
UNK = w2i["<unk>"]

In [8]:
def read_dataset(filename):
    with open(filename, "r") as f:
        for line in f:
            tag, words = line.lower().strip().split(" ||| ")
            yield ([w2i[x] for x in words.split(" ")], t2i[tag])

In [13]:
# Read in the data
train = list(read_dataset(".data/classes/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset(".data/classes/test.txt"))
nwords = len(w2i)
ntags = len(t2i)

In [33]:
nwords, ntags

(18648, 5)

In [51]:
def read_dataset_2(filename):
    x_data = []
    y_data = []
    with open(filename, "r") as f:
        for line in f:
            tag, words = line.lower().strip().split(" ||| ")
            x_data.append([w2i[x] for x in words.split(" ")])
            y_data.append(t2i[tag])
            
    x_data = np.array(x_data, dtype='int32')
    y_data = np.array(y_data, dtype='int32')
    
    return x_data, y_data
    
X, y = read_dataset_2(".data/classes/train.txt")

ValueError: setting an array element with a sequence.

In [47]:
class CustomData(Dataset):
    def __init__(self, file_name):
        # Read in the data
        X, Y = read_dataset_2(file_name)
        self.len = len(X)
        self.x_data = torch.from_numpy(X)
        self.y_data = torch.from_numpy(y)
        
    def __getitem__(self, index):
        return self.x_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len
    
    
dataset = CustomData(".data/classes/train.txt")

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: double, float, float16, int64, int32, and uint8.

In [15]:
import torch
from torch import nn
from torch.autograd import Variable

class BoW(torch.nn.Module):
    def __init__(self, nwords, ntags):
        super(BoW, self).__init__()

        """ variables """
        type = torch.FloatTensor
        use_cuda = torch.cuda.is_available()

        if use_cuda:
            type = torch.cuda.FloatTensor

        self.bias = Variable(torch.zeros(ntags),
                             requires_grad=True).type(type)

        """ layers """
        self.embedding = nn.Embedding(nwords, ntags)
        # initialize the weights with xavier uniform (Glorot, X. & Bengio, Y. (2010))
        nn.init.xavier_uniform_(self.embedding.weight)


    def forward(self, words):
        emb = self.embedding(words)
        out = torch.sum(emb, dim=0) + self.bias # size(out) = N
        out = out.view(1, -1) # size(out) = 1 x N
        return out

In [16]:
# initialize the model
model = BoW(nwords, ntags)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

type = torch.LongTensor

In [18]:
for ITER in range(10):
    # Perform training
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    for words, tag in train:
        words = torch.tensor(words).type(type)
        tag = torch.tensor([tag]).type(type)
        scores = model(words)
        loss = criterion(scores, tag)
        train_loss += loss.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start))
    # Perform testing
    test_correct = 0.0
    for words, tag in dev:
        words = torch.tensor(words).type(type)
        scores = model(words)[0].detach().cpu().numpy()
        predict = np.argmax(scores)
        if predict == tag:
            test_correct += 1
    print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev)))

iter 0: train loss/sent=0.9920, time=41.03s
iter 0: test acc=0.4127
iter 1: train loss/sent=0.8235, time=41.46s
iter 1: test acc=0.4032
iter 2: train loss/sent=0.7045, time=40.93s
iter 2: test acc=0.4145
iter 3: train loss/sent=0.6139, time=44.26s
iter 3: test acc=0.4059
iter 4: train loss/sent=0.5428, time=40.76s
iter 4: test acc=0.4041
iter 5: train loss/sent=0.4849, time=45.02s
iter 5: test acc=0.4041
iter 6: train loss/sent=0.4369, time=42.72s
iter 6: test acc=0.4005
iter 7: train loss/sent=0.3968, time=43.64s
iter 7: test acc=0.4027
iter 8: train loss/sent=0.3622, time=43.49s
iter 8: test acc=0.3955
iter 9: train loss/sent=0.3320, time=42.57s
iter 9: test acc=0.3950
