In [1]:
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.optim import lr_scheduler
import time
import os
import copy
from dataset import ReviewsDataset
from torch.utils.data import DataLoader
from bow import BagOfWordsClassifier
import pickle

In [23]:
def read_data(path, labelled=True):
    f = open(path,'r')
    text = f.read()
    examples = [example.split(' ') for example in text.split('\n')[:-1]]
    if labelled:
        labels = [int(line[0]) for line in examples]
        data = [line[1:] for line in examples]
        return data,np.array(labels)
    else:
        return examples

def create_vocab(data):
    flatten = [w for line in data for w in line]
    unique = list(set(flatten))
    word2idx = {word: idx for idx,word in enumerate(unique)}
    return unique,word2idx

def create_bag_of_words(data, word2idx=None):
    if word2idx is None:
        raise Error('create_bag_of_words need a word2idx mapping!')
    bag_of_words = np.zeros((len(data), len(word2idx)))
    for line in range(len(data)):
        for word in data[line]:
            if word in word2idx:
                bag_of_words[line][word2idx[word]] += 1
    return torch.tensor(bag_of_words).float()

In [3]:
train_data, train_labels = read_data('../data/train.txt')
dev_data, dev_labels = read_data('../data/dev.txt')
test_data, test_labels = read_data('../data/test.txt')

train_labels = torch.tensor(train_labels)
dev_labels = torch.tensor(dev_labels)
test_labels = torch.tensor(test_labels)

In [4]:
vocab,word2idx = create_vocab(train_data)
vocab_size = len(vocab)

In [5]:
train_dataset = create_bag_of_words(train_data, word2idx=word2idx)
dev_dataset = create_bag_of_words(dev_data, word2idx=word2idx)
test_dataset = create_bag_of_words(test_data, word2idx=word2idx)

In [6]:
class PlainDataset(Dataset):
    def __init__(self, data, labels):
        self.data = data
        self.labels = labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx],self.labels[idx]

In [7]:
class BagOfWordsClassifier(nn.Module):

    def __init__(self, num_labels, vocab_size):
        super(BagOfWordsClassifier, self).__init__()
        self.linear = nn.Linear(vocab_size, num_labels)
        
    def forward(self, bow_vec):
        z1 = self.linear(bow_vec)
        return torch.sigmoid(z1)

In [8]:
def train_model(device, dataloaders, dataset_sizes, model, criterion, optimizer, scheduler, num_epochs=25):
    since = time.time()
   
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # If there is no training happening
    if num_epochs == 0:
        model.eval()
        running_corrects = 0

        # Iterate over data.
        for inputs, labels in dataloaders['val']:
            inputs = inputs.to(device)
            labels = labels.to(device)

            # forward
            outputs = model(inputs)
            _, preds = torch.max(outputs, 1)

            # statistics
            running_corrects += torch.sum(preds == labels.data)

        best_acc = running_corrects.double() / dataset_sizes['val']

    # Training for num_epochs steps
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                scheduler.step()
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            running_loss = 0.0
            running_corrects = 0

            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                # zero the parameter gradients
                optimizer.zero_grad()

                # forward
                # track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                    
                    ####################################################################################
                    #                             END OF YOUR CODE                                     #
                    ####################################################################################
                    
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                with open('model.pkl', 'wb') as f:
                    pickle.dump(best_model_wts, f)

                

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # load best model weights
    if num_epochs > 0:
        model.load_state_dict(best_model_wts)
    return model

In [9]:
def train(device, dataloaders, dataset_sizes, vocab_size):
    model = BagOfWordsClassifier(2, vocab_size)
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)

    # Decay LR by a factor of 0.1 every 7 epochs
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.05)

    # Train the model for 25 epochs
    print('Train the model')
    model = train_model(device, dataloaders, dataset_sizes, model, criterion, optimizer, exp_lr_scheduler, num_epochs=25)
    
    train_model(device, dataloaders, dataset_sizes, model, criterion, optimizer, exp_lr_scheduler, num_epochs=0)
    return model

In [11]:
def main():
    datasets = {}
    datasets['train'] = PlainDataset(train_dataset, train_labels)
    datasets['val'] = PlainDataset(dev_dataset, dev_labels)

    dataset_sizes = { x: len(datasets[x]) for x in ['train', 'val'] }

    dataloaders = {
        x: DataLoader(datasets[x], batch_size=4, shuffle=True, num_workers=4) for x in ['train', 'val']
    }
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = train(device, dataloaders, dataset_sizes, vocab_size)
    return model

In [12]:
model = main()

Train the model
Epoch 0/24
----------
train Loss: 0.5332 Acc: 0.8235
val Loss: 0.4839 Acc: 0.8684
Epoch 1/24
----------
train Loss: 0.4695 Acc: 0.8828
val Loss: 0.4549 Acc: 0.8932
Epoch 2/24
----------
train Loss: 0.4479 Acc: 0.9003
val Loss: 0.4404 Acc: 0.9055
Epoch 3/24
----------
train Loss: 0.4352 Acc: 0.9102
val Loss: 0.4305 Acc: 0.9107
Epoch 4/24
----------
train Loss: 0.4264 Acc: 0.9175
val Loss: 0.4235 Acc: 0.9144
Epoch 5/24
----------
train Loss: 0.4199 Acc: 0.9216
val Loss: 0.4181 Acc: 0.9188
Epoch 6/24
----------
train Loss: 0.4147 Acc: 0.9246
val Loss: 0.4138 Acc: 0.9201
Epoch 7/24
----------
train Loss: 0.4119 Acc: 0.9267
val Loss: 0.4135 Acc: 0.9222
Epoch 8/24
----------
train Loss: 0.4116 Acc: 0.9274
val Loss: 0.4133 Acc: 0.9224
Epoch 9/24
----------
train Loss: 0.4114 Acc: 0.9279
val Loss: 0.4131 Acc: 0.9225
Epoch 10/24
----------
train Loss: 0.4112 Acc: 0.9282
val Loss: 0.4129 Acc: 0.9225
Epoch 11/24
----------
train Loss: 0.4110 Acc: 0.9282
val Loss: 0.4127 Acc: 0.922

In [17]:
outputs = model(test_dataset)
_, preds = torch.max(outputs, 1)

In [22]:
correct = preds.numpy() == test_labels.numpy()
print(np.sum(correct)/correct.size)

0.9236


In [35]:
## Run the model on the unlabelled set
unlabelled = read_data('../data/unlabelled.txt', labelled=False)
bow = create_bag_of_words(unlabelled, word2idx=word2idx)
outputs = model(bow)
_,preds = torch.max(outputs, 1)

## Save to file
f = open('predictions_qONE_FINAL.txt', 'w')
output_string = '\n'.join(str(s) for s in preds.numpy())
f.write(output_string+'\n')

20000

In [36]:
test_test,labels_test = read_data('../data/test.txt')
bow = create_bag_of_words(test_test, word2idx=word2idx)
outputs = model(bow)
_,preds = torch.max(outputs, 1)

20000

In [31]:
f = open('test_out.txt','r')
text = f.read()
saved_data = [int(c) for c in text.split('\n')[:-1]]

In [34]:
correct = np.array(saved_data) == labels_test
print(np.sum(correct)/correct.size)

0.9236
