In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
import matplotlib.pyplot as plt
import time
import os
import copy
from dataset import ReviewsDataset
from torch.utils.data import DataLoader
from bow import BagOfWordsClassifier
import pickle

def train_model(device, dataloaders, dataset_sizes, model, criterion, optimizer, scheduler, num_epochs=25):
    """Train ``model`` and return it carrying its best validation weights.

    Every epoch runs a ``'train'`` pass followed by a ``'val'`` pass. Whenever the
    validation accuracy strictly improves, the state dict is snapshotted in memory
    and pickled to ``model.pkl`` in the current working directory. With
    ``num_epochs=0`` nothing is trained: the model is scored once on
    ``dataloaders['val']`` and returned as-is.

    Args:
        device: torch device each batch is moved to.
        dataloaders: mapping with ``'train'`` and ``'val'`` iterables that yield
            ``(inputs, labels)`` batches (only ``'val'`` is read when
            ``num_epochs == 0``).
        dataset_sizes: mapping from phase name to the number of examples, used as
            the denominator for the reported loss and accuracy.
        model: module to optimise; it is modified in place.
        criterion: loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: optimizer that updates ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped once per epoch.
        num_epochs: number of train/val epochs to run.

    Returns:
        The same ``model`` object; when ``num_epochs > 0`` it is loaded with the
        weights of the best validation epoch.
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0

    # Evaluation-only mode: no training requested, just score the validation set.
    if num_epochs == 0:
        model.eval()
        running_corrects = 0

        # No gradients are needed for scoring; skip building autograd graphs.
        with torch.no_grad():
            for inputs, labels in dataloaders['val']:
                inputs = inputs.to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                _, preds = torch.max(outputs, 1)

                # Accumulate as a plain int so an empty loader still yields a number.
                running_corrects += int(torch.sum(preds == labels))

        best_acc = running_corrects / dataset_sizes['val']

    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Each epoch has a training and a validation phase.
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            if is_train:
                model.train()
            else:
                model.eval()

            running_loss = 0.0
            running_corrects = 0

            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)

                if is_train:
                    optimizer.zero_grad()

                # Track autograd history only while training.
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)

                    if is_train:
                        loss.backward()
                        optimizer.step()

                # Weighting by batch size assumes `criterion` returns a per-batch
                # mean — TODO confirm if a different reduction is used.
                running_loss += loss.item() * inputs.size(0)
                running_corrects += int(torch.sum(preds == labels))

            # Step the schedule only after this epoch's optimizer updates; stepping
            # before them skips the first value of the learning-rate schedule.
            if is_train:
                scheduler.step()

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(
                phase, epoch_loss, epoch_acc))

            # Keep (and persist) the weights of the best validation epoch so far.
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
                with open('model.pkl', 'wb') as f:
                    pickle.dump(best_model_wts, f)

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))

    # Restore the best weights seen during training.
    if num_epochs > 0:
        model.load_state_dict(best_model_wts)
    return model

def train(device, dataloaders, dataset_sizes, vocab_size, num_epochs=25, lr=0.1, step_size=7, gamma=0.05):
    """Build a two-output bag-of-words classifier and train it with SGD.

    Args:
        device: torch device the model is moved to.
        dataloaders: mapping of phase name to batch iterable, forwarded to
            ``train_model``.
        dataset_sizes: mapping of phase name to example count, forwarded to
            ``train_model``.
        vocab_size: second argument passed to ``BagOfWordsClassifier``.
        num_epochs: number of training epochs.
        lr: initial SGD learning rate.
        step_size: number of epochs between learning-rate decays.
        gamma: multiplicative learning-rate decay factor.

    Returns:
        The model returned by ``train_model`` after training.
    """
    model = BagOfWordsClassifier(2, vocab_size)
    model = model.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Multiply the learning rate by `gamma` every `step_size` epochs.
    exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    print('Train the model')
    model = train_model(device, dataloaders, dataset_sizes, model, criterion, optimizer,
                        exp_lr_scheduler, num_epochs=num_epochs)

    # Zero-epoch call: scores the returned model once on the validation loader
    # and prints the summary, without any further training.
    train_model(device, dataloaders, dataset_sizes, model, criterion, optimizer,
                exp_lr_scheduler, num_epochs=0)
    return model

def main():
    """Load the train/dev review splits, wrap them in loaders and train a model.

    The dev split is built with the training split's ``word2idx`` mapping.

    Returns:
        The model returned by ``train``.
    """
    train_split = ReviewsDataset('../data/train.txt')
    val_split = ReviewsDataset('../data/dev.txt', train=False, word2idx=train_split.word2idx)
    splits = {'train': train_split, 'val': val_split}

    dataset_sizes = {name: len(split) for name, split in splits.items()}
    vocab_size = len(train_split.word2idx)

    dataloaders = {
        name: DataLoader(split, batch_size=5, shuffle=True, num_workers=4)
        for name, split in splits.items()
    }

    # Prefer the first CUDA device when one is available.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    return train(device, dataloaders, dataset_sizes, vocab_size)

# Run the full pipeline and bind the result at module scope as `model`,
# which the following cells use for inspection and inference.
if __name__== "__main__":
    model = main()


Train the model
Epoch 0/24
----------
train Loss: 0.5434 Acc: 0.8133
val Loss: 0.4931 Acc: 0.8586
Epoch 1/24
----------
train Loss: 0.4790 Acc: 0.8751
val Loss: 0.4637 Acc: 0.8869
Epoch 2/24
----------
train Loss: 0.4565 Acc: 0.8933
val Loss: 0.4483 Acc: 0.8965
Epoch 3/24
----------
train Loss: 0.4432 Acc: 0.9044
val Loss: 0.4379 Acc: 0.9041
Epoch 4/24
----------
train Loss: 0.4340 Acc: 0.9114
val Loss: 0.4305 Acc: 0.9115
Epoch 5/24
----------
train Loss: 0.4271 Acc: 0.9162
val Loss: 0.4248 Acc: 0.9137
Epoch 6/24
----------
train Loss: 0.4215 Acc: 0.9207
val Loss: 0.4201 Acc: 0.9182
Epoch 7/24
----------
train Loss: 0.4186 Acc: 0.9226
val Loss: 0.4199 Acc: 0.9182
Epoch 8/24
----------
train Loss: 0.4183 Acc: 0.9229
val Loss: 0.4196 Acc: 0.9185
Epoch 9/24
----------
train Loss: 0.4181 Acc: 0.9231
val Loss: 0.4194 Acc: 0.9186
Epoch 10/24
----------
train Loss: 0.4179 Acc: 0.9232
val Loss: 0.4192 Acc: 0.9185
Epoch 11/24
----------
train Loss: 0.4177 Acc: 0.9235
val Loss: 0.4190 Acc: 0.918

In [4]:
# Display the trained model's module structure.
model

BagOfWordsClassifier(
  (linear): Linear(in_features=7639, out_features=2, bias=True)
)

In [5]:
# Rebuild the train/dev splits at notebook scope (the ones created inside
# main() are local to it) so the cells below can index `datasets` directly.
# These loaders use batch_size=4.
train_split = ReviewsDataset('../data/train.txt')
datasets = {
    'train': train_split,
    'val': ReviewsDataset('../data/dev.txt', train=False, word2idx=train_split.word2idx),
}

dataset_sizes = {name: len(split) for name, split in datasets.items()}

vocab_size = len(train_split.word2idx)

dataloaders = {
    name: DataLoader(split, batch_size=4, shuffle=True, num_workers=4)
    for name, split in datasets.items()
}

In [6]:
# Score the whole validation split in one forward pass. Inference only:
# use eval mode, skip autograd, and run on whichever device the model lives on.
model.eval()
with torch.no_grad():
    model_device = next(model.parameters()).device
    outputs = model(datasets['val'].data.to(model_device))
# Predicted class = index of the largest output; moved to CPU so the result
# can be converted with .numpy() afterwards.
preds = outputs.argmax(dim=1).cpu()

In [7]:
# Validation accuracy: fraction of predictions equal to the stored labels.
val_labels = datasets['val'].labels
val_hits = preds.numpy() == val_labels.numpy()
val_hits.sum() / len(val_labels)

0.919

In [8]:
# Build the test split, encoded with the training split's word2idx mapping.
test = ReviewsDataset('../data/test.txt', train=False, word2idx=datasets['train'].word2idx)

In [9]:
# Score the whole test split in one forward pass. Inference only:
# use eval mode, skip autograd, and run on whichever device the model lives on.
model.eval()
with torch.no_grad():
    model_device = next(model.parameters()).device
    outputs = model(test.data.to(model_device))
# Predicted class = index of the largest output; moved to CPU so the result
# can be converted with .numpy() afterwards.
preds = outputs.argmax(dim=1).cpu()

In [10]:
# Test accuracy: fraction of predictions equal to the stored labels.
test_labels = test.labels
test_hits = preds.numpy() == test_labels.numpy()
test_hits.sum() / len(test_labels)

0.9197

In [12]:
import numpy as np
import torch
from torch.utils.data import Dataset

def read_data(path, labelled=True):
    """Read a text file with one space-separated example per line.

    Args:
        path: path of the file to read.
        labelled: when True, the first token of every line is parsed as an
            integer label and the remaining tokens form the example.

    Returns:
        ``(data, labels)`` when ``labelled`` is True, where ``data`` is a list
        of token lists and ``labels`` is a NumPy integer array; otherwise just
        the list of token lists (the first token is kept).

    Raises:
        ValueError: if ``labelled`` is True and a line's first token is not an
            integer (this includes blank lines).
    """
    # The context manager closes the handle even if reading fails.
    with open(path, 'r') as f:
        text = f.read()

    lines = text.split('\n')
    # Drop only the empty string produced by a trailing newline, so a file
    # that does not end with '\n' keeps its last example.
    if lines and lines[-1] == '':
        lines.pop()

    examples = [line.split(' ') for line in lines]
    if not labelled:
        return examples

    labels = [int(tokens[0]) for tokens in examples]
    data = [tokens[1:] for tokens in examples]
    return data, np.array(labels)

def create_vocab(data):
    """Build the vocabulary of a tokenised corpus.

    Args:
        data: iterable of token lists.

    Returns:
        ``(unique, word2idx)``: the list of distinct tokens in sorted order and
        a dict mapping each token to its position in that list.
    """
    # Sorting makes the index assignment deterministic. Plain set iteration
    # order depends on string hash randomisation, so indices would change
    # between interpreter sessions and no longer match previously saved weights.
    unique = sorted({word for line in data for word in line})
    word2idx = {word: idx for idx, word in enumerate(unique)}
    return unique, word2idx

def create_bag_of_words(data, word2idx=None):
    """Convert token lists into a dense matrix of word counts.

    Args:
        data: sequence of token lists, one per example.
        word2idx: dict mapping a token to its column index. Required.

    Returns:
        A float NumPy array of shape ``(len(data), len(word2idx))`` where entry
        ``[i, j]`` is how often the token with index ``j`` occurs in example
        ``i``. Tokens missing from ``word2idx`` are ignored.

    Raises:
        ValueError: if ``word2idx`` is None.
    """
    if word2idx is None:
        raise ValueError('create_bag_of_words need a word2idx mapping!')

    bag_of_words = np.zeros((len(data), len(word2idx)))
    for row, tokens in enumerate(data):
        for word in tokens:
            col = word2idx.get(word)
            # Out-of-vocabulary tokens contribute nothing.
            if col is not None:
                bag_of_words[row, col] += 1
    return bag_of_words


In [None]:
# NOTE(review): this cell reads like an excerpt from the body of the
# ReviewsDataset constructor — TODO confirm. It cannot run on its own: `self`
# and `path` are not defined at notebook scope, and `train` there refers to
# the training function defined earlier in this notebook.
data, labels = read_data(path)

if train:
    vocab,word2idx = create_vocab(data)
elif word2idx is None:
    # `Error` is not a defined name; raise a real built-in exception instead.
    raise ValueError('Vocab must be provided for non-training data in ReviewsDataset')

self.word2idx = word2idx

# Encode the examples as count vectors and store them as a float tensor.
data = create_bag_of_words(data, word2idx=word2idx)
self.data = torch.tensor(data).float()

In [None]:
# NOTE(review): the read_data helper shown in this notebook parses the first
# token of every line as an integer label unless called with labelled=False;
# whether ReviewsDataset handles unlabelled files is not visible here —
# TODO confirm before running. This also rebinds `test` from the earlier cell.
test = ReviewsDataset('../data/unlabelled.txt', train=False, word2idx=datasets['train'].word2idx)