In [1]:
import os
# NOTE(review): hard-coded, user-specific absolute working directory. Every
# relative path used later in the notebook (the review pickles that are read
# and the checkpoint files that are written) resolves against it.
os.chdir('/scratch/sagarsj42')

In [2]:
import time
import pickle

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

In [3]:
class CBOW_Dataset(Dataset):
    """CBOW training samples built from pickled, tokenised reviews.

    Each sample is a pair ``(context_vector, target)``:

    * ``context_vector`` - long tensor with the vocabulary indices of the
      ``window_size`` words on each side of the centre word
      (``2 * window_size`` entries, so samples collate into a fixed-width batch),
    * ``target`` - long tensor of shape ``(1,)`` with the centre word's index.

    Args:
        n_files: how many ``review_words-<k>.pkl`` files (k = 1..n_files) to
            read from the current working directory.
        max_reviews: how many of the loaded reviews are turned into training
            pairs; ``None`` uses all of them. The default of 1 reproduces the
            original notebook run. The vocabulary is always compiled from
            *all* loaded reviews, regardless of this cap.
        window_size: number of context words taken on each side of the target.

    Raises:
        ValueError: if ``window_size`` is smaller than 1.
    """

    # Class-level defaults mirror the constructor defaults so that helper
    # methods also work on instances created without running __init__.
    n_files = 1
    max_reviews = 1
    window_size = 3
    unk_token = '<UNK>'

    def __init__(self, n_files=1, max_reviews=1, window_size=3):
        super(CBOW_Dataset, self).__init__()

        if window_size < 1:
            raise ValueError('window_size must be at least 1')

        self.n_files = n_files
        self.max_reviews = max_reviews
        self.window_size = window_size

        self.reviews = list()
        self.grouped_reviews = list()
        self.data_pairs = list()
        # The unknown-word token is registered first so it gets index 0.
        self.word_count = {self.unk_token: 0}
        self.word_list = list()
        self.word_to_index = dict()
        self.vocab_size = 0

        self.load_reviews()
        self.group_reviews()
        self.prepare_data_pairs()
        self.compile_vocab()

    def __len__(self):
        """Number of (context, target) pairs."""
        return len(self.data_pairs)

    def __getitem__(self, index):
        """Return the encoded ``(context_vector, target)`` pair at ``index``."""
        context_words, word = self.data_pairs[index][0], self.data_pairs[index][1]
        context_vector = self.encode_words(context_words)
        target = self.encode_word(word)

        return (context_vector, target)

    def load_reviews(self):
        """Append the contents of the review pickle files to ``self.reviews``."""
        start = time.time()

        for file_no in range(1, self.n_files + 1):
            filename = 'review_words-' + str(file_no) + '.pkl'
            print('Opening', filename, end='  ')
            # NOTE(review): pickle.load can execute arbitrary code; only use
            # it on review files that come from a trusted source.
            with open(filename, 'rb') as f:
                reviews_set = pickle.load(f)
            print('Contains', len(reviews_set), 'entries')
            self.reviews.extend(reviews_set)

        end = time.time()
        print('Load data', 'Time taken:', end - start)
        print('No. of reviews:', len(self.reviews))

    def group_reviews(self):
        """Flatten each review's sentences into one word list.

        Only the first ``max_reviews`` reviews are considered, and a review is
        kept only if it is long enough to yield at least one full window.
        """
        start = time.time()

        for review in self.reviews[:self.max_reviews]:
            review_words = [word for sentence in review for word in sentence]
            if len(review_words) > 2 * self.window_size:
                self.grouped_reviews.append(review_words)

        end = time.time()
        print('Grouping reviews', 'Time taken:', end - start)
        print('No. of grouped reviews:', len(self.grouped_reviews))

    def prepare_data_pairs(self):
        """Create one (context words, target word) pair per full window.

        Positions closer than ``window_size`` to either end of a review are
        skipped, so every context has exactly ``2 * window_size`` words.
        """
        start = time.time()
        width = self.window_size
        for review in self.grouped_reviews:
            for ind in range(width, len(review) - width):
                left = list(review[ind - width:ind])
                right = list(review[ind + 1:ind + width + 1])
                self.data_pairs.append((left + right, review[ind]))
        end = time.time()
        print('Preparing data pairs', 'Time taken:', end - start)
        print('No. of data pairs:', len(self.data_pairs))

    def compile_vocab(self):
        """Count every word of every loaded review and build the vocabulary.

        Fills ``word_count``, ``word_list`` (index -> word, in first-seen
        order), ``word_to_index`` (word -> index) and ``vocab_size``.
        """
        start = time.time()

        for review in self.reviews:
            for sentence in review:
                for word in sentence:
                    self.word_count[word] = self.word_count.get(word, 0) + 1

        self.word_list = list(self.word_count.keys())
        self.word_to_index = {word: i for i, word in enumerate(self.word_list)}
        self.vocab_size = len(self.word_list)
        print('Vocab size:', self.vocab_size)
        end = time.time()
        print('Preparing vocab', 'Time taken:', end - start)

    def _index_of(self, word):
        """Vocabulary index of ``word``, falling back to the unknown token.

        Uses a dict so a lookup does not scan the whole vocabulary. The map is
        (re)built from ``word_list`` when it is missing or out of date.

        Raises:
            ValueError: if ``word`` is unknown and the vocabulary has no
                unknown-word token either.
        """
        lookup = getattr(self, 'word_to_index', None)
        if lookup is None or len(lookup) != len(self.word_list):
            lookup = {}
            for position, known_word in enumerate(self.word_list):
                # setdefault keeps the first occurrence, like list.index.
                lookup.setdefault(known_word, position)
            self.word_to_index = lookup

        if word in lookup:
            return lookup[word]
        if self.unk_token in lookup:
            return lookup[self.unk_token]
        raise ValueError(repr(word) + ' is not in the vocabulary')

    def encode_word(self, word):
        """Return a long tensor of shape ``(1,)`` holding the index of ``word``."""
        return torch.tensor([self._index_of(word)], dtype=torch.long)

    def encode_words(self, words):
        """Return a long tensor with one vocabulary index per entry of ``words``."""
        indices = [self._index_of(word) for word in words]
        return torch.tensor(indices, dtype=torch.long)

    def decode_word(self, index):
        """Return the word stored at ``index`` (a one-element tensor or a plain int)."""
        position = index.item() if hasattr(index, 'item') else index
        return self.word_list[position]

In [4]:
class CBOW(nn.Module):
    """Continuous bag-of-words model.

    The embeddings of the context words are averaged, projected onto the
    vocabulary by a linear layer and turned into log-probabilities, so the
    output is meant to be paired with ``nn.NLLLoss``.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output classes.
        embedding_dim: size of each word vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super(CBOW, self).__init__()

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)
        self.activation_function = nn.LogSoftmax(dim=-1)

    def forward(self, inputs):
        """Return log-probabilities over the vocabulary for each sample.

        Args:
            inputs: long tensor of word indices, shape ``(batch, context)``.

        Returns:
            Tensor of shape ``(batch, vocab_size)`` with log-probabilities.
        """
        embedded = self.embeddings(inputs)
        # Average over the context dimension; the view keeps one row per sample.
        pooled = embedded.mean(dim=1).view(embedded.shape[0], -1)
        return self.activation_function(self.linear(pooled))

    def get_embedding(self, word, vocab):
        """Return the embedding of ``word`` as a ``(1, embedding_dim)`` tensor.

        Args:
            word: the word to look up.
            vocab: list mapping index -> word, in the order used for training.

        Raises:
            ValueError: if ``word`` is not in ``vocab`` (from ``list.index``).
        """
        # Build the index on the same device as the embedding table so the
        # lookup also works after the model has been moved to the GPU.
        device = self.embeddings.weight.device
        word_ind = torch.tensor(vocab.index(word), dtype=torch.long, device=device)
        return self.embeddings(word_ind).view(1, -1)

In [5]:
# Build the dataset. The constructor loads the pickled reviews, groups them,
# derives the (context, target) pairs and compiles the vocabulary, printing
# timing and size information for each stage.
dataset = CBOW_Dataset()
dataset

Opening review_words-1.pkl  Contains 169913 entries
Load data Time taken: 4.9263622760772705
No. of reviews: 169913
Grouping reviews Time taken: 1.33514404296875e-05
No. of grouped reviews: 1
Preparing data pairs Time taken: 0.00018787384033203125
No. of data pairs: 139
Vocab size: 33794
Preparing vocab Time taken: 4.5156495571136475


<__main__.CBOW_Dataset at 0x7f2716c6ddf0>

In [6]:
# Serve the samples in mini-batches of up to 1024, in dataset order.
# NOTE(review): shuffle=False keeps a fixed sample order for every epoch;
# confirm this is intended, since SGD training usually shuffles.
dataloader = DataLoader(dataset, batch_size=1024, shuffle=False)
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f2716c4e160>

In [7]:
# Constructor arguments are kept in a dict so they can be stored inside the
# checkpoints and used to rebuild the model when loading.
model_args = dict(vocab_size=dataset.vocab_size, embedding_dim=750)
model = CBOW(**model_args)
model = model.cuda()

# CBOW.forward emits log-probabilities, hence negative log-likelihood loss.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.0001)

# Loss history: one entry per mini-batch and one per epoch.
mbatch_losses, epoch_losses = [], []

In [8]:
n_epochs = 3


def save_checkpoint(path):
    """Save vocabulary, model arguments, weights and loss history to ``path``.

    Single definition of the checkpoint payload, so the periodic and the
    end-of-epoch checkpoints cannot drift apart.
    """
    torch.save({
        'vocab': dataset.word_list,
        'model_args': model_args,
        'state_dict': model.state_dict(),
        'n_epochs': n_epochs,
        'mini_batch_losses': mbatch_losses,
        'epoch_losses': epoch_losses
    }, path)


# Make the training mode explicit in case the model was put in eval mode earlier.
model.train()

for epoch in range(n_epochs):
    print('Epoch', epoch)
    total_loss = 0.0

    for i, (context_vector, target) in enumerate(dataloader):
        if i % 100 == 0:
            print('\tStep', i)
            # Periodic checkpoint every 100000 steps (never at step 0).
            if i % 100000 == 0 and i > 0:
                save_checkpoint('checkpoint-'+str(epoch)+'-'+str(i)+'.pt')

        context_vector = context_vector.cuda()
        # Targets arrive as (batch, 1); NLLLoss expects a flat (batch,) tensor.
        target = target.view(-1).cuda()
        model.zero_grad()

        log_probs = model(context_vector)
        loss = loss_function(log_probs, target)
        loss.backward()
        optimizer.step()

        # Read the scalar once; each .item() call forces a device-to-host sync.
        batch_loss = loss.item()
        total_loss += batch_loss
        mbatch_losses.append(batch_loss)

    # total_loss is the sum of the per-batch mean losses of this epoch.
    epoch_losses.append(total_loss)

    save_checkpoint('checkpoint-'+str(epoch)+'.pt')

    print('Total epoch loss', total_loss)

Epoch 0
	Step 0
Total epoch loss 10.417192459106445
Epoch 1
	Step 0
Total epoch loss 10.417078971862793
Epoch 2
	Step 0
Total epoch loss 10.416964530944824


In [9]:
# Bundle everything needed to restore the model and inspect the training
# history, then write it out as the final checkpoint.
final_checkpoint = dict(
    vocab=dataset.word_list,
    model_args=model_args,
    state_dict=model.state_dict(),
    n_epochs=n_epochs,
    mini_batch_losses=mbatch_losses,
    epoch_losses=epoch_losses,
)
torch.save(final_checkpoint, 'final-checkpoint.pt')

In [10]:
# The weights were saved from a CUDA model; map_location='cpu' lets the
# checkpoint be restored on machines without a GPU as well. The model rebuilt
# below lives on the CPU, so this is also where the tensors are needed.
checkpoint = torch.load('final-checkpoint.pt', map_location='cpu')

vocab = checkpoint['vocab']
print('Vocab size', len(vocab))
print('Samples', vocab[:5])

# Rebuild the network from the stored constructor arguments.
# NOTE: this rebinds `model`, `model_args`, `n_epochs` and `epoch_losses`
# from the training cells; re-run those cells before training again.
model_args = checkpoint['model_args']
model = CBOW(**model_args)
print('Model initialized', model)

model.load_state_dict(checkpoint['state_dict'])
model.eval()
print('Model loaded with trained weights', model.state_dict())

n_epochs = checkpoint['n_epochs']
print('Epochs', n_epochs)

mini_batch_losses = checkpoint['mini_batch_losses']
print('No. of mini-batch losses', len(mini_batch_losses))
print('Samples', mini_batch_losses[:5])

epoch_losses = checkpoint['epoch_losses']
print('No. of epoch losses', len(epoch_losses))
print('Samples', epoch_losses[:5])

Vocab size 33794
Samples ['<UNK>', 'we', 'got', 'this', 'for']
Model initialized CBOW(
  (embeddings): Embedding(33794, 750)
  (linear): Linear(in_features=750, out_features=33794, bias=True)
  (activation_function): LogSoftmax(dim=-1)
)
Model loaded with trained weights OrderedDict([('embeddings.weight', tensor([[ 1.8629,  0.3649, -1.2000,  ..., -0.8849,  0.3445,  0.7882],
        [-0.4234,  0.1914, -0.0266,  ..., -0.5505,  0.3514, -1.7271],
        [ 0.4854, -0.2969, -0.9645,  ...,  1.0155, -0.1487, -0.6318],
        ...,
        [ 1.1696,  0.4288, -0.1578,  ..., -0.2005, -1.1478, -0.4725],
        [ 0.0512,  1.9135,  1.0105,  ..., -0.1014, -0.4852,  1.3978],
        [ 0.4521,  0.6331,  0.6027,  ..., -0.1792, -0.6109, -0.4757]])), ('linear.weight', tensor([[ 0.0273, -0.0003,  0.0269,  ...,  0.0087,  0.0031,  0.0293],
        [ 0.0279,  0.0283, -0.0211,  ...,  0.0075,  0.0206, -0.0111],
        [-0.0235,  0.0182,  0.0141,  ...,  0.0308,  0.0346,  0.0209],
        ...,
        [ 0.0143

In [11]:
# Look up the learned vector for the word 'by'. get_embedding returns it as a
# (1, embedding_dim) tensor; detach() separates it from the autograd graph.
embed = model.get_embedding('by', vocab).detach()
print(embed.shape)
embed

torch.Size([1, 750])


tensor([[-7.8090e-02, -2.7658e+00,  3.6753e-01,  3.1086e-01,  7.9372e-01,
         -1.4315e+00, -9.5732e-01,  6.1610e-01, -7.3930e-01, -9.3644e-01,
         -1.2328e-01, -2.5524e+00,  5.4905e-01,  8.0451e-01, -7.1114e-01,
         -9.2626e-01,  6.0144e-01,  1.9739e-01,  1.7654e+00, -1.8216e-01,
          1.5873e+00, -1.7611e+00, -3.8222e-01,  3.5357e-01, -2.2711e-01,
         -7.4580e-01, -9.3559e-02, -3.8042e-02,  7.6754e-01,  3.3477e-01,
         -3.4877e-03,  3.8403e-02,  1.1208e+00, -2.8794e+00, -3.6962e-01,
          1.4453e+00,  7.9496e-01, -5.3998e-01,  1.4459e-01,  1.8286e-01,
         -1.0667e+00,  1.2158e+00, -3.0689e+00, -1.0771e+00,  1.0395e+00,
          6.2042e-01, -1.1133e+00, -8.1562e-02, -5.3052e-01, -1.1428e+00,
          2.8851e-01, -9.6606e-01,  1.0011e+00, -1.1040e+00,  7.9581e-02,
         -3.3222e-01,  1.6657e-01,  7.5044e-01, -2.1435e-01,  1.3358e+00,
         -1.9062e+00,  6.4215e-01, -9.3439e-01,  1.4961e-01, -3.9431e-01,
         -1.5163e+00, -9.5255e-01,  1.