In [None]:
!pip install -i https://test.pypi.org/simple/ adawat

In [None]:
import gzip
import itertools
from adawat.nlp import Corpus

# Location of the plain-text corpus file that is opened and passed to Corpus below.
corpus_path='/home/rafid/Downloads/wiki-sents/wiki-sents.txt'
# Alternative corpus locations (uncomment one to switch):
#corpus_path='../input/wikipedia-sentences/wikisent2.txt'
#corpus_path='./text8'

def to_lowercase(line: str) -> str:
    """Return ``line`` with every cased character converted to lowercase."""
    lowered = line.lower()
    return lowered

#MAX_LINES = 100_000
#MAX_VOCAB = 1_000
#corpus = Corpus(corpus_path, MAX_LINES, MAX_VOCAB, preprocessor=to_lowercase)

#MAX_VOCAB = 30_000

#with gzip.open("/home/rafid/Workspace/tmikolov-word2vec/dataset/news.2012.en.shuffled.normalized.gz") as f:
#    lines_as_string = map(lambda l: str(l), f)
#    corpus = Corpus(lines_as_string, min_word_freq=5, force_init=True)
# Open the corpus with an explicit encoding so that decoding does not depend on
# the platform's default locale encoding.
with open(corpus_path, "r", encoding="utf-8") as f:
    corpus = Corpus(f, min_word_freq=5, force_init=True)

# Cell output: size of the corpus' ``tokens`` collection and its vocabulary size.
len(corpus.tokens), corpus.vocab_size

In [None]:
from typing import Tuple
import itertools
from adawat.utils import GeneratorWithLen
from functools import partial

# Number of tokens taken on each side of the target word to form its context.
CONTEXT_RADIUS = 5

def build_training_data(corpus: Corpus, context_radius=2) -> Tuple:
    """Build the (context, target) training streams for a CBOW model.

    For every line in ``corpus.tokens_per_line`` and every position ``i`` that has
    ``context_radius`` tokens available on both sides, one example is produced:
    the feature is the ``context_radius`` tokens before ``i`` concatenated with the
    ``context_radius`` tokens after it, and the target is ``tokens[i]``.

    Args:
        corpus: object exposing ``tokens_per_line``; it is iterated once here to
            count the examples and again by each generator function.
        context_radius: number of tokens taken on each side of the target.

    Returns:
        A ``(features, targets)`` tuple of ``GeneratorWithLen`` objects that are
        both given the same example count.
    """

    def center_positions(tokens, radius):
        # Positions that have a full window of ``radius`` tokens on both sides.
        return range(radius, len(tokens) - radius)

    # Lines shorter than a full window produce no examples. Clamp their
    # contribution to zero: the plain difference is negative for such lines and
    # would make the reported length smaller than what the generators yield.
    length = sum(
        max(0, len(tokens) - 2 * context_radius)
        for tokens in corpus.tokens_per_line
    )

    def build_features(corpus: Corpus, context_radius=2):
        for tokens in corpus.tokens_per_line:
            for i in center_positions(tokens, context_radius):
                yield tokens[i - context_radius:i] + tokens[i + 1: i + context_radius + 1]

    def build_targets(corpus: Corpus, context_radius=2):
        for tokens in corpus.tokens_per_line:
            for i in center_positions(tokens, context_radius):
                yield tokens[i]

    return (GeneratorWithLen(partial(build_features, corpus, context_radius), length),
        GeneratorWithLen(partial(build_targets, corpus, context_radius), length))


features, targets = build_training_data(corpus, context_radius=CONTEXT_RADIUS)

# Cell output: the first three (context, target) pairs, as a sanity check.
[pair for pair in zip(itertools.islice(features, 3), itertools.islice(targets, 3))]


In [None]:
import torch
from torch import nn
from torch.nn import functional as F


# Size of each embedding vector; passed as ``embedding_dim`` when the CBOW model is built.
EMBEDDING_DIM = 300

class CBOW(nn.Module):
    """Feed-forward CBOW-style model that scores a target word from its context.

    The ``2 * context_radius`` context indices are embedded, the embeddings are
    concatenated (flattened) into one vector per example, passed through one
    ReLU hidden layer, and projected to log-probabilities over the vocabulary.

    Args:
        vocab_size: number of rows in the embedding table and size of the output.
        context_radius: number of context words on each side of the target; the
            model expects ``2 * context_radius`` indices per example.
        embedding_dim: size of each embedding vector.
        hidden_dim: width of the hidden layer (defaults to 512).
    """

    def __init__(self, vocab_size, context_radius, embedding_dim, hidden_dim=512):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(2 * context_radius * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input):
        """Return log-probabilities of shape ``(batch, vocab_size)``.

        ``input`` holds word indices with ``2 * context_radius`` entries per row.
        """
        embedded = self.embedding(input)
        # Concatenate the per-word embeddings of each example into a single row.
        flattened = torch.flatten(embedded, 1)
        hidden = F.relu(self.linear1(flattened))
        logits = self.linear2(hidden)

        return F.log_softmax(logits, dim=1)


def model_func(vocab_size, context_radius, embedding_dim):
    """
    Return a zero-argument factory that builds a fresh CBOW instance with the
    given hyper-parameters, for use with the ModelTrainer class.
    """
    return lambda: CBOW(vocab_size, context_radius, embedding_dim)


def test_model():
    """Smoke-test the model: run one dummy batch through it and print the result."""
    window = 2 * CONTEXT_RADIUS
    build = model_func(corpus.vocab_size, CONTEXT_RADIUS, EMBEDDING_DIM)
    net = build()
    # Ten identical rows, each holding the indices 0 .. window - 1.
    batch = torch.tensor([list(range(window)) for _ in range(10)])
    output = net(batch)
    print(output)
    print(output.shape)


test_model()


In [None]:
import torch

# Choose the compute device once; later cells read ``device``.
device_name, message = (
    ("cuda:0", "CUDA is available! Using it.")
    if torch.cuda.is_available()
    else ("cpu", "CUDA is not available :'( Using CPU.")
)
print(message)
device = torch.device(device_name)


In [None]:
# Python imports
from itertools import islice

# PyTorch imports
from torch.utils.data import DataLoader

# 3rd party imports
from adawat.data import IterableDataset
from adawat.transforms import Compose, WordToIndex, WordsToIndices, \
    WordToOneHot, ToPyTorchTensor


# Number of examples per batch handed to the DataLoader.
BATCH_SIZE = 1000

# Features: context words -> (presumably) vocabulary indices -> long tensor on ``device``.
feature_transform = Compose(
    WordsToIndices(corpus.word2idx),
    ToPyTorchTensor(dtype=torch.long, device=device),
)
# Targets: a single word -> (presumably) its vocabulary index -> long tensor on ``device``.
# (WordToOneHot(corpus.word2idx, corpus.vocab_size) is the one-hot alternative.)
target_transform = Compose(
    WordToIndex(corpus.word2idx),
    ToPyTorchTensor(dtype=torch.long, device=device),
)

dataset = IterableDataset(
    features,
    targets,
    feature_transform,
    target_transform,
    len(features),  # this should be the same as len(targets)
)
data_loader = DataLoader(dataset, batch_size=BATCH_SIZE)

In [None]:
# Configure logging
def configure_logging():
    """Send all log records (DEBUG and above) from the root logger to one stream handler.

    Any handlers already attached to the root logger are replaced, so calling this
    repeatedly does not produce duplicated log lines.
    """
    import logging

    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    # Assign rather than append so that the list always holds exactly this handler.
    root_logger.handlers = [console]
configure_logging()


In [None]:
# Python imports
import random

import numpy as np

# PyTorch imports
from torch import optim

# Other imports
from adawat.models import ModelTrainer


def closest_vector(vector, vectors):
    """Return the row index of the second-nearest row of ``vectors`` to ``vector``.

    Rows are ranked by squared Euclidean distance, with ties broken by the lower
    row index. The rank-0 row is skipped on the assumption that ``vector`` is
    itself one of the rows (distance 0), so the result is its nearest *other* row.

    Args:
        vector: 1-D array-like query vector.
        vectors: 2-D array-like with one candidate vector per row.

    Returns:
        The row index as a built-in ``int``.

    Raises:
        IndexError: if ``vectors`` has fewer than two rows.
    """
    vectors = np.asarray(vectors)
    dist_2 = np.sum((vectors - vector)**2, axis=1)
    # A stable argsort orders by (distance, row index), exactly like sorting
    # (distance, index) tuples, but without building Python objects per row.
    order = np.argsort(dist_2, kind="stable")
    return int(order[1])


def get_word_embedding(embedding_matrix, word, word2idx):
    """Look up ``word`` through the ``word2idx`` callable and return
    ``embedding_matrix`` applied to that index (as a tensor on the global ``device``).
    """
    index_tensor = torch.tensor(word2idx(word), device=device)
    return embedding_matrix(index_tensor)


def train():
    """Train the CBOW model, then print a nearest neighbour for 100 random vocabulary words.

    The model is optimised with SGD. ``optim_updater`` moves the learning rate
    linearly from ``lr_start`` towards ``lr_end`` across all
    ``epoch_count * iter_count`` steps.

    Returns:
        The ``(model, losses)`` pair returned by ``ModelTrainer.train``.
    """
    lr_start = 0.001
    lr_end = 0.00001
    epoch_count = 3

    def optim_creator(parameters):
        return optim.SGD(parameters, lr_start)

    # Parameter names are left unchanged in case ModelTrainer passes them by
    # keyword. Note that ``optim`` shadows the torch.optim module inside this
    # inner function only.
    def optim_updater(optim, epoch, i, iter_count):
        # Presumably ``i`` is the 0-based step within ``epoch`` -- TODO confirm
        # against ModelTrainer.
        abs_index = epoch*iter_count + i
        last_index = epoch_count*iter_count - 1
        if last_index > 0:
            new_lr = lr_start + (lr_end - lr_start)*abs_index/last_index
        else:
            # Only one step in total: nothing to interpolate, and dividing by
            # ``last_index`` would raise ZeroDivisionError.
            new_lr = lr_start
        for g in optim.param_groups:
            g['lr'] = new_lr

    model_fn = model_func(corpus.vocab_size, CONTEXT_RADIUS, EMBEDDING_DIM)
    model_trainer = ModelTrainer(model_fn, nn.NLLLoss, optim_creator, optim_updater)
    model, losses = model_trainer.train(data_loader, epoch_count, device)
    embedding = model.embedding

    # The weights no longer change after training, so copy them to NumPy once
    # instead of on every loop iteration.
    vocab_vectors = model.embedding.weight.detach().cpu().numpy()

    # random.choices samples with replacement, so a word may be printed twice.
    for word in random.choices(corpus.vocab, k=100):
        word_vector = get_word_embedding(embedding, word, corpus.word2idx)
        closest_idx = closest_vector(word_vector.detach().cpu().numpy(),
                                     vocab_vectors)
        print(f"The closest word to {word} is {corpus.idx2word(closest_idx)}")

    return model, losses


train()
