In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torch.autograd import Variable
from collections import Counter, OrderedDict

In [None]:
from scipy.spatial.distance import cosine
from torch.utils.data import DataLoader
from torchtext.data import to_map_style_dataset
from torchtext.data.utils import get_tokenizer

In [None]:
from tqdm.auto import tqdm
from dataclasses import dataclass
import re
from random import random
import random as random

In [None]:
# Names of the two raw corpus files; get_data() reads them in this order.
files = [
    'interview_ds.txt',
    'interview_ds_2.txt',
]

# Directory prefix (including the trailing slash) that is prepended to each file name.
path = '/kaggle/input/text-247/'

In [None]:
def get_data():
    """Read both corpus files and split their lines into train and validation parts.

    The lines of ``files[0]`` and ``files[1]`` (both located under ``path``) are
    concatenated in that order. The first 90% of the combined lines form the training
    split and the remaining lines form the validation split; no shuffling is done.

    Returns:
        tuple: ``(train_iter, valid_iter)``, two lists of raw text lines.
    """
    final_corpus = []
    for file_idx in (0, 1):
        with open(f"{path}{files[file_idx]}", 'r') as handle:
            final_corpus.extend(handle.readlines())

    # Everything before the cut-off is training data, the rest is validation data.
    split_at = int(0.9 * len(final_corpus))
    return final_corpus[:split_at], final_corpus[split_at:]

# Define Model and Training Parameters here 

In [None]:
@dataclass
class Word2VecParams:
    """Hyper-parameters shared by the vocabulary, skip-gram, sampler, model and trainer.

    Every attribute is type-annotated so that ``@dataclass`` registers it as a field.
    Without annotations the decorator generates an ``__init__`` with no parameters, so
    values could not be overridden per instance and ``repr`` showed nothing.
    ``Word2VecParams()`` still yields exactly the same defaults, and the defaults remain
    readable as class attributes.
    """

    # skipgram parameters
    MIN_FREQ: int = 50               # minimum token frequency (not passed to build_vocab by the setup cell)
    SKIPGRAM_N_WORDS: int = 7        # window parameter used by SkipGrams.collator
    T: float = 1e-3                  # sub-sampling threshold used for the rejection probabilities
    NEG_SAMPLES: int = 10            # negative samples drawn per positive pair
    NS_ARRAY_LEN: int = 5_000_000    # target length of the negative-sampling table
    SPECIALS: str = ""               # forwarded to Vocab as the first "special" entry
    TOKENIZER: str = 'basic_english' # tokenizer name handed to get_tokenizer

    # network parameters
    BATCH_SIZE: int = 128            # number of text lines per DataLoader batch
    EMBED_DIM: int = 64
    EMBED_MAX_NORM: "float | None" = None   # passed as max_norm to nn.Embedding
    N_EPOCHS: int = 10
    DEVICE: torch.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # A single loss module is shared by all instances, exactly as the former class attribute was.
    CRITERION: nn.Module = nn.BCEWithLogitsLoss()

# Vocab Class

In [None]:
from torch._C import Value


### This class holds the vocabulary and provides helper functions to map words to indexes and vice versa

class Vocab:
    """Token <-> index lookup table that also stores each token's frequency.

    Attributes:
        stoi: maps ``token -> (index, frequency)``.
        itos: maps ``index -> (token, frequency)``.
        total_tokens: integer sum of all stored frequencies (NaN entries are ignored).
    """

    def __init__(self, tokens, specials):
        """Build both lookup tables.

        Args:
            tokens: sequence of ``(token, frequency)`` pairs; the position of a pair in
                the sequence becomes the index of its token.
            specials: non-empty sequence; only its first element is kept (``_specials``).
        """
        self.stoi = {entry[0]: (index, entry[1]) for index, entry in enumerate(tokens)}
        self.itos = {index: (entry[0], entry[1]) for index, entry in enumerate(tokens)}

        self._specials = specials[0]
        self.total_tokens = np.nansum([freq for _, freq in self.stoi.values()], dtype=int)

    def __len__(self):
        # Kept as ``len(stoi) - 1`` on purpose: Model sizes its embedding tables with
        # ``vocab.__len__() + 1``, so the two offsets cancel out. Changing only this side
        # would change the shape of the embedding tables.
        return len(self.stoi) - 1

    def _entry(self, word):
        """Return the ``(index, frequency)`` pair of ``word`` or raise ``ValueError``."""
        if word in self.stoi:
            return self.stoi[word]
        raise ValueError(f"Word/phrase: {word} not a word or phrase")

    def get_index(self, word):
        """Return the integer index of ``word``.

        Raises:
            ValueError: if ``word`` is not in the vocabulary.
        """
        return self._entry(word)[0]

    def get_freq(self, word):
        """Return the stored frequency of ``word``.

        Raises:
            ValueError: if ``word`` is not in the vocabulary.
        """
        return self._entry(word)[1]

    def lookup_token(self, token):
        """Return the token string stored at index ``token``.

        The previous implementation checked ``itos`` but then read ``stoi`` with the
        integer index, which is never a key there, so every call failed on ``None[0]``.

        Raises:
            ValueError: if ``token`` is not a known index.
        """
        if token in self.itos:
            return self.itos[token][0]
        raise ValueError(f"out of index")


In [None]:
# Token generator: yields the tokens of each raw text after basic filtering (punctuation-only tokens are dropped).
def token_generator(iterator, tokenizer):
    """Lazily tokenize every text and keep only tokens with an accepted first character.

    Args:
        iterator: iterable of raw text strings.
        tokenizer: callable turning one text into a sequence of token strings.

    Yields:
        list: for each text, its tokens whose first character is ``a-z`` or ``1-9``.
        The digit ``0`` is not part of the character class, so tokens starting with
        ``0`` are dropped as well.
    """
    accepted_start = re.compile('[a-z1-9]')
    for text in iterator:
        # Pattern.match anchors at the start of the token, so only the first character is tested.
        yield [token for token in tokenizer(text) if accepted_start.match(token)]
# This function tokenizes the corpus and counts word frequencies; the vocabulary is created from those counts.
def build_vocab(iterator, tokenizer, params, min_freq = 1, max_tokens=None):
    """Count token frequencies over ``iterator`` and wrap them in a ``Vocab``.

    Args:
        iterator: iterable of raw text strings.
        tokenizer: callable turning one text into a sequence of token strings.
        params: object exposing ``SPECIALS``, which is forwarded to ``Vocab``.
        min_freq: tokens seen fewer than this many times are left out.
        max_tokens: accepted but currently not used by this function.

    Returns:
        Vocab: tokens indexed by descending frequency, ties broken alphabetically.
    """
    frequencies = Counter()
    for doc_tokens in tqdm(token_generator(iterator, tokenizer)):
        frequencies.update(doc_tokens)

    # Most frequent first; equal counts are ordered by the token string.
    ranked = sorted(frequencies.items(), key=lambda item: (-item[1], item[0]))
    kept = [(token, count) for token, count in ranked if count >= min_freq]

    return Vocab(kept, specials=[params.SPECIALS, np.nan])

In [None]:
# Generates skip-gram (target, context) pairs as described in the paper. Each word gets a rejection probability so that words with high frequencies are trained on less often.
# Also provides the collator function used for dataset generation.
class SkipGrams:
    """Turns raw texts into (target, context) index pairs with frequency-based sub-sampling."""

    def __init__(self, vocab, params, tokenizer):
        """Store the collaborators and pre-compute one rejection probability per token index.

        Args:
            vocab: object exposing ``stoi`` (``token -> (index, freq)``), ``total_tokens``
                and ``get_index``.
            params: object exposing ``T`` and ``SKIPGRAM_N_WORDS``.
            tokenizer: callable turning one text into a sequence of token strings.
        """
        self.vocab = vocab
        self.params = params
        self.t = params.T
        self.tokenizer = tokenizer
        self.word_rejection_prob = self._create_rejection_prob()

    def _create_rejection_prob(self):
        """Return ``{token_index: 1 - sqrt(t / (freq / total_tokens + t))}``."""
        table = {}
        # The values of stoi are (index, frequency) pairs, so the table is keyed by index.
        for token_id, count in self.vocab.stoi.values():
            table[token_id] = 1 - np.sqrt(
                self.t / (count / self.vocab.total_tokens + self.t))
        return table

    def collator(self, batch):
        """Collate a batch of texts into two aligned 1-D ``torch.long`` tensors.

        For every kept target token, each token within ``SKIPGRAM_N_WORDS - 1`` positions
        on either side is a context candidate. A target, and independently each
        candidate, is skipped when its rejection probability is at least a fresh draw
        from ``uniform(0.5, 1)``.

        Returns:
            tuple: ``(batch_input, batch_output)`` holding target and context indices.
        """
        accepted_start = re.compile('[a-z1-9]')
        targets, contexts = [], []

        for text in batch:
            kept_tokens = [tok for tok in self.tokenizer(text) if accepted_start.match(tok)]
            ids = [self.vocab.get_index(tok) for tok in kept_tokens]

            for pos, target in enumerate(ids):
                span = self.params.SKIPGRAM_N_WORDS
                left = ids[max(pos - span + 1, 0):pos]
                right = ids[pos + 1:pos + span]
                window = left + right

                # One draw decides whether this target is sub-sampled away entirely.
                draw = random.uniform(0.5, 1)
                if self.word_rejection_prob[target] >= draw:
                    continue

                for out_token in window:
                    # Every context word gets its own draw.
                    draw = random.uniform(0.5, 1)
                    if self.word_rejection_prob[out_token] >= draw:
                        continue
                    targets.append(target)
                    contexts.append(out_token)

        return (torch.tensor(targets, dtype=torch.long),
                torch.tensor(contexts, dtype=torch.long))


In [None]:
# Creates a negative-sampling array from the word frequencies, used for sampling negative words. Frequencies are raised to the power of 0.75, as mentioned in the paper.
class NegSampler:
    """Draws negative token indices from a table weighted by ``frequency ** ns_exponent``."""

    def __init__(self, vocab, ns_exponent, ns_array_length):
        """Build the sampling table once.

        Args:
            vocab: object exposing ``stoi`` (``token -> (index, freq)``).
            ns_exponent: exponent applied to each raw frequency.
            ns_array_length: target length of the sampling table.
        """
        self.vocab = vocab
        self.ns_exponent = ns_exponent
        self.ns_array_length = ns_array_length
        self.ns_array = self._create_negative_sampling()

    def __len__(self):
        return len(self.ns_array)

    def _create_negative_sampling(self):
        """Return a list in which each token index is repeated in proportion to its weight.

        The powered frequencies are normalised by their own sum. They used to be divided
        by the raw token total, which left the table far shorter than
        ``ns_array_length`` and floored many more tokens to a single slot. Every token
        still gets at least one slot.
        """
        # The values of stoi are (index, frequency) pairs, so the table holds indices.
        weights = {token_id: freq ** self.ns_exponent
                   for token_id, freq in self.vocab.stoi.values()}
        weight_total = sum(weights.values())

        ns_array = []
        for token_id, weight in tqdm(weights.items()):
            slots = max(1, int((weight / weight_total) * self.ns_array_length))
            # extend() grows the list in place; `ns_array = ns_array + [...]` copied the
            # whole table on every iteration.
            ns_array.extend([token_id] * slots)
        return ns_array

    def sample(self, batch_size = 1, n_samples=1):
        """Return a ``(batch_size, n_samples)`` tensor of sampled token indices.

        Each row is drawn with ``random.sample``, i.e. without replacement over table
        positions, so one index can still appear several times within a row.
        """
        samples = [random.sample(self.ns_array, n_samples) for _ in range(batch_size)]
        return torch.as_tensor(np.array(samples))


In [None]:
from sklearn.preprocessing import normalize

In [None]:
class Model(nn.Module):
    """Skip-gram model with separate target and context embedding tables."""

    def __init__(self, vocab, params) -> None:
        """Create both embedding tables.

        Args:
            vocab: vocabulary exposing ``__len__``, ``get_index`` and ``lookup_token``.
            params: object exposing ``EMBED_DIM`` and ``EMBED_MAX_NORM``.
        """
        super().__init__()
        self.vocab = vocab

        # Vocab.__len__ returns len(stoi) - 1, so adding 1 yields one row per token.
        table_size = self.vocab.__len__() + 1
        self.embeds = nn.Embedding(table_size, params.EMBED_DIM,
                                   max_norm=params.EMBED_MAX_NORM)
        self.context_embeds = nn.Embedding(table_size, params.EMBED_DIM,
                                           max_norm=params.EMBED_MAX_NORM)

    def forward (self, inputs, context):
        """Score every context candidate against its target.

        Args:
            inputs: target indices, shape ``(B,)``.
            context: positive and negative context indices, shape ``(B, C)``.

        Returns:
            Tensor of shape ``(B, C)`` with the dot product of each target embedding and
            each of its context embeddings (raw logits).
        """
        target_embeddings = self.embeds(inputs).unsqueeze(1)                 # B x 1 x E
        context_embeddings = self.context_embeds(context).permute(0, 2, 1)   # B x E x C

        out = torch.bmm(target_embeddings, context_embeddings)               # B x 1 x C
        return out.view(out.shape[0], out.shape[2])

    def normalize_embeddings(self):
        """Normalise the target embeddings with ``normalize``, cache and return them.

        The result is stored on ``self.norm_embeddings`` as before and is now also
        returned. The method used to return ``None``, which made both similarity helpers
        fail when they indexed the result.
        """
        weights = self.embeds.weight.detach().cpu().numpy()
        self.norm_embeddings = normalize(weights)
        return self.norm_embeddings

    def _top_tokens(self, scores, n):
        """Return the tokens ranked 2nd to (n+1)-th by descending ``scores``."""
        ranked = np.argsort(-scores)[1 : n + 1]
        return [self.vocab.lookup_token(index) for index in ranked]

    def get_similar_words(self, word, n=5):
        """Return the ``n`` tokens whose normalised embeddings score highest against ``word``.

        The top-ranked entry is skipped; it is normally ``word`` itself.
        """
        embeddings = self.normalize_embeddings()
        word_ind = self.vocab.get_index(word)

        dot_products = np.matmul(embeddings, embeddings[word_ind]).flatten()
        return self._top_tokens(dot_products, n)

    def get_analogy(self, word, analogy, target_word, n=5):
        """Return ``n`` tokens closest to ``emb(word) - emb(analogy) + emb(target_word)``.

        NOTE(review): as in ``get_similar_words`` the top-ranked entry is skipped, but the
        query words are not explicitly excluded from the result; confirm this is intended.
        """
        embeddings = self.normalize_embeddings()
        relation = (embeddings[self.vocab.get_index(word)]
                    - embeddings[self.vocab.get_index(analogy)]
                    + embeddings[self.vocab.get_index(target_word)])

        dot_products = np.matmul(embeddings, relation).flatten()
        return self._top_tokens(dot_products, n)

In [None]:
class Trainer:
    """Runs skip-gram negative-sampling training, with an optional validation pass per epoch."""

    def __init__(self, model, params, optimizer, vocab, skipgrams, negative_sampler,train_iter, valid_iter=None):
        """Store the collaborators and move the model and loss to ``params.DEVICE``.

        Args:
            model: module called as ``model(inputs, context)`` and returning logits.
            params: object exposing ``DEVICE``, ``CRITERION``, ``NEG_SAMPLES``,
                ``BATCH_SIZE`` and ``N_EPOCHS``.
            optimizer: optimizer over the model's parameters.
            vocab: vocabulary object (stored only).
            skipgrams: object whose ``collator`` is used as the DataLoader ``collate_fn``.
            negative_sampler: object whose ``sample(batch_size, n_samples)`` returns a
                tensor of negative indices.
            train_iter: training texts handed to the DataLoader.
            valid_iter: optional validation texts; validation is skipped when ``None``.
        """
        self.model = model
        self.optimizer = optimizer
        self.vocab = vocab
        self.train_iter = train_iter
        self.valid_iter = valid_iter
        self.skipgrams = skipgrams
        self.params = params

        self.model.to(self.params.DEVICE)
        self.params.CRITERION.to(self.params.DEVICE)

        self.negative_sampler = negative_sampler

    def _batch_loss(self, batch_data):
        """Compute the loss of one non-empty ``(targets, positive_contexts)`` batch."""
        # Use the params given to this trainer. The previous code read a notebook-level
        # global named `params` here, which fails outside the notebook and can silently
        # differ from self.params.
        device = self.params.DEVICE
        inputs = batch_data[0].to(device)
        pos_labels = batch_data[1].to(device)
        neg_labels = self.negative_sampler.sample(
            pos_labels.shape[0], self.params.NEG_SAMPLES).to(device)

        # Column 0 is the true context word, the remaining columns are negatives.
        context = torch.cat([pos_labels.view(pos_labels.shape[0], 1), neg_labels], dim=1)
        y = torch.zeros(context.shape, device=device)
        y[:, 0] = 1.0

        outputs = self.model(inputs, context)
        return self.params.CRITERION(outputs, y)

    @staticmethod
    def _mean_loss(losses):
        """Mean of ``losses``; NaN without a NumPy warning when every batch was empty."""
        return np.mean(losses) if losses else float("nan")

    def train_epoch(self):
        """Run one optimisation pass over ``self.train_loader`` and return the mean batch loss."""
        self.model.train()
        running_loss = []

        for batch_data in tqdm(self.train_loader, total=len(self.train_loader)):
            # Sub-sampling in the collator can reject every pair of a batch.
            if len(batch_data[0]) == 0:
                continue

            self.optimizer.zero_grad()
            loss = self._batch_loss(batch_data)
            loss.backward()
            self.optimizer.step()

            running_loss.append(loss.item())

        return self._mean_loss(running_loss)

    def validate_epoch(self):
        """Return the mean batch loss over ``self.val_loader`` without updating weights."""
        self.model.eval()
        running_loss = []

        with torch.no_grad():
            for batch_data in self.val_loader:
                if len(batch_data[0]) == 0:
                    continue
                running_loss.append(self._batch_loss(batch_data).item())

        return self._mean_loss(running_loss)

    def train(self):
        """Build the data loaders and train for ``params.N_EPOCHS`` epochs, printing losses."""
        self.train_loader = DataLoader(
            self.train_iter,
            batch_size=self.params.BATCH_SIZE,
            shuffle=True,
            collate_fn=self.skipgrams.collator)

        if self.valid_iter is not None:
            self.val_loader = DataLoader(
                self.valid_iter,
                batch_size=self.params.BATCH_SIZE,
                shuffle=True,
                collate_fn=self.skipgrams.collator)

        for epoch in range(self.params.N_EPOCHS):
            train_loss = self.train_epoch()
            print(f""" EPOCH: {epoch+1}, Train Loss: {train_loss}""")

            if self.valid_iter is not None:
                val_loss = self.validate_epoch()
                print(f"Validaton loss:{val_loss}")


In [None]:
# Build every pipeline component from the shared hyper-parameters.
params = Word2VecParams()
train_iter, valid_iter = get_data()
tokenizer = get_tokenizer(params.TOKENIZER)
# The vocabulary covers the train and validation lines together. No min_freq is passed,
# so build_vocab's default (min_freq=1) applies rather than params.MIN_FREQ.
vocab = build_vocab(train_iter+valid_iter, tokenizer, params)
skip_grams = SkipGrams(vocab, params, tokenizer)
# Negative-sampling table weighted by frequency ** 0.75.
negative_smapler = NegSampler(vocab, ns_exponent=0.75, ns_array_length=params.NS_ARRAY_LEN)

In [None]:
model = Model(vocab, params)
# params.DEVICE is a torch.device object, which does not compare equal to the plain string
# 'cuda', so the former `if params.DEVICE == 'cuda'` guard never ran. Module.to() accepts
# the device object directly and leaves the model where it is when the device is the CPU.
model.to(params.DEVICE)

In [None]:
# Adam over all model parameters (both embedding tables) with a fixed learning rate of 1e-3.
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-3)

In [None]:
# Train on the full corpus (train + validation lines). No valid_iter is passed, so
# Trainer.train() skips the validation pass after each epoch.
trainer = Trainer(
        model=model,
        params=params,
        optimizer=optimizer,
        train_iter=train_iter+valid_iter,
        vocab=vocab,
        skipgrams=skip_grams,
        negative_sampler= negative_smapler)
trainer.train()

In [None]:
# Version tag that is interpolated into the checkpoint file name.
ver = 2

In [None]:
# Persist only the learned weights (the state_dict), not the whole module object.
torch.save(model.state_dict(), f"word2vec_{ver}.pth")