In [8]:
import numpy as np
from scipy.sparse import csr_matrix

In [1]:
import numpy as np
import torch
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import random
class DataPipeline(Dataset):
    def __init__(self, filename,window_size = 7,min_freq=5,vocab=None,neg_words=5):
        self.data = self.read_data(filename)
        self.neg_words = neg_words
        self.window_size = window_size
        if vocab is None:
            self.vocab, self.ind2vocab,self.word_count = self.build_vocab(self.data,min_freq)
        else:
            self.vocab = vocab
            self.ind2vocab = {v: k for k, v in vocab.items()}
            self.word_count = self.get_word_count(vocab,self.data,min_freq)
        self.neg_sampling_table = self.__create_neg_sampling_table()
        self.sub_sampling_table = self.__create_sub_sampling_table()

    def get_vocab(self):
        return self.vocab
    
    @staticmethod
    def read_data(filename):
        data = []
        with open(filename, 'r') as f:
            for line in f.readlines():
                e = line.strip()
                data.append(e.split())
        return data
    
    def get_word_count(self,vocab,data):
        word_count = {0: 0}
        for line in data:
            for word in line:
                if word in vocab:
                    word_count[vocab[word]] += 1
                else:
                    word_count[0] += 1
        return word_count
    
    def most_common(self,n):
        counter = Counter(self.word_count)
        common = counter.most_common(n)
        ind_freq = dict(common)
        # convert to word frequency
        word_freq = {}
        for ind in ind_freq:
            word_freq[self.ind2vocab[ind]] = ind_freq[ind]
        return word_freq
    
    @staticmethod
    def build_vocab(data,min_freq=10):
        word_set = {}
        for line in data:
            for word in line:
                if word not in word_set:
                    word_set[word]=1
                else:
                    word_set[word]+=1
        # sort the vocab
        word_list = sorted(list(word_set))
        word_count = {0: 1}
        vocab_dict = {"<unk>": 0}
        i=1
        for word in word_list:
            if word_set[word] >= min_freq:
                vocab_dict[word] = i
                word_count[i] = word_set[word]
                i+=1
            else:
                word_count[0] += word_set[word]
        ind2word = {v: k for k, v in vocab_dict.items()}
        return vocab_dict, ind2word, word_count

    def total_count(self):
        return sum(self.word_count.values())

    
    def __create_sub_sampling_table(self, threshold=1e-5):
        word_freq = np.array(list(self.word_count.values()))
        word_freq = word_freq / np.sum(word_freq)
        sub_sampling_table = ((np.sqrt(word_freq / threshold) + 1) * (threshold / word_freq))
        return sub_sampling_table
    
    def is_sample_selected(self, idx):
        # return True if the word is selected
        return random.random() < self.sub_sampling_table[idx]
    
    def __create_neg_sampling_table(self, power=0.75, table_size =1e8):
        vocab_size = len(self.vocab)
        word_freq = np.array(list(self.word_count.values())) ** power
        word_freq = word_freq / np.sum(word_freq)
        count = np.round(word_freq * table_size)
        neg_sampling_table = []
        for i in range(vocab_size):
            neg_sampling_table += [i] * int(count[i])
        neg_sampling_table = np.array(neg_sampling_table)
        np.random.shuffle(neg_sampling_table)
        return neg_sampling_table.tolist()
    
    def get_negative_samples(self, target, k):
        delta = random.sample(self.neg_sampling_table, k)
        while target in delta:
            delta = random.sample(self.neg_sampling_table, k)
        return delta   

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        words = self.data[idx]
        data = []
        start = self.window_size // 2
        for i in range(start, len(words) - start):
            if words[i] not in self.vocab or not self.is_sample_selected(self.vocab[words[i]]):
                continue
            target = self.vocab[words[i]]
            context = words[i - start: i] + words[i + 1: i + start + 1]
            #convert words to indices and unknown words to 0
            context = [ self.vocab[word] if word in self.vocab else self.vocab["<unk>"] for word in context ]
            neg_samples = self.get_negative_samples(target, self.neg_words)
            data.append((target, context, neg_samples))
        return data
    
    def __collate_fn(self,batches):
        target = []
        context = []
        neg_samples = []
        for sentence in batches:
            for t,c,n in sentence:
                target.append(t)
                context.append(c)
                neg_samples.append(n)
        return torch.LongTensor(target),torch.LongTensor(context),torch.LongTensor(neg_samples)

    def get_batches(self, batch_size):
        return DataLoader(self, batch_size=batch_size, shuffle=False,collate_fn=self.__collate_fn ,drop_last=True)


In [2]:
data = DataPipeline.read_data('../data/processed_data/corpus_cleaned.txt')

In [3]:
vocab, ind2vocab, word_count = DataPipeline.build_vocab(data)

In [4]:
len(vocab)

23522

In [17]:
from scipy.sparse import csr_matrix

In [22]:
class SVD_W2V:
    def __init__(self, vocab, window,embedding_size):
        self.dim = len(vocab)
        self.vocab = vocab
        self.window_size = window
        self.embedding_size = embedding_size
        self.cooccurrence_matrix = csr_matrix((self.dim, self.dim), dtype=np.int32)

    def train(self,data):
        self.__build_cooccurrence_matrix(data)


    def __build_cooccurrence_matrix(self, data):
        for tokens in data:
            for pos,token in enumerate(tokens):
                if token not in self.vocab:
                    continue
                start = max(0, pos - self.window_size)
                end = min(len(tokens), pos + self.window_size)
                for context_pos in range(start, end):
                    if context_pos != pos:
                        context_token = tokens[context_pos]
                        if context_token in self.vocab:
                            self.cooccurrence_matrix[self.vocab[token], self.vocab[context_token]] += 1
                        else:
                            self.cooccurrence_matrix[self.vocab[token], self.vocab["<unk>"]] += 1
    
    def save_embeddings(self, path):
        pass

In [23]:
model = SVD_W2V(vocab, 7,100)

In [24]:
model.cooccurrence_matrix.shape

(23522, 23522)

In [25]:
model.train(data)

  self._set_intXint(row, col, x.flat[0])
