In [1]:
import numpy as np

word2vec = np.load("w2v/word2vec3.npy")

In [2]:
with open('w2v/vocab3.txt') as fp:
    vocab = [l.strip() for l in fp.readlines()]
vocab_r = {k:i for i, k in enumerate(vocab)}

In [3]:
len(vocab), word2vec.shape

(58296, (58296, 200))

In [4]:
EMBED_DIM = 200

In [5]:
from glob import glob

docs = glob('../dataset/IR_dataset/*.txt')

In [6]:
docs[:10]

['../dataset/IR_dataset/2048.txt',
 '../dataset/IR_dataset/2404.txt',
 '../dataset/IR_dataset/661.txt',
 '../dataset/IR_dataset/1252.txt',
 '../dataset/IR_dataset/726.txt',
 '../dataset/IR_dataset/3029.txt',
 '../dataset/IR_dataset/329.txt',
 '../dataset/IR_dataset/1481.txt',
 '../dataset/IR_dataset/1511.txt',
 '../dataset/IR_dataset/1127.txt']

In [7]:
allowed_chars = [
  'آ',
  'أ',
  'ؤ',
  'إ',
  'ئ',
  'ا',
  'ب',
  'ة',
  'ت',
  'ث',
  'ج',
  'ح',
  'خ',
  'د',
  'ذ',
  'ر',
  'ز',
  'س',
  'ش',
  'ص',
  'ض',
  'ط',
  'ظ',
  'ع',
  'غ',
  'ف',
  'ق',
  'ك',
  'ل',
  'م',
  'ن',
  'ه',
  'و',
  'ى',
  'ي',
  '٠',
  '١',
  '٢',
  '٣',
  '٤',
  '٥',
  '٦',
  '٧',
  '٨',
  '٩',
  'چ',
  'ژ',
  'ک',
  'گ',
  'ھ',
  'ی',
  '۰',
  '۱',
  '۲',
  '۳',
  '۴',
  '۵',
  '۶',
  '۷',
  '۸',
  '۹',
#   '\u200c',
  '\u200d',
  '\u200e',
  '\u200f',
  'پ',
  'ﭼ',
  'ﯽ',
  'ﯾ',
  'ﯿ',
  'ﷲ',
  'ﺄ',
  'ﺆ',
  'ﺋ',
  'ﺎ',
  'ﺑ',
  'ﺔ',
  'ﺗ',
  'ﺘ',
  'ﺧ',
  'ﺪ',
  'ﺮ',
  'ﺳ',
  'ﺴ',
  'ﺿ',
  'ﻋ',
  'ﻌ',
  'ﻗ',
  'ﻠ',
  'ﻣ',
  'ﻨ',
  'ﻼ',
  '￼']

trans_chars = [
  'ً',
  'ٌ',
  'ٍ',
  'َ',
  'ُ',
  'ِ',
  'ّ',
  'ْ',
  'ٓ',
  'ٔ',
]

In [43]:
from hazm import Normalizer

normalizer = Normalizer()

word_seqs = []

from nltk.tokenize import word_tokenize, sent_tokenize
import re

for doc in docs:
    with open(doc) as fp:
        lines = fp.readlines()
        for line in lines:
            line = normalizer.normalize(line)
            line = ' '.join([a.strip() for a in re.split("([۰-۹]+)", line) if a])
            line = re.sub('[' + ''.join(trans_chars) + ']', '', line)
            line = re.sub('[^' + ''.join(allowed_chars) + ']', ' ', line)
            word_seqs += [[vocab_r[w] for w in word_tokenize(s)] for s in sent_tokenize(line)]

In [44]:
SEQ_LEN = 64
SEED = 40

In [45]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [46]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

sequences = []

for seq in word_seqs:
    sequences += list(chunks(seq, SEQ_LEN))
    
sequences = pad_sequences(sequences, maxlen=SEQ_LEN, padding='post')

In [47]:
sequences.shape

(53149, 64)

In [30]:
from tensorflow.keras.preprocessing.sequence import skipgrams

window_size = 2
positive_skip_grams, _ = skipgrams(word_seqs[0], 
                                   vocabulary_size=len(vocab),
                                   window_size=window_size,
                                   negative_samples=0)

In [31]:
for target, context in positive_skip_grams[:5]:
    print(f"({target}, {context}): ({vocab[target]}, {vocab[context]})")

(25695, 46405): (کوره, را)
(26270, 53439): (و, شود)
(21776, 34579): (چگالی, در)
(7294, 33599): (باید, شده)
(5527, 27069): (مهم, از)


In [53]:
import tensorflow as tf
import tqdm

In [49]:
def generate_training_data(sequences, window_size, num_ns, vocab_size, seed):
    # Elements of each training example are appended to these lists.
    targets, contexts, labels = [], [], []

    # Build the sampling table for vocab_size tokens.
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)

    # Iterate over all sequences (sentences) in dataset.
    for sequence in tqdm.tqdm(sequences):

        # Generate positive skip-gram pairs for a sequence (sentence).
        positive_skip_grams, _ = tf.keras.preprocessing.sequence.skipgrams(
              sequence,
              vocabulary_size=vocab_size,
              sampling_table=sampling_table,
              window_size=window_size,
              negative_samples=0)

        # Iterate over each positive skip-gram pair to produce training examples
        # with positive context word and negative samples.
        for target_word, context_word in positive_skip_grams:
            context_class = tf.expand_dims(
              tf.constant([context_word], dtype="int64"), 1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
              true_classes=context_class,
              num_true=1,
              num_sampled=num_ns,
              unique=True,
              range_max=vocab_size,
              seed=seed,
              name="negative_sampling")

            # Build context and label vectors (for one target word)
            negative_sampling_candidates = tf.expand_dims(
              negative_sampling_candidates, 1)

            context = tf.concat([context_class, negative_sampling_candidates], 0)
            label = tf.constant([1] + [0]*num_ns, dtype="int64")

            # Append each element from the training example to global lists.
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)

    return targets, contexts, labels

In [None]:
targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=4,
    vocab_size=len(vocab),
    seed=SEED)

 37%|███▋      | 19522/53149 [40:01<6:59:11,  1.34it/s]    

In [None]:
targets = np.array(targets)
contexts = np.array(contexts)[:,:,0]
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

In [None]:
BATCH_SIZE = 1024
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

In [None]:
dataset = dataset.cache().prefetch(buffer_size=tf.data.AUTOTUNE)
print(dataset)

In [None]:
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(vocab_size,
                                          embedding_dim,
                                          input_length=1,
                                          name="w2v_embedding")
        self.context_embedding = layers.Embedding(vocab_size,
                                           embedding_dim,
                                           input_length=num_ns+1)

    def call(self, pair):
        target, context = pair
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        # context_emb: (batch, context, embed)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        # dots: (batch, context)
        return dots