### Iimport libraries

In [56]:
import numpy as np
import pandas as pd
import sys
import yaml
import os
import re
import io

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout
from keras.preprocessing import text
from tensorflow.keras.models import Model
from tensorflow.keras import losses, optimizers
from tensorflow.keras.activations import softmax
from keras.preprocessing import sequence


from itertools import islice

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

### Load data

In [57]:
df = pd.read_csv('enwiki.csv')
print('------------------Head------------------')
print(df.head())
print('------------------Tail------------------')
print(df.tail())

------------------Head------------------
   ARTICLE_ID      TITLE                 SECTION_TITLE  \
0           0  Anarchism                  Introduction   
1           0  Anarchism     Etymology and terminology   
2           0  Anarchism                       History   
3           0  Anarchism  Anarchist schools of thought   
4           0  Anarchism   Internal issues and debates   

                                        SECTION_TEXT  
0  \n\n\n\n\n\n'''Anarchism''' is a political phi...  
1  \n\nThe term ''anarchism'' is a compound word ...  
2  \n\n===Origins===\nWoodcut from a Diggers docu...  
3  \nPortrait of philosopher Pierre-Joseph Proudh...  
4  \nconsistent with anarchist values is a contro...  
------------------Tail------------------
        ARTICLE_ID              TITLE    SECTION_TITLE  \
265134       30475  Triboluminescence  Further reading   
265135       30475  Triboluminescence   External links   
265136       30476       Markov chain     Introduction   
265137 

### Remove unnecessary data from dataFrame

# Set constrains for filtering the data

In [58]:
data_limit = 100 # 0 for all taking all data. There is exis at most 2,65,139 paragraphs 
max_sentence_lenght = 1000
max_word_count = 100
min_word_count = 5
max_sentence = 1000
vocabulary_size = 30
window_size = 4

In [59]:
df.drop(['ARTICLE_ID', 'TITLE', 'SECTION_TITLE'], axis=1)
section_texts = df['SECTION_TEXT'].apply(str)
print('Total section texts ----->', len(section_texts))

Total section texts -----> 265139


In [60]:
if data_limit != 0:
    section_texts = section_texts[:data_limit]
print('Total section texts ----->', len(section_texts))

Total section texts -----> 100


### Convert text to sentences

In [61]:
sentences = []
for texts in section_texts:
    for sentence in sent_tokenize(texts):
        if len(sentence) < max_sentence_lenght:
            sentences.append(sentence.lower())

In [62]:
total_sentences = len(sentences)
print('Total sentences', total_sentences)

Total sentences 3059


### Convert sentences to words and create vocabulary with frequency

In [63]:
stop_words = set(stopwords.words('english'))
word_list = []
vocabulary_with_frequency = {}

for sentence in sentences:
    words = word_tokenize(sentence)
    words_without_stop_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) != 1]
    
    word_lenght = len(words_without_stop_words)
    if word_lenght <= max_word_count and word_lenght >= min_word_count:
        word_list.append(words_without_stop_words)
        
        for word in words_without_stop_words:
            if word not in vocabulary_with_frequency.keys():
                vocabulary_with_frequency[word] = 1
            else:
                vocabulary_with_frequency[word] += 1

### Total words and vocabulary size

In [64]:
# After filtering the words
total_sentences = len(word_list)

total_words = 0
for words in word_list:
    total_words += len(words)
print('Total words in corpus', total_words)
print('Vocabulary size', len(vocabulary_with_frequency))

Total words in corpus 37912
Vocabulary size 9640


In [65]:
sorted_vocabulary_with_frequency = sorted(vocabulary_with_frequency.items(), key=lambda x: x[1], reverse=True)

### Remove less freuquent word form dictionary and assign unique id to vocabulary word


<br> 
Create two vocabulary.<br> 
<b>word_to_id</b> for gettig the word for an id. <br>
<b>id_to_word</b> for gettign the id for word. <br>
</br>


In [66]:
word_to_id = {}
word_to_id[0] = 'PAD'
id_to_word = {}
id_to_word['PAD'] = 0
word_id = 1

for word, _ in sorted_vocabulary_with_frequency:
    if word_id < vocabulary_size:
        word_to_id[word] = word_id
        id_to_word[word_id] = word
        word_id += 1
        
print(list(islice(word_to_id.items(), 15)))
print('-------------------------------------')
print(list(islice(id_to_word.items(), 15)))

[(0, 'PAD'), ('lincoln', 1), ('achilles', 2), ('state', 3), ('alabama', 4), ('aristotle', 5), ('autism', 6), ('anarchist', 7), ('war', 8), ('one', 9), ('also', 10), ('first', 11), ('states', 12), ('many', 13), ('anarchism', 14)]
-------------------------------------
[('PAD', 0), (1, 'lincoln'), (2, 'achilles'), (3, 'state'), (4, 'alabama'), (5, 'aristotle'), (6, 'autism'), (7, 'anarchist'), (8, 'war'), (9, 'one'), (10, 'also'), (11, 'first'), (12, 'states'), (13, 'many'), (14, 'anarchism')]


In [67]:
print('New dictionary size', vocabulary_size)

New dictionary size 30


### Convert word_list to word_id_list for expressign the words of a sentece using vocabulary id

<br> Remove the words which are not present in dictionary</br>

In [68]:
print('Before filtering total sentenc', total_sentences)
print('Before filtering total words', total_words)

sentece_word_ids = []
total_sentences = 0
total_words = 0

for words in word_list:
    filtered_words_ids = [word_to_id[word] for word in words if word in word_to_id.keys()]
    words_in_current_sentece = len(filtered_words_ids)
    
    if words_in_current_sentece >= min_word_count:
        total_sentences += 1
        total_words += words_in_current_sentece
        sentece_word_ids.append(filtered_words_ids)
        
print('--------------------------------------------')
print('After filtering total sentence', total_sentences)
print('After filtering total words', total_words)

Before filtering total sentenc 2818
Before filtering total words 37912
--------------------------------------------
After filtering total sentence 73
After filtering total words 427


### Untill Now 
<br> 
<b>sentece_word_ids:</b> Sentence wise words's id. <br>
<b>word_to_id:</b> Vocabulary for getting the <b>word_id</b> for a <b>word</b>.<br>
<b>id_to_word:</b> Vocabulary for getting the <b>word</b> for a <b> word_id</b>.<br>
<b>positive_skip_grams:</b> skipgrams of target and context word pairs.
</br>

In [69]:
print('Total senteces', total_sentences)
print('Total words', total_words)
vocabulary_size = len(word_to_id)
print('Total unique words in vocabulary', vocabulary_size)

Total senteces 73
Total words 427
Total unique words in vocabulary 30


### Save data for later use
<br> It will help use to skip the data processing step

In [70]:
def save_data(data, to_file):
    if os.path.exists(to_file+'.yaml'):
        os.remove(to_file+'.yaml')
    
    with open(to_file+'.yaml', 'w') as file:
        documents = yaml.dump(data, file, sort_keys=False)
    
def load_data(from_file):
    with open(from_file+'.yaml') as file:
        return yaml.load(file, Loader=yaml.FullLoader)

### Save

In [71]:
# save_data(word_to_id, 'word_to_id')
# save_data(id_to_word, 'id_to_word')
# save_data(sentece_word_ids, 'sentece_word_ids')
# # save_data(positive_skip_grams, 'positive_skip_grams')

# ⚠️⚠️⚠️ Reset Everything ⚠️⚠️⚠️

In [72]:
# %reset

# Start again 🏃‍♂️🏃‍♂️🏃‍♂️

In [73]:
# import numpy as np
# import pandas as pd
# import sys
# import yaml
# import os
# import io

# import nltk
# from nltk.corpus import PlaintextCorpusReader
# from nltk.tokenize import word_tokenize
# from nltk.tokenize import sent_tokenize
# from nltk.corpus import stopwords

# import tensorflow as tf
# from tensorflow import keras

# from tensorflow.keras import layers
# from keras.preprocessing import text

# from itertools import islice
# from sklearn.model_selection import train_test_split

# from tensorflow.keras.layers import Embedding, Input, Dense, Dropout
# from tensorflow.keras.models import Model
# from tensorflow.keras import losses, optimizers
# from tensorflow.keras.activations import softmax

In [74]:
def save_data(data, to_file):
    if os.path.exists(to_file+'.yaml'):
        os.remove('id_to_word.yaml')
    
    with open(to_file+'.yaml', 'w') as file:
        documents = yaml.dump(data, file, sort_keys=False)
    
def load_data(from_file):
    with open(from_file+'.yaml') as file:
        return yaml.load(file, Loader=yaml.FullLoader)

### Load data

In [75]:
# id_to_word = load_data('id_to_word')
# word_to_id = load_data('word_to_id')
# sentece_word_ids = load_data('sentece_word_ids')
# # positive_skip_grams = load_data('positive_skip_grams')
# vocabulary_size = len(word_to_id)

#### Example after loadign data

In [76]:
print(list(islice(id_to_word.items(), 10)))
print('------------------------------------------')
print(list(islice(word_to_id.items(), 10)))
print('------------------------------------------')
print(sentece_word_ids[:5])
print('------------------------------------------')

[('PAD', 0), (1, 'lincoln'), (2, 'achilles'), (3, 'state'), (4, 'alabama'), (5, 'aristotle'), (6, 'autism'), (7, 'anarchist'), (8, 'war'), (9, 'one')]
------------------------------------------
[(0, 'PAD'), ('lincoln', 1), ('achilles', 2), ('state', 3), ('alabama', 4), ('aristotle', 5), ('autism', 6), ('anarchist', 7), ('war', 8), ('one', 9)]
------------------------------------------
[[15, 13, 28, 18, 7, 7, 14], [29, 14, 29, 27, 12, 27, 12], [3, 7, 20, 3, 11, 20], [7, 14, 23, 16, 14], [7, 7, 14, 18, 18]]
------------------------------------------


In [77]:
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Generate 2D `vocabulary_size` list for getting the one-hot vector for a index

In [78]:
one_hot = [[0 for i in range(vocabulary_size)] for j in range(vocabulary_size)]

for index in range(1, vocabulary_size):
    one_hot[index][index] = 1

# `Model - I`

### Create (taget, context) pair for all of the words and convert them into one-hot encoding

In [79]:
X = []
Y = []

for ids in sentece_word_ids:
    skip, _ = tf.keras.preprocessing.sequence.skipgrams(
        ids,
        vocabulary_size,
        window_size=window_size, negative_samples=0,
        shuffle=False,
        categorical=True)
    for target, context in skip:
        X.append(one_hot[target])
        Y.append(one_hot[context])
print(np.array(X).shape)
print(vocabulary_size)

(1956, 30)
30


### Split training testing 

In [80]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Pairwise target, context model
<br>
Input target word in one-hot encoding format. <br>
Output context word of corespoding target word in one-hot encoding format. <br>

All of the targets and their corresponding context words are fitting at once. 
<br>

In [154]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding')(inp)
x = Dense(vocabulary_size, activation='softmax')(x)

model_pair = Model(inputs=inp, outputs=x)
model_pair.summary()

Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        [(None, 30)]              0         
_________________________________________________________________
w2v_embedding (Dense)        (None, 200)               6200      
_________________________________________________________________
dense_9 (Dense)              (None, 30)                6030      
Total params: 12,230
Trainable params: 12,230
Non-trainable params: 0
_________________________________________________________________


In [82]:
model_pair.compile(loss=losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
model_pair.fit(X_train, Y_train, epochs = 3, batch_size=2, verbose=1, shuffle=True)

Epoch 1/3
 24/782 [..............................] - ETA: 3s - loss: 3.3802 - accuracy: 0.1042    

2021-11-18 00:34:59.508107: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2990045e0>

### Store the vector and metadata for Model-1

In [83]:
weights = model.get_layer('w2v_embedding').get_weights()[0]
if os.path.exists('metadata.tsv'):
        os.remove('metadata.tsv')
        
if os.path.exists('vectors.tsv'):
        os.remove('vectors.tsv')
        
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

for index in range(1, vocabulary_size):
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(id_to_word[index] + "\n")
out_v.close()
out_m.close()

In [84]:
print('End Model-1')

End Model-1


# Data Processing for the CBOW and SKIP-GRAMS

<br> This function creates the target words and all of it's corrosponding context words.<br>
return:<br>
------<b>target_id:</b> id of the target word without encoding.<br>
------<b>context_ids:</b> All of the context words for a target_id<br>

<b>Note:</b> Zero padding is being used for returing total `window_size * 2` context words for a target word.<br>
<br>

In [85]:
from keras.preprocessing.sequence import pad_sequences

def get_target_contexts(word_ids, window_size):
    target_id = []
    context_ids = []

    one_hot_target_id = []
    one_hot_context_ids = []

    for ids in word_ids:
        for index, word_id in enumerate(ids):
            if not word_id:
                continue
                
            window_start = max(0, index - window_size)
            window_end = min(len(ids), index + window_size + 1)
            
            target_id.append(word_id)
            context_ids.append([ids[window_index] for window_index in range(window_start, window_end) if window_index != index])

    return target_id, pad_sequences(context_ids, maxlen = window_size * 2)

In [86]:
target_id, context_ids =  get_target_contexts(sentece_word_ids, window_size)

### Express `target_id` &  `context_ids` using one-hot encoding 

In [87]:
one_hot_target_id = [one_hot[id] for id in target_id]
one_hot_context_ids = []

for context_id in context_ids:
    one_hot_context_ids.append([one_hot[id] for id in context_id])

In [88]:
X = []
for context in one_hot_context_ids:
    row = [0] * vocabulary_size
    for word in context:
        for idx in range(vocabulary_size):
            row[idx] += word[idx]
    X.append(row)

# CBOW

In [89]:
class Average(keras.layers.Layer):
    def __init__(self, units=32, input_dim=32):
        super(Average, self).__init__()

    def call(self, inputs):
        return tf.math.divide(inputs, window_size*2)

In [90]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding')(inp)
x = Average(200)(x)
x = Dense(vocabulary_size, activation='softmax')(x)

model_cbow = Model(inputs=inp, outputs=x)
model_cbow.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
w2v_embedding (Dense)        (None, 200)               6200      
_________________________________________________________________
average_2 (Average)          (None, 200)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 30)                6030      
Total params: 12,230
Trainable params: 12,230
Non-trainable params: 0
_________________________________________________________________


In [99]:
model_cbow.compile(loss=losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
model_cbow.fit(X, one_hot_target_id, epochs = 3, batch_size=50, verbose=1, shuffle=False)

Epoch 1/3
Epoch 2/3
Epoch 3/3
1/9 [==>...........................] - ETA: 0s - loss: 2.9809 - accuracy: 0.3400

2021-11-18 00:37:39.747085: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




<keras.callbacks.History at 0x1547b60a0>

## SKIP-GRAMS

In [148]:
X_flatten = [sum(one_hot_context_id, []) for one_hot_context_id in one_hot_context_ids]

In [152]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding')(inp)
x = Dense(vocabulary_size * window_size * 2, activation='softmax')(x)
model_skip = Model(inputs=inp, outputs=x)
model_skip.summary()

Model: "model_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
w2v_embedding (Dense)        (None, 200)               6200      
_________________________________________________________________
dense_8 (Dense)              (None, 240)               48240     
Total params: 54,440
Trainable params: 54,440
Non-trainable params: 0
_________________________________________________________________


In [153]:
model_skip.compile(loss=losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
model_skip.fit(one_hot_target_id, X_flatten, epochs = 5, batch_size=50, verbose=1, shuffle=False)

Epoch 1/5
Epoch 2/5
Epoch 3/5
1/9 [==>...........................] - ETA: 0s - loss: 24.0588 - accuracy: 0.0200

2021-11-18 01:16:01.105933: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x29ca91970>