### Import libraries

In [95]:
import numpy as np
import pandas as pd
import yaml
import os
import re
import io
import datetime

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout
from keras.preprocessing import text
from tensorflow.keras.models import Model
from tensorflow.keras import losses, optimizers
from tensorflow.keras.activations import softmax
from keras.preprocessing.sequence import pad_sequences

from itertools import islice

from sklearn.model_selection import train_test_split

# import matplotlib.pyplot as plt
# %matplotlib inline

### Data Set

<br>
<b>Total Aritcles:</b> 265139 <br>
<b>Total Senteces:</b> 3681645 <br>
<b>Total Words:</b> 46842818 <br>
<b>Total Unique Words:</b> 634540 <br>

<a href="https://www.kaggle.com/jkkphys/english-wikipedia-articles-20170820-sqlite/discussion/149578">Data Source</a>

</br>

### Load Data

In [2]:
df = pd.read_csv('enwiki.csv')
print('------------------Head------------------')
print(df.head())
print('------------------Tail------------------')
print(df.tail())

------------------Head------------------
   ARTICLE_ID      TITLE                 SECTION_TITLE  \
0           0  Anarchism                  Introduction   
1           0  Anarchism     Etymology and terminology   
2           0  Anarchism                       History   
3           0  Anarchism  Anarchist schools of thought   
4           0  Anarchism   Internal issues and debates   

                                        SECTION_TEXT  
0  \n\n\n\n\n\n'''Anarchism''' is a political phi...  
1  \n\nThe term ''anarchism'' is a compound word ...  
2  \n\n===Origins===\nWoodcut from a Diggers docu...  
3  \nPortrait of philosopher Pierre-Joseph Proudh...  
4  \nconsistent with anarchist values is a contro...  
------------------Tail------------------
        ARTICLE_ID              TITLE    SECTION_TITLE  \
265134       30475  Triboluminescence  Further reading   
265135       30475  Triboluminescence   External links   
265136       30476       Markov chain     Introduction   
265137 

### Remove unnecessary data from dataFrame

# Set constrains for filtering the data

In [3]:
# For saving the information offline
data_info = {}

In [4]:
# Set `data_limit` zero (0) for processing all of the data.
data_limit = 300
vocabulary_size = 50 
window_size = 5
max_sentence_lenght = 1000
max_word_count = 100
min_word_count = 5
max_sentence = 1000


data_info['data_limit']= data_limit
data_info['vocabulary_size'] = vocabulary_size
data_info['window_size'] = window_size 
data_info['max_sentence_lenght'] = max_sentence_lenght
data_info['max_word_count'] = max_word_count 
data_info['min_word_count'] = min_word_count
data_info['max_sentence'] = max_sentence 

In [5]:
df.drop(['ARTICLE_ID', 'TITLE', 'SECTION_TITLE'], axis=1)
section_texts = df['SECTION_TEXT'].apply(str)
print('Total Aritcles ----->', len(section_texts))

Total Aritcles -----> 265139


In [6]:
if data_limit != 0:
    section_texts = section_texts[:data_limit]
print('Total section texts ----->', len(section_texts))

Total section texts -----> 300


### Download `nltk` Resources for Data Processing 

In [7]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/ruman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ruman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Convert Aritcles to Sentences

In [8]:
sentences = []
for texts in section_texts:
    for sentence in sent_tokenize(texts):
        if len(sentence) < max_sentence_lenght:
            sentences.append(sentence.lower())

In [9]:
total_sentences = len(sentences)
print('Total sentences', total_sentences)

Total sentences 6661


### Convert Sentences to Words and Create Vocabulary with Frequency

In [10]:
stop_words = set(stopwords.words('english'))
word_list = []
vocabulary_with_frequency = {}

for sentence in sentences:
    words = word_tokenize(sentence)
    words_without_stop_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) != 1]
    
    word_lenght = len(words_without_stop_words)
    if word_lenght <= max_word_count and word_lenght >= min_word_count:
        word_list.append(words_without_stop_words)
        
        for word in words_without_stop_words:
            if word not in vocabulary_with_frequency.keys():
                vocabulary_with_frequency[word] = 1
            else:
                vocabulary_with_frequency[word] += 1

### After Filtering Total Words and Vocabulary

In [11]:
total_sentences = len(word_list)

total_words = 0
for words in word_list:
    total_words += len(words)
print('Total words in corpus', total_words)
print('Vocabulary size', len(vocabulary_with_frequency))

Total words in corpus 80055
Vocabulary size 15960


### Sort Vocabulary Based on Frequency Count

In [12]:
sorted_vocabulary_with_frequency = sorted(vocabulary_with_frequency.items(), key=lambda x: x[1], reverse=True)

### Remove Less Freuquent Word form Dictionary and Assign Unique Id to Each Word

<br> 
Create two Dictionaries.<br> 
<b>word_to_id:</b> to get word for a word id. <br>
<b>id_to_word:</b> to get the id for word. <br>
</br>


In [13]:
word_to_id = {}
word_to_id[0] = 'PAD'
id_to_word = {}
id_to_word['PAD'] = 0
word_id = 1

for word, _ in sorted_vocabulary_with_frequency:
    if word_id < vocabulary_size:
        word_to_id[word] = word_id
        id_to_word[word_id] = word
        word_id += 1
        
print(list(islice(word_to_id.items(), 15)))
print('-------------------------------------')
print(list(islice(id_to_word.items(), 15)))

[(0, 'PAD'), ('lincoln', 1), ('also', 2), ('one', 3), ('apollo', 4), ('state', 5), ('first', 6), ('achilles', 7), ('alabama', 8), ('aristotle', 9), ('time', 10), ('autism', 11), ('many', 12), ('new', 13), ('century', 14)]
-------------------------------------
[('PAD', 0), (1, 'lincoln'), (2, 'also'), (3, 'one'), (4, 'apollo'), (5, 'state'), (6, 'first'), (7, 'achilles'), (8, 'alabama'), (9, 'aristotle'), (10, 'time'), (11, 'autism'), (12, 'many'), (13, 'new'), (14, 'century')]


In [14]:
print('New dictionary size after removing less frequent words', vocabulary_size)

New dictionary size after removing less frequent words 50


### Convert `word_list` to `word_id_list` for Expressign the Words Vocabualry Id

<br> Remove the words which are not present in dictionary</br>

In [15]:
data_info['Total sentences before filtering'] = total_sentences
data_info['Total words before filtering'] = total_words

print('Total sentences before filtering', total_sentences)
print('Total words before filtering', total_words)

Total sentences before filtering 6087
Total words before filtering 80055


In [16]:
sentece_word_ids = []
total_sentences = 0
total_words = 0

for words in word_list:
    filtered_words_ids = [word_to_id[word] for word in words if word in word_to_id.keys()]
    words_in_current_sentece = len(filtered_words_ids)
    
    if words_in_current_sentece >= min_word_count:
        total_sentences += 1
        total_words += words_in_current_sentece
        sentece_word_ids.append(filtered_words_ids)
        
print('--------------------------------------------')
print('Total sentences after filtering', total_sentences)
print('Total words after filtering', total_words)

data_info['Total sentences after filtering'] = total_sentences
data_info['Total words after filtering'] = total_words

--------------------------------------------
Total sentences after filtering 190
Total words after filtering 1141


### Untill Now 
<br> 
<b>sentece_word_ids:</b> Sentence wise word's id. <br>
<b>word_to_id:</b> Dictionary for getting the <b>word_id</b> for a <b>word</b>.<br>
<b>id_to_word:</b> Dictionary for getting the <b>word</b> for a <b> word_id</b>.<br>
</br>

In [17]:
print('Total sentences', total_sentences)
print('Total words', total_words)
vocabulary_size = len(word_to_id)
print('Total unique words in dictionary', vocabulary_size)
data_info['Total unique words in dictionary'] = vocabulary_size

Total sentences 190
Total words 1141
Total unique words in dictionary 50


### Save data for later use
It will help to skip the data processing step

In [18]:
# def save_data(data, to_file):
#     if os.path.exists(to_file+'.yaml'):
#         os.remove(to_file+'.yaml')
    
#     with open(to_file+'.yaml', 'w') as file:
#         documents = yaml.dump(data, file, sort_keys=False)
    
# def load_data(from_file):
#     with open(from_file+'.yaml') as file:
#         return yaml.load(file, Loader=yaml.FullLoader)

### Save

In [19]:
# save_data(word_to_id, 'word_to_id')
# save_data(id_to_word, 'id_to_word')
# save_data(sentece_word_ids, 'sentece_word_ids')
# # save_data(positive_skip_grams, 'positive_skip_grams')

# ⚠️⚠️⚠️ Reset Everything ⚠️⚠️⚠️

In [20]:
# %reset

# Start again 🏃‍♂️🏃‍♂️🏃‍♂️

In [21]:
# import numpy as np
# import pandas as pd
# import sys
# import yaml
# import os
# import io

# import nltk
# from nltk.corpus import PlaintextCorpusReader
# from nltk.tokenize import word_tokenize
# from nltk.tokenize import sent_tokenize
# from nltk.corpus import stopwords

# import tensorflow as tf
# from tensorflow import keras

# from tensorflow.keras import layers
# from keras.preprocessing import text

# from itertools import islice
# from sklearn.model_selection import train_test_split

# from tensorflow.keras.layers import Embedding, Input, Dense, Dropout
# from tensorflow.keras.models import Model
# from tensorflow.keras import losses, optimizers
# from tensorflow.keras.activations import softmax

In [22]:
# def save_data(data, to_file):
#     if os.path.exists(to_file+'.yaml'):
#         os.remove('id_to_word.yaml')
    
#     with open(to_file+'.yaml', 'w') as file:
#         documents = yaml.dump(data, file, sort_keys=False)
    
# def load_data(from_file):
#     with open(from_file+'.yaml') as file:
#         return yaml.load(file, Loader=yaml.FullLoader)

### Load data

In [23]:
# id_to_word = load_data('id_to_word')
# word_to_id = load_data('word_to_id')
# sentece_word_ids = load_data('sentece_word_ids')
# # positive_skip_grams = load_data('positive_skip_grams')
# vocabulary_size = len(word_to_id)

#### Example after loadign data

In [24]:
# print(list(islice(id_to_word.items(), 10)))
# print('------------------------------------------')
# print(list(islice(word_to_id.items(), 10)))
# print('------------------------------------------')
# print(sentece_word_ids[:5])
# print('------------------------------------------')

In [25]:
del PlaintextCorpusReader
del data_limit
del df
del filtered_words_ids
del max_sentence
del max_sentence_lenght
del max_word_count
del nltk
del min_word_count
del re 
del section_texts
del sent_tokenize
del sentence
del sentences
del sorted_vocabulary_with_frequency
del stop_words
del stopwords
del text 
del texts
del total_sentences
del total_words
del vocabulary_with_frequency
del word
del word_id
del word_lenght
del word_tokenize               
del words                      
del words_in_current_sentece   
del words_without_stop_words
del word_list

In [26]:
# %whos

### For loading the GPU of Macbook M1

In [27]:
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Generate 2D `vocabulary_size` List for Getting the `one-hot` Vector for a Word Id

In [28]:
one_hot = [[0 for i in range(vocabulary_size)] for j in range(vocabulary_size)]

for index in range(1, vocabulary_size):
    one_hot[index][index] = 1

### Helper Function Save and Load the Model and Informatino Data

In every cases name will be `pari` for pair-model, `skip` for skip-gram model and `cbow` for CBOW model.<br>

In [29]:
def get_path_name(name):
    if name == 'pair':
        return 'save/model_pair/'
    elif name == 'skip':
        return 'save/model_skip/'
    elif name == 'cbow':
        return 'save/model_cbow/'
    
def get_unique_file_name():
    return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")


def save_model(name, model, file_name):
    path = get_path_name(name)+'model/'
    
    if not os.path.exists(path):
        os.makedirs(path)
        
    if name == 'cbow':
        model.save(path+file_name+'/')
    else:
         model.save(path+file_name+'.h5')
            
            
def load_model(name, file_name):
    path = get_path_name(name)+'model/'
    
    if name == 'cbow':
        print(os.getcwd()+path+file_name)
        return keras.models.load_model(
            os.getcwd()+'/'+path+'/'+file_name+'/', custom_objects={"Average": Average}
        )
    
    return tf.keras.models.load_model(path+file_name+'.h5')


def get_model_list(name):
    path = get_path_name(name)+'model/'
    
    if os.path.exists(path):
        return os.listdir(path)
    
def save_info(name, file_name, data):
    path = get_path_name(name)+'data_info/'
    
    if not os.path.exists(path):
        os.makedirs(path)
            
    with open(path+file_name+'.yaml', 'w') as file:
        documents = yaml.dump(data, file, sort_keys=False)
    
    
def get_info_list(name):
    path = get_path_name(name)+'data_info/'
    
    if os.path.exists(path):
        return os.listdir(path)
        

def get_info(name, file_name):
    path = get_path_name(name)+'data_info/'
    
    if os.path.exists(path):
            with open(path+file_name+'.yaml') as file:
                return yaml.load(file, Loader=yaml.Loader)

In [30]:
file_name = get_unique_file_name()

# `Pair-Model`
<br> 
This is the primary version of skip-gram model.<br> As in this model we predict a context word for a target word, that's why I name this model as <b>Pair-Model</b>.

For buiding this model at first we need to calculte the positive positive skip-grams for a target and context word. We may also take the negative samples, but in this project I have just used the positive skip-grams.<br>

After creating the postivie skip-gram, I have encoded them in one-hot vector. 
Then used the `target word` as `input` and `context word` as `output`.<br>
Also if we want, we may also input the `context word` as `input` and `target word` as `output` to create model as like as `CBOW`. In this proejct I have just created the first one, which one I'm calling is the <b>Pair-Model</b>. 

</br>

### Copy `data_info` to `data_info_pair` for Saving the Informaiton of Pair-Model to `data_info_pair`

In [31]:
data_info_pair = data_info.copy()

### Create skip-gram(taget, context) for All of the Words and Convert Them into One-Hot Vector

In [32]:
X_target = []
Y_context = []

for ids in sentece_word_ids:
    skip, _ = tf.keras.preprocessing.sequence.skipgrams(
        ids,
        vocabulary_size,
        window_size=window_size, negative_samples=0,
        shuffle=False,
        categorical=True)
    for target, context in skip:
        X_target.append(one_hot[target])
        Y_context.append(one_hot[context])
print(np.array(X_target).shape)
print(vocabulary_size)

(5710, 50)
50


In [33]:
data_info_pair['Total pairs of target and context words'] = len(X_target)

### Train-Test Split
`20%` data for testing.

In [34]:
X_train_pair, X_test_pair, Y_train_pair, Y_test_pair = train_test_split(X_target, Y_context, test_size=0.2, random_state=0)

### Create Model for `Pair-Model`

In [35]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding_pair')(inp)
x = Dense(vocabulary_size, activation='softmax')(x)

model_pair = Model(inputs=inp, outputs=x)
model_pair.summary()

Metal device set to: Apple M1
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
w2v_embedding_pair (Dense)   (None, 200)               10200     
_________________________________________________________________
dense (Dense)                (None, 50)                10050     
Total params: 20,250
Trainable params: 20,250
Non-trainable params: 0
_________________________________________________________________


2021-11-20 00:23:15.356085: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-20 00:23:15.356199: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Compile & Fit the Pair-Model

Decrease the batch size for saving the RAM :(

In [92]:
training_started = datetime.datetime.now()

model_pair.compile(loss=losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
history_pair = model_pair.fit(X_train_pair, Y_train_pair, epochs = 3, batch_size=1, verbose=1, shuffle=False)

NameError: name 'model_pair' is not defined

In [93]:
training_end = datetime.datetime.now()
data_info_pair['Training time'] =  training_end - training_started
data_info_pair['Pair-Model training history'] = history_pair.history

NameError: name 'data_info_pair' is not defined

### Save Pair-Model 

In [39]:
save_model('pair', model_pair, file_name)

### Testing Pair-Model Using Testing Data

In [40]:
loss, accuracy = model_pair.evaluate(X_test_pair, Y_test_pair, verbose=2)

36/36 - 0s - loss: 3.1583 - accuracy: 0.2102


2021-11-20 00:23:48.372086: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [41]:
data_info_pair['Pair-Model evaluatation loss'] = loss
data_info_pair['Pair-Model evaluatation accuracy'] = accuracy
print('Evaluated loss',loss, 'and accuracy', accuracy)

Evaluated loss 3.158266067504883 and accuracy 0.2101576179265976


### Save Pair-Model Information

In [42]:
save_info('pair',file_name, data_info_pair)

### Save the Vector and Metadata for Pair-Model
Projecting the model data, please browse the following link.<br>
https://projector.tensorflow.org

In [43]:
weights = model_pair.get_layer('w2v_embedding_pair').get_weights()[0]

path = get_path_name('pair') + 'vector_metadata/'

if not os.path.exists(path):
        os.makedirs(path)

out_v = io.open(path+file_name+'vectors.tsv', 'w', encoding='utf-8')
out_m = io.open(path+file_name+'metadata.tsv', 'w', encoding='utf-8')

for index in range(1, vocabulary_size):
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(id_to_word[index] + "\n")
out_v.close()
out_m.close()

In [44]:
print('End Pair Model!')

End Pair Model!


In [45]:
del X_target                    
del Y_context 
del X_test_pair          
del X_train_pair                              
del Y_test_pair            
del Y_train_pair          
del accuracy              
del  context              
del data_info_pair        
del history_pair         
del ids                    
del index                  
del inp                   
del loss                   
del out_m                  
del out_v                 
del path                  
del skip                  
del softmax               
del target                 
del training_end          
del training_started      
del vec                   
del weights                
del x  
del model_pair

In [46]:
# %whos

# Data Processing for the CBOW and SKIP-GRAM Model

In CBOW and SKIP-GRAM models, we will be needed the `target word` and their corresponding `context words`. 
For creating the context word for a target word, we need to consider the total `window_size * 2` words. Among the `window_size * 2`, `window_size` words will be on left side of the target word and `window_size` words fo the right side. If there number of words a target word is less than the `window_size * 2`, then I have use the zero padding. 


<br> This function creates the target words and their corrosponding context words.<br>
return:<br>
------<b>target_id:</b> of the target word.<br>
------<b>context_ids:</b> of the context words. I have also used the zero padding to make total number of `window_size * 2` target words.<br>
<br>

In [94]:
def get_target_contexts(word_ids, window_size):
    target_id = []
    context_ids = []
    
    for ids in word_ids:
        for index, word_id in enumerate(ids):
            if not word_id:
                continue
                
            window_start = max(0, index - window_size)
            window_end = min(len(ids), index + window_size + 1)
            
            target_id.append(word_id)
            context_ids.append([ids[window_index] for window_index in range(window_start, window_end) if window_index != index])
                    
    return target_id, pad_sequences(context_ids, maxlen = window_size * 2)

In [48]:
target_id, context_ids =  get_target_contexts(sentece_word_ids, window_size)

In [49]:
data_info['Target words'] = len(target_id)
data_info['Context words with zero padding'] = len(context_ids) * window_size * 2

### Express `target_id` &  `context_ids` using one-hot encoding 

In [50]:
one_hot_target_id = [one_hot[id] for id in target_id]
one_hot_context_ids = []

for context_id in context_ids:
    one_hot_context_ids.append([one_hot[id] for id in context_id])

### Take the sum of input vecotor

In [51]:
X = []
for context in one_hot_context_ids:
    X.append([sum(x) for x in zip(*context)])

In [96]:
del context             
del context_id             
del context_ids                              
del target_id              
del one_hot

NameError: name 'context' is not defined

In [53]:
# %whos

# CBOW Model

In [54]:
data_info_cbow =  data_info.copy()

### Split training testing

In [55]:
X_train_cbow, X_test_cbow, Y_train_cbow, Y_test_cbow = train_test_split(X, one_hot_target_id, test_size=0.2, random_state=0)

#### Extending keras layer for taking the average

In [56]:
class Average(keras.layers.Layer):
    def __init__(self, units=32, input_dim=32):
        super(Average, self).__init__()

    def call(self, inputs):
        return tf.math.divide(inputs, window_size*2)
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [57]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding_cbow')(inp)
x = Average(200)(x)
x = Dense(vocabulary_size, activation='softmax')(x)

model_cbow = Model(inputs=inp, outputs=x)
model_cbow.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
w2v_embedding_cbow (Dense)   (None, 200)               10200     
_________________________________________________________________
average (Average)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                10050     
Total params: 20,250
Trainable params: 20,250
Non-trainable params: 0
_________________________________________________________________


### Fit the CBOW model

In [58]:
training_started = datetime.datetime.now()

In [59]:
model_cbow.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True), optimizer='adam', metrics=['accuracy'])
history_cbow = model_cbow.fit(X_train_cbow, Y_train_cbow, epochs = 3, batch_size=50, verbose=1, shuffle=False)

Epoch 1/3

2021-11-20 00:23:48.867985: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


In [60]:
training_end = datetime.datetime.now()
data_info_cbow['training time'] =  training_end - training_started
data_info_cbow['cbow model training history'] = history_cbow.history

### Save CBOW Model

In [61]:
save_model('cbow', model_cbow, file_name)

INFO:tensorflow:Assets written to: save/model_cbow/model/2021-11-20_00-23-15-316636/assets


2021-11-20 00:23:49.342209: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


### Testing CBOW Model

In [62]:
loss, accuracy = model_cbow.evaluate(X_test_cbow, Y_test_cbow, verbose=2)

8/8 - 0s - loss: 3.7658 - accuracy: 0.2926


2021-11-20 00:23:49.607976: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [63]:
data_info_cbow['cbow model evaluatation loss'] = loss
data_info_cbow['cbow model evaluatation accuracy'] = accuracy
print('Evaluated loss',loss, 'and accuracy', accuracy)

Evaluated loss 3.765805721282959 and accuracy 0.2925764322280884


### Save CBOW model information

In [64]:
save_info('cbow',file_name, data_info_cbow)

In [65]:
weights_cbow = model_cbow.get_layer('w2v_embedding_cbow').get_weights()[0]

path_cbow = get_path_name('cbow') + 'vector_metadata/'


if not os.path.exists(path_cbow):
        os.makedirs(path_cbow)

out_v = io.open(path_cbow+file_name+'vectors.tsv', 'w', encoding='utf-8')
out_m = io.open(path_cbow+file_name+'metadata.tsv', 'w', encoding='utf-8')

for index in range(1, vocabulary_size):
    vec = weights_cbow[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(id_to_word[index] + "\n")
out_v.close()
out_m.close()

In [66]:
print('End of CBOW model')

End of CBOW model


In [67]:
del X                     
del X_test_cbow            
del X_train_cbow           
del Y_test_cbow            
del Y_train_cbow          
del accuracy               
del data_info_cbow        
del history_cbow                     
del inp                   
del loss                             
del pad_sequences          
del training_end           
del training_started      
del x   
del model_cbow 
del out_m
del index                                  
del out_v                  
del path_cbow             
del vec                   
del weights_cbow           

In [68]:
# %whos

## SKIP-GRAMS

In [69]:
data_info_skip = data_info.copy()
del data_info

### Express the context ids(window_size *2) of a targetd id into a single row

In [70]:
flatten_one_hot_context_ids = [sum(one_hot_context_id, []) for one_hot_context_id in one_hot_context_ids]

### Split training testing data

In [71]:
X_train_skip, X_test_skip, Y_train_skip, Y_test_skip = train_test_split(one_hot_target_id, flatten_one_hot_context_ids, test_size=0.2, random_state=0)

### Create SKIP-GRAMS Model

In [72]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding_skip')(inp)
x = Dense(vocabulary_size * window_size * 2, activation='softmax')(x)
model_skip = Model(inputs=inp, outputs=x)
model_skip.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
w2v_embedding_skip (Dense)   (None, 200)               10200     
_________________________________________________________________
dense_2 (Dense)              (None, 500)               100500    
Total params: 110,700
Trainable params: 110,700
Non-trainable params: 0
_________________________________________________________________


### Fit skip-grams model

In [73]:
training_started = datetime.datetime.now()

In [74]:
model_skip.compile(loss=losses.categorical_crossentropy, optimizer='rmsprop', metrics=['accuracy'])
history_skip = model_skip.fit(X_train_skip, Y_train_skip, epochs = 5, batch_size=1, verbose=1, shuffle=False)

Epoch 1/5
 20/912 [..............................] - ETA: 4s - loss: 33.8878 - accuracy: 0.0000e+00 

2021-11-20 00:23:50.606717: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [75]:
training_end = datetime.datetime.now()
data_info_skip['training time'] =  training_end - training_started
data_info_skip['skip-grams model training history'] = history_skip.history

### Save SKIP_GRAM Model

In [76]:
save_model('skip', model_skip, file_name)

### Test SKIP_GRAM Model

In [77]:
loss, accuracy = model_skip.evaluate(X_test_skip, Y_test_skip, verbose=2)

8/8 - 0s - loss: 29.3598 - accuracy: 0.0524


2021-11-20 00:24:14.910679: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [78]:
data_info_skip['skip model evaluatation loss'] = loss
data_info_skip['skip model evaluatation accuracy'] = accuracy
print('Evaluated loss',loss, 'and accuracy', accuracy)

Evaluated loss 29.359811782836914 and accuracy 0.052401747554540634


### Save Data info for Skip-Gram model


In [79]:
save_info('skip',file_name, data_info_skip)

### Save Vector and Metadata For SKIP-GRAM

In [80]:
weights_skip = model_skip.get_layer('w2v_embedding_skip').get_weights()[0]

path_skip = get_path_name('skip') + 'vector_metadata/'


if not os.path.exists(path_skip):
        os.makedirs(path_skip)

out_v = io.open(path_skip+file_name+'vectors.tsv', 'w', encoding='utf-8')
out_m = io.open(path_skip+file_name+'metadata.tsv', 'w', encoding='utf-8')

for index in range(1, vocabulary_size):
    vec = weights_skip[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(id_to_word[index] + "\n")
out_v.close()
out_m.close()

In [81]:
print('End Skip-Gram Model')

End Skip-Gram Model


# Summary 

### Pair wise input model

In [82]:
pair = load_model('pair', file_name)
print(pair.summary())

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
w2v_embedding_pair (Dense)   (None, 200)               10200     
_________________________________________________________________
dense (Dense)                (None, 50)                10050     
Total params: 20,250
Trainable params: 20,250
Non-trainable params: 0
_________________________________________________________________
None


In [83]:
get_info('pair', file_name)

{'data_limit': 300,
 'vocabulary_size': 50,
 'window_size': 5,
 'max_sentence_lenght': 1000,
 'max_word_count': 100,
 'min_word_count': 5,
 'max_sentence': 1000,
 'Total sentences before filtering': 6087,
 'Total words before filtering': 80055,
 'Total sentences after filtering': 190,
 'Total words after filtering': 1141,
 'Total unique words in dictionary': 50,
 'Total pairs of target and context words': 5710,
 'training time': datetime.timedelta(seconds=32, microseconds=722405),
 'pair model training history': {'loss': [3.3004884719848633,
   3.0065228939056396,
   2.9561688899993896],
  'accuracy': [0.19943082332611084, 0.22898423671722412, 0.23095446825027466]},
 'Pair-Model evaluatation loss': 3.158266067504883,
 'Pair-Model evaluatation accuracy': 0.2101576179265976}

### CBOW Model

In [84]:
cbow = load_model('cbow', file_name)
print(cbow.summary())

/Users/ruman/Documents/project/word2Vec/word2vecsave/model_cbow/model/2021-11-20_00-23-15-316636
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
w2v_embedding_cbow (Dense)   (None, 200)               10200     
_________________________________________________________________
average (Average)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                10050     
Total params: 20,250
Trainable params: 20,250
Non-trainable params: 0
_________________________________________________________________
None


In [85]:
get_info('cbow', file_name)

{'data_limit': 300,
 'vocabulary_size': 50,
 'window_size': 5,
 'max_sentence_lenght': 1000,
 'max_word_count': 100,
 'min_word_count': 5,
 'max_sentence': 1000,
 'Total sentences before filtering': 6087,
 'Total words before filtering': 80055,
 'Total sentences after filtering': 190,
 'Total words after filtering': 1141,
 'Total unique words in dictionary': 50,
 'Target words': 1141,
 'Context words with zero padding': 11410,
 'training time': datetime.timedelta(microseconds=649136),
 'cbow model training history': {'loss': [3.893620491027832,
   3.842318058013916,
   3.7911999225616455],
  'accuracy': [0.0975877195596695, 0.22807016968727112, 0.2850877344608307]},
 'cbow model evaluatation loss': 3.765805721282959,
 'cbow model evaluatation accuracy': 0.2925764322280884}

### Skip-Gram Model

In [86]:
skip = load_model('skip', file_name)
print(skip.summary())

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
w2v_embedding_skip (Dense)   (None, 200)               10200     
_________________________________________________________________
dense_2 (Dense)              (None, 500)               100500    
Total params: 110,700
Trainable params: 110,700
Non-trainable params: 0
_________________________________________________________________
None


In [87]:
get_info('skip', file_name)

{'data_limit': 300,
 'vocabulary_size': 50,
 'window_size': 5,
 'max_sentence_lenght': 1000,
 'max_word_count': 100,
 'min_word_count': 5,
 'max_sentence': 1000,
 'Total sentences before filtering': 6087,
 'Total words before filtering': 80055,
 'Total sentences after filtering': 190,
 'Total words after filtering': 1141,
 'Total unique words in dictionary': 50,
 'Target words': 1141,
 'Context words with zero padding': 11410,
 'training time': datetime.timedelta(seconds=24, microseconds=934082),
 'skip-grams model training history': {'loss': [29.76048469543457,
   27.284664154052734,
   25.813173294067383,
   25.298664093017578,
   25.568836212158203],
  'accuracy': [0.030701754614710808,
   0.06359649449586868,
   0.06688596308231354,
   0.07346491515636444,
   0.07236842066049576]},
 'skip model evaluatation loss': 29.359811782836914,
 'skip model evaluatation accuracy': 0.052401747554540634}

In [88]:
print('THE END')

THE END
