### Iimport libraries

In [1]:
import numpy as np
import pandas as pd
import sys
import yaml
import os
import re
import io
import datetime

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout
from keras.preprocessing import text
from tensorflow.keras.models import Model
from tensorflow.keras import losses, optimizers
from tensorflow.keras.activations import softmax
from keras.preprocessing import sequence


from itertools import islice

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

### Load data

In [2]:
df = pd.read_csv('enwiki.csv')
print('------------------Head------------------')
print(df.head())
print('------------------Tail------------------')
print(df.tail())

------------------Head------------------
   ARTICLE_ID      TITLE                 SECTION_TITLE  \
0           0  Anarchism                  Introduction   
1           0  Anarchism     Etymology and terminology   
2           0  Anarchism                       History   
3           0  Anarchism  Anarchist schools of thought   
4           0  Anarchism   Internal issues and debates   

                                        SECTION_TEXT  
0  \n\n\n\n\n\n'''Anarchism''' is a political phi...  
1  \n\nThe term ''anarchism'' is a compound word ...  
2  \n\n===Origins===\nWoodcut from a Diggers docu...  
3  \nPortrait of philosopher Pierre-Joseph Proudh...  
4  \nconsistent with anarchist values is a contro...  
------------------Tail------------------
        ARTICLE_ID              TITLE    SECTION_TITLE  \
265134       30475  Triboluminescence  Further reading   
265135       30475  Triboluminescence   External links   
265136       30476       Markov chain     Introduction   
265137 

### Remove unnecessary data from dataFrame

# Set constrains for filtering the data

In [3]:
data_info = {}

In [4]:
# set 0 for taking all data
data_limit = 300 
max_sentence_lenght = 1000
max_word_count = 100
min_word_count = 5
max_sentence = 1000
vocabulary_size = 30
window_size = 4


data_info['data_limit']= data_limit
data_info['max_sentence_lenght'] = max_sentence_lenght
data_info['max_word_count'] = max_word_count 
data_info['min_word_count'] = min_word_count
data_info['max_sentence'] = max_sentence 
data_info['vocabulary_size'] = vocabulary_size
data_info['window_size'] = window_size 

### Create a dictioanry for saving the data related information

In [5]:
df.drop(['ARTICLE_ID', 'TITLE', 'SECTION_TITLE'], axis=1)
section_texts = df['SECTION_TEXT'].apply(str)
print('Total section texts ----->', len(section_texts))

Total section texts -----> 265139


In [6]:
if data_limit != 0:
    section_texts = section_texts[:data_limit]
print('Total section texts ----->', len(section_texts))

Total section texts -----> 300


### Convert text to sentences

In [7]:
sentences = []
for texts in section_texts:
    for sentence in sent_tokenize(texts):
        if len(sentence) < max_sentence_lenght:
            sentences.append(sentence.lower())

In [8]:
total_sentences = len(sentences)
print('Total sentences', total_sentences)

Total sentences 6661


### Convert sentences to words and create vocabulary with frequency

In [9]:
stop_words = set(stopwords.words('english'))
word_list = []
vocabulary_with_frequency = {}

for sentence in sentences:
    words = word_tokenize(sentence)
    words_without_stop_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) != 1]
    
    word_lenght = len(words_without_stop_words)
    if word_lenght <= max_word_count and word_lenght >= min_word_count:
        word_list.append(words_without_stop_words)
        
        for word in words_without_stop_words:
            if word not in vocabulary_with_frequency.keys():
                vocabulary_with_frequency[word] = 1
            else:
                vocabulary_with_frequency[word] += 1

### Total words and vocabulary size

In [10]:
# After filtering the words
total_sentences = len(word_list)

total_words = 0
for words in word_list:
    total_words += len(words)
print('Total words in corpus', total_words)
print('Vocabulary size', len(vocabulary_with_frequency))

Total words in corpus 80055
Vocabulary size 15960


In [11]:
sorted_vocabulary_with_frequency = sorted(vocabulary_with_frequency.items(), key=lambda x: x[1], reverse=True)

### Remove less freuquent word form dictionary and assign unique id to vocabulary word


<br> 
Create two vocabulary.<br> 
<b>word_to_id</b> for gettig the word for an id. <br>
<b>id_to_word</b> for gettign the id for word. <br>
</br>


In [12]:
word_to_id = {}
word_to_id[0] = 'PAD'
id_to_word = {}
id_to_word['PAD'] = 0
word_id = 1

for word, _ in sorted_vocabulary_with_frequency:
    if word_id < vocabulary_size:
        word_to_id[word] = word_id
        id_to_word[word_id] = word
        word_id += 1
        
print(list(islice(word_to_id.items(), 15)))
print('-------------------------------------')
print(list(islice(id_to_word.items(), 15)))

[(0, 'PAD'), ('lincoln', 1), ('also', 2), ('one', 3), ('apollo', 4), ('state', 5), ('first', 6), ('achilles', 7), ('alabama', 8), ('aristotle', 9), ('time', 10), ('autism', 11), ('many', 12), ('new', 13), ('century', 14)]
-------------------------------------
[('PAD', 0), (1, 'lincoln'), (2, 'also'), (3, 'one'), (4, 'apollo'), (5, 'state'), (6, 'first'), (7, 'achilles'), (8, 'alabama'), (9, 'aristotle'), (10, 'time'), (11, 'autism'), (12, 'many'), (13, 'new'), (14, 'century')]


In [13]:
print('New dictionary size', vocabulary_size)

New dictionary size 30


### Convert word_list to word_id_list for expressign the words of a sentece using vocabulary id

<br> Remove the words which are not present in dictionary</br>

In [14]:
data_info['total sentence'] = total_sentences
data_info['total words'] = total_words

In [15]:
print('Before filtering total sentenc', total_sentences)
print('Before filtering total words', total_words)

sentece_word_ids = []
total_sentences = 0
total_words = 0

for words in word_list:
    filtered_words_ids = [word_to_id[word] for word in words if word in word_to_id.keys()]
    words_in_current_sentece = len(filtered_words_ids)
    
    if words_in_current_sentece >= min_word_count:
        total_sentences += 1
        total_words += words_in_current_sentece
        sentece_word_ids.append(filtered_words_ids)
        
print('--------------------------------------------')
print('After filtering total sentences', total_sentences)
print('After filtering total words', total_words)

data_info['After filtering total sentencs'] = total_sentences
data_info['After filtering total words'] = total_words

Before filtering total sentenc 6087
Before filtering total words 80055
--------------------------------------------
After filtering total sentences 82
After filtering total words 486


### Untill Now 
<br> 
<b>sentece_word_ids:</b> Sentence wise words's id. <br>
<b>word_to_id:</b> Vocabulary for getting the <b>word_id</b> for a <b>word</b>.<br>
<b>id_to_word:</b> Vocabulary for getting the <b>word</b> for a <b> word_id</b>.<br>
<b>positive_skip_grams:</b> skipgrams of target and context word pairs.
</br>

In [16]:
print('Total senteces', total_sentences)
print('Total words', total_words)
vocabulary_size = len(word_to_id)
print('Total unique words in vocabulary', vocabulary_size)

data_info['total_unique words in vocabulary'] = vocabulary_size

Total senteces 82
Total words 486
Total unique words in vocabulary 30


### Save data for later use
<br> It will help use to skip the data processing step

In [17]:
# def save_data(data, to_file):
#     if os.path.exists(to_file+'.yaml'):
#         os.remove(to_file+'.yaml')
    
#     with open(to_file+'.yaml', 'w') as file:
#         documents = yaml.dump(data, file, sort_keys=False)
    
# def load_data(from_file):
#     with open(from_file+'.yaml') as file:
#         return yaml.load(file, Loader=yaml.FullLoader)

### Save

In [18]:
# save_data(word_to_id, 'word_to_id')
# save_data(id_to_word, 'id_to_word')
# save_data(sentece_word_ids, 'sentece_word_ids')
# # save_data(positive_skip_grams, 'positive_skip_grams')

# ⚠️⚠️⚠️ Reset Everything ⚠️⚠️⚠️

In [19]:
# %reset

# Start again 🏃‍♂️🏃‍♂️🏃‍♂️

In [20]:
# import numpy as np
# import pandas as pd
# import sys
# import yaml
# import os
# import io

# import nltk
# from nltk.corpus import PlaintextCorpusReader
# from nltk.tokenize import word_tokenize
# from nltk.tokenize import sent_tokenize
# from nltk.corpus import stopwords

# import tensorflow as tf
# from tensorflow import keras

# from tensorflow.keras import layers
# from keras.preprocessing import text

# from itertools import islice
# from sklearn.model_selection import train_test_split

# from tensorflow.keras.layers import Embedding, Input, Dense, Dropout
# from tensorflow.keras.models import Model
# from tensorflow.keras import losses, optimizers
# from tensorflow.keras.activations import softmax

In [21]:
# def save_data(data, to_file):
#     if os.path.exists(to_file+'.yaml'):
#         os.remove('id_to_word.yaml')
    
#     with open(to_file+'.yaml', 'w') as file:
#         documents = yaml.dump(data, file, sort_keys=False)
    
# def load_data(from_file):
#     with open(from_file+'.yaml') as file:
#         return yaml.load(file, Loader=yaml.FullLoader)

### Load data

In [22]:
# id_to_word = load_data('id_to_word')
# word_to_id = load_data('word_to_id')
# sentece_word_ids = load_data('sentece_word_ids')
# # positive_skip_grams = load_data('positive_skip_grams')
# vocabulary_size = len(word_to_id)

#### Example after loadign data

In [23]:
print(list(islice(id_to_word.items(), 10)))
print('------------------------------------------')
print(list(islice(word_to_id.items(), 10)))
print('------------------------------------------')
print(sentece_word_ids[:5])
print('------------------------------------------')

[('PAD', 0), (1, 'lincoln'), (2, 'also'), (3, 'one'), (4, 'apollo'), (5, 'state'), (6, 'first'), (7, 'achilles'), (8, 'alabama'), (9, 'aristotle')]
------------------------------------------
[(0, 'PAD'), ('lincoln', 1), ('also', 2), ('one', 3), ('apollo', 4), ('state', 5), ('first', 6), ('achilles', 7), ('alabama', 8), ('aristotle', 9)]
------------------------------------------
[[16, 12, 26, 14, 24, 24], [5, 24, 19, 5, 6, 19], [24, 12, 24, 25, 24], [3, 24, 6, 24, 10], [6, 29, 24, 6, 24]]
------------------------------------------


In [24]:
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Generate 2D `vocabulary_size` list for getting the one-hot vector for a index

In [25]:
one_hot = [[0 for i in range(vocabulary_size)] for j in range(vocabulary_size)]

for index in range(1, vocabulary_size):
    one_hot[index][index] = 1

### Helper Function

In every cases name will be `pari`, `skip` or `cbow`.<br>

In [26]:
def get_path_name(name):
    if name == 'pair':
        return 'save/model_pair/'
    elif name == 'skip':
        return 'save/model_skip/'
    elif name == 'cbow':
        return 'save/model_cbow/'

In [27]:
def get_unique_file_name():
    return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")

In [28]:
def save_model(name, model, file_name):
    path = get_path_name(name)+'model/'
    
    if not os.path.exists(path):
        os.makedirs(path)
        
    if name == 'cbow':
        model.save(path+file_name+'/')
    else:
         model.save(path+file_name+'.h5')

In [29]:
def load_model(name, file_name):
    path = get_path_name(name)+'model/'
    
    if name == 'cbow':
        print(os.getcwd()+path+file_name)
        return keras.models.load_model(
            os.getcwd()+'/'+path+'/'+file_name+'/', custom_objects={"Average": Average}
        )
    
    return tf.keras.models.load_model(path+file_name+'.h5')


In [30]:
def get_model_list(name):
    path = get_path_name(name)+'model/'
    
    if os.path.exists(path):
        return os.listdir(path)

In [31]:
def save_info(name, file_name, data):
    path = get_path_name(name)+'data_info/'
    
    if not os.path.exists(path):
        os.makedirs(path)
            
    with open(path+file_name+'.yaml', 'w') as file:
        documents = yaml.dump(data, file, sort_keys=False)
    

In [32]:
def get_info_list(name):
    path = get_path_name(name)+'data_info/'
    
    if os.path.exists(path):
        return os.listdir(path)
        
        

In [33]:
def get_info(name, file_name):
    path = get_path_name(name)+'data_info/'
    
    if os.path.exists(path):
            with open(path+file_name+'.yaml') as file:
                return yaml.load(file, Loader=yaml.Loader)
            
#             yaml.load('Foo: !Ref bar', Loader=yaml.Loader)

In [34]:
file_name = get_unique_file_name()

# `Model - Pair Input`

### Save data_info to data_info_pair.
<br> Because `data_info` is common for all model</br>

In [35]:
data_info_pair = data_info.copy()

### Create (taget, context) pair for all of the words and convert them into one-hot encoding

In [36]:
X = []
Y = []

for ids in sentece_word_ids:
    skip, _ = tf.keras.preprocessing.sequence.skipgrams(
        ids,
        vocabulary_size,
        window_size=window_size, negative_samples=0,
        shuffle=False,
        categorical=True)
    for target, context in skip:
        X.append(one_hot[target])
        Y.append(one_hot[context])
print(np.array(X).shape)
print(vocabulary_size)

(2248, 30)
30


In [37]:
data_info_pair['Total target and context word pairs'] = len(X)

### Split training testing 

In [38]:
X_train_pair, X_test_pair, Y_train_pair, Y_test_pair = train_test_split(X, Y, test_size=0.2, random_state=42)

### Pairwise target, context model
<br>
Input target word in one-hot encoding format. <br>
Output context word of corespoding target word in one-hot encoding format. <br>

All of the targets and their corresponding context words are fitting at once. 
<br>

In [39]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding_pair')(inp)
x = Dense(vocabulary_size, activation='softmax')(x)

model_pair = Model(inputs=inp, outputs=x)
model_pair.summary()

Metal device set to: Apple M1
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
w2v_embedding_pair (Dense)   (None, 200)               6200      
_________________________________________________________________
dense (Dense)                (None, 30)                6030      
Total params: 12,230
Trainable params: 12,230
Non-trainable params: 0
_________________________________________________________________


2021-11-19 03:17:44.848953: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-19 03:17:44.849048: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Fite the Pair model

In [40]:
training_started = datetime.datetime.now()

In [41]:
model_pair.compile(loss=losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
history_pair = model_pair.fit(X_train_pair, Y_train_pair, epochs = 3, batch_size=2, verbose=1, shuffle=True)

2021-11-19 03:17:45.084152: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-19 03:17:45.084344: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-11-19 03:17:45.247742: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 1/3
Epoch 2/3
Epoch 3/3


In [42]:
training_end = datetime.datetime.now()
data_info_pair['training time'] =  training_end - training_started
data_info_pair['pair model training history'] = history_pair.history

### Save Pair Model 

In [43]:
save_model('pair', model_pair, file_name)

### Testing Pair Model

In [44]:
loss, accuracy = model_pair.evaluate(X_test_pair, Y_test_pair, verbose=2)

15/15 - 0s - loss: 2.5585 - accuracy: 0.3133


2021-11-19 03:17:57.400515: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [45]:
data_info_pair['pair model evaluatation loss'] = loss
data_info_pair['pair model evaluatation accuracy'] = accuracy
print('Evaluated loss',loss, 'and accuracy', accuracy)

Evaluated loss 2.558540105819702 and accuracy 0.31333333253860474


### Save Pair Model Information

In [46]:
save_info('pair',file_name, data_info_pair)

### Store the vector and metadata for Pair Model

In [47]:
weights = model_pair.get_layer('w2v_embedding_pair').get_weights()[0]

path = get_path_name('pair') + 'vector_metadata/'

if not os.path.exists(path):
        os.makedirs(path)

out_v = io.open(path+file_name+'vectors.tsv', 'w', encoding='utf-8')
out_m = io.open(path+file_name+'metadata.tsv', 'w', encoding='utf-8')

for index in range(1, vocabulary_size):
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(id_to_word[index] + "\n")
out_v.close()
out_m.close()

In [48]:
print('End Pair Model')

End Pair Model


# Data Processing for the CBOW and SKIP-GRAMS

<br> This function creates the target words and all of it's corrosponding context words.<br>
return:<br>
------<b>target_id:</b> id of the target word without encoding.<br>
------<b>context_ids:</b> All of the context words for a target_id<br>

<b>Note:</b> Zero padding is being used for returing total `window_size * 2` context words for a target word.<br>
<br>

In [49]:
from keras.preprocessing.sequence import pad_sequences

def get_target_contexts(word_ids, window_size):
    target_id = []
    context_ids = []

    one_hot_target_id = []
    one_hot_context_ids = []

    for ids in word_ids:
        for index, word_id in enumerate(ids):
            if not word_id:
                continue
                
            window_start = max(0, index - window_size)
            window_end = min(len(ids), index + window_size + 1)
            
            target_id.append(word_id)
            context_ids.append([ids[window_index] for window_index in range(window_start, window_end) if window_index != index])
                    
    return target_id, pad_sequences(context_ids, maxlen = window_size * 2)

In [50]:
target_id, context_ids =  get_target_contexts(sentece_word_ids, window_size)

In [51]:
data_info['Target words'] = len(target_id)
data_info['Context words with zero padding'] = len(context_ids) * window_size * 2

### Express `target_id` &  `context_ids` using one-hot encoding 

In [52]:
one_hot_target_id = [one_hot[id] for id in target_id]
one_hot_context_ids = []

for context_id in context_ids:
    one_hot_context_ids.append([one_hot[id] for id in context_id])

### Take the sum of input vecotor

In [53]:
X = []
for context in one_hot_context_ids:
    row = [0] * vocabulary_size
    for word in context:
        for idx in range(vocabulary_size):
            row[idx] += word[idx]
    X.append(row)

# CBOW

In [54]:
data_info_cbow =  data_info.copy()

### Split training testing

In [55]:
X_train_cbow, X_test_cbow, Y_train_cbow, Y_test_cbow = train_test_split(X, one_hot_target_id, test_size=0.2, random_state=42)

#### Extending keras layer for taking the average

In [56]:
class Average(keras.layers.Layer):
    def __init__(self, units=32, input_dim=32):
        super(Average, self).__init__()

    def call(self, inputs):
        return tf.math.divide(inputs, window_size*2)
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [57]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding_cbow')(inp)
x = Average(200)(x)
x = Dense(vocabulary_size, activation='softmax')(x)

model_cbow = Model(inputs=inp, outputs=x)
model_cbow.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
w2v_embedding_cbow (Dense)   (None, 200)               6200      
_________________________________________________________________
average (Average)            (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 30)                6030      
Total params: 12,230
Trainable params: 12,230
Non-trainable params: 0
_________________________________________________________________


### Fit the CBOW model

In [58]:
training_started = datetime.datetime.now()

In [59]:
model_cbow.compile(loss=losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
history_cbow = model_cbow.fit(X_train_cbow, Y_train_cbow, epochs = 3, batch_size=50, verbose=1, shuffle=False)

Epoch 1/3
Epoch 2/3
Epoch 3/3
1/8 [==>...........................] - ETA: 0s - loss: 3.3258 - accuracy: 0.2600

2021-11-19 03:17:57.698100: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




In [60]:
training_end = datetime.datetime.now()
data_info_cbow['training time'] =  training_end - training_started
data_info_cbow['cbow model training history'] = history_cbow.history

### Save CBOW Model

In [61]:
save_model('cbow', model_pair, file_name)

INFO:tensorflow:Assets written to: save/model_cbow/model/2021-11-19_03-17-44-823361/assets


2021-11-19 03:17:57.973651: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


### Testing CBOW Model

In [62]:
loss, accuracy = model_cbow.evaluate(X_test_cbow, Y_test_cbow, verbose=2)

2021-11-19 03:17:58.145453: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


4/4 - 0s - loss: 3.3266 - accuracy: 0.1939


In [63]:
data_info_cbow['cbow model evaluatation loss'] = loss
data_info_cbow['cbow model evaluatation accuracy'] = accuracy
print('Evaluated loss',loss, 'and accuracy', accuracy)

Evaluated loss 3.326587438583374 and accuracy 0.19387754797935486


### Save CBOW model information

In [64]:
save_info('cbow',file_name, data_info_cbow)

In [65]:
print('End of CBOW model')

End of CBOW model


## SKIP-GRAMS

In [66]:
data_info_skip = data_info.copy()

### Express the context ids(window_size *2) of a targetd id into a single row

In [67]:
flatten_one_hot_context_ids = [sum(one_hot_context_id, []) for one_hot_context_id in one_hot_context_ids]

### Split training testing data

In [68]:
X_train_skip, X_test_skip, Y_train_skip, Y_test_skip = train_test_split(one_hot_target_id, flatten_one_hot_context_ids, test_size=0.2, random_state=42)

### Create SKIP-GRAMS Model

In [69]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding_skip')(inp)
x = Dense(vocabulary_size * window_size * 2, activation='softmax')(x)
model_skip = Model(inputs=inp, outputs=x)
model_skip.summary()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
w2v_embedding_skip (Dense)   (None, 200)               6200      
_________________________________________________________________
dense_2 (Dense)              (None, 240)               48240     
Total params: 54,440
Trainable params: 54,440
Non-trainable params: 0
_________________________________________________________________


### Fit skip-grams model

In [70]:
training_started = datetime.datetime.now()

In [71]:
model_skip.compile(loss=losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
history_skip = model_skip.fit(X_train_skip, Y_train_skip, epochs = 5, batch_size=50, verbose=1, shuffle=False)

Epoch 1/5
Epoch 2/5
Epoch 3/5


2021-11-19 03:17:58.495028: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 4/5
Epoch 5/5


In [72]:
training_end = datetime.datetime.now()
data_info_skip['training time'] =  training_end - training_started
data_info_skip['skip-grams model training history'] = history_skip.history

### Save SKIP_GRAM Model

In [73]:
save_model('skip', model_skip, file_name)

### Test SKIP_GRAM Model

In [74]:
loss, accuracy = model_skip.evaluate(X_test_skip, Y_test_skip, verbose=2)

4/4 - 0s - loss: 24.8106 - accuracy: 0.0000e+00


2021-11-19 03:17:58.901534: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [75]:
data_info_skip['skip model evaluatation loss'] = loss
data_info_skip['skip model evaluatation accuracy'] = accuracy
print('Evaluated loss',loss, 'and accuracy', accuracy)

Evaluated loss 24.81056785583496 and accuracy 0.0


### Data info for Skip-Grams model


In [76]:
save_info('skip',file_name, data_info_skip)

In [77]:
print('End Skip-Gram Model')

End Skip-Gram Model


# Summary 

### Pair wise input model

In [78]:
pair = load_model('pair', file_name)
print(pair.summary())

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
w2v_embedding_pair (Dense)   (None, 200)               6200      
_________________________________________________________________
dense (Dense)                (None, 30)                6030      
Total params: 12,230
Trainable params: 12,230
Non-trainable params: 0
_________________________________________________________________
None


In [79]:
get_info('pair', file_name)

{'data_limit': 300,
 'max_sentence_lenght': 1000,
 'max_word_count': 100,
 'min_word_count': 5,
 'max_sentence': 1000,
 'vocabulary_size': 30,
 'window_size': 4,
 'total sentence': 6087,
 'total words': 80055,
 'After filtering total sentencs': 82,
 'After filtering total words': 486,
 'total_unique words in vocabulary': 30,
 'Total target and context word pairs': 2248,
 'training time': datetime.timedelta(seconds=12, microseconds=409463),
 'pair model training history': {'loss': [2.801678419113159,
   2.404081344604492,
   2.3332059383392334],
  'accuracy': [0.30088990926742554, 0.3431590795516968, 0.3426029086112976]},
 'pair model evaluatation loss': 2.558540105819702,
 'pair model evaluatation accuracy': 0.31333333253860474}

### CBOW Model

In [80]:
cbow = load_model('cbow', file_name)
print(cbow.summary())

/Users/ruman/Documents/project/word2Vec/word2vecsave/model_cbow/model/2021-11-19_03-17-44-823361
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
w2v_embedding_pair (Dense)   (None, 200)               6200      
_________________________________________________________________
dense (Dense)                (None, 30)                6030      
Total params: 12,230
Trainable params: 12,230
Non-trainable params: 0
_________________________________________________________________
None


In [81]:
get_info('cbow', file_name)

{'data_limit': 300,
 'max_sentence_lenght': 1000,
 'max_word_count': 100,
 'min_word_count': 5,
 'max_sentence': 1000,
 'vocabulary_size': 30,
 'window_size': 4,
 'total sentence': 6087,
 'total words': 80055,
 'After filtering total sentencs': 82,
 'After filtering total words': 486,
 'total_unique words in vocabulary': 30,
 'Target words': 486,
 'Context words with zero padding': 3888,
 'training time': datetime.timedelta(microseconds=343414),
 'cbow model training history': {'loss': [3.3940224647521973,
   3.3482391834259033,
   3.3044378757476807],
  'accuracy': [0.04123711585998535, 0.213917538523674, 0.30927836894989014]},
 'cbow model evaluatation loss': 3.326587438583374,
 'cbow model evaluatation accuracy': 0.19387754797935486}

### Skip-Gram Model

In [82]:
skip = load_model('skip', file_name)
print(skip.summary())

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(None, 30)]              0         
_________________________________________________________________
w2v_embedding_skip (Dense)   (None, 200)               6200      
_________________________________________________________________
dense_2 (Dense)              (None, 240)               48240     
Total params: 54,440
Trainable params: 54,440
Non-trainable params: 0
_________________________________________________________________
None


In [83]:
get_info('skip', file_name)

{'data_limit': 300,
 'max_sentence_lenght': 1000,
 'max_word_count': 100,
 'min_word_count': 5,
 'max_sentence': 1000,
 'vocabulary_size': 30,
 'window_size': 4,
 'total sentence': 6087,
 'total words': 80055,
 'After filtering total sentencs': 82,
 'After filtering total words': 486,
 'total_unique words in vocabulary': 30,
 'Target words': 486,
 'Context words with zero padding': 3888,
 'training time': datetime.timedelta(microseconds=574650),
 'skip-grams model training history': {'loss': [25.050983428955078,
   24.726524353027344,
   24.39234733581543,
   24.008689880371094,
   23.560100555419922],
  'accuracy': [0.012886598706245422,
   0.030927836894989014,
   0.0335051566362381,
   0.030927836894989014,
   0.030927836894989014]},
 'skip model evaluatation loss': 24.81056785583496,
 'skip model evaluatation accuracy': 0.0}

In [84]:
%who

Average	 Dense	 Dropout	 Embedding	 Input	 Model	 PlaintextCorpusReader	 X	 X_test_cbow	 
X_test_pair	 X_test_skip	 X_train_cbow	 X_train_pair	 X_train_skip	 Y	 Y_test_cbow	 Y_test_pair	 Y_test_skip	 
Y_train_cbow	 Y_train_pair	 Y_train_skip	 accuracy	 cbow	 context	 context_id	 context_ids	 data_info	 
data_info_cbow	 data_info_pair	 data_info_skip	 data_limit	 datetime	 df	 file_name	 filtered_words_ids	 flatten_one_hot_context_ids	 
get_info	 get_info_list	 get_model_list	 get_path_name	 get_target_contexts	 get_unique_file_name	 history_cbow	 history_pair	 history_skip	 
id_to_word	 ids	 idx	 index	 inp	 io	 islice	 keras	 layers	 
load_model	 loss	 losses	 max_sentence	 max_sentence_lenght	 max_word_count	 min_word_count	 model_cbow	 model_pair	 
model_skip	 nltk	 np	 one_hot	 one_hot_context_ids	 one_hot_target_id	 optimizers	 os	 out_m	 
out_v	 pad_sequences	 pair	 path	 pd	 physical_devices	 plt	 re	 row	 
save_info	 save_model	 section_texts	 sent_tokenize	 sentece_word_ids	 s

In [85]:
%whos

Variable                           Type                    Data/Info
--------------------------------------------------------------------
Average                            type                    <class '__main__.Average'>
Dense                              type                    <class 'keras.layers.core.Dense'>
Dropout                            type                    <class 'keras.layers.core.Dropout'>
Embedding                          type                    <class 'keras.layers.embeddings.Embedding'>
Input                              function                <function Input at 0x16981baf0>
Model                              type                    <class 'keras.engine.training.Model'>
PlaintextCorpusReader              type                    <class 'nltk.corpus.reade<...>t.PlaintextCorpusReader'>
X                                  list                    n=486
X_test_cbow                        list                    n=98
X_test_pair                        list              