### Import libraries

In [1]:
import numpy as np
import pandas as pd
import yaml
import os
import re
import io
import datetime

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout
from keras.preprocessing import text
from tensorflow.keras.models import Model
from tensorflow.keras import losses, optimizers
from tensorflow.keras.activations import softmax
from keras.preprocessing.sequence import pad_sequences

from itertools import islice

from sklearn.model_selection import train_test_split

# import matplotlib.pyplot as plt
# %matplotlib inline

### Data Set

<br>
<b>Total Aritcles:</b> 265139 <br>
<b>Total Senteces:</b> 3681645 <br>
<b>Total Words:</b> 46842818 <br>
<b>Total Unique Words:</b> 634540 <br>

<a href="https://www.kaggle.com/jkkphys/english-wikipedia-articles-20170820-sqlite/discussion/149578">Data Source</a>

</br>

### Load Data

In [2]:
df = pd.read_csv('train_ag.csv')
print('------------------Head------------------')
print(df.head())
print('------------------Tail------------------')
print(df.tail())

------------------Head------------------
   Class Index                                              Title  \
0            3  Wall St. Bears Claw Back Into the Black (Reuters)   
1            3  Carlyle Looks Toward Commercial Aerospace (Reu...   
2            3    Oil and Economy Cloud Stocks' Outlook (Reuters)   
3            3  Iraq Halts Oil Exports from Main Southern Pipe...   
4            3  Oil prices soar to all-time record, posing new...   

                                         Description  
0  Reuters - Short-sellers, Wall Street's dwindli...  
1  Reuters - Private investment firm Carlyle Grou...  
2  Reuters - Soaring crude prices plus worries\ab...  
3  Reuters - Authorities have halted oil export\f...  
4  AFP - Tearaway world oil prices, toppling reco...  
------------------Tail------------------
        Class Index                                              Title  \
119995            1  Pakistan's Musharraf Says Won't Quit as Army C...   
119996            2      

### Remove unnecessary data from dataFrame

# Set constrains for filtering the data

In [3]:
# For saving the information offline


In [4]:
# Set `data_limit` zero (0) for processing all of the data.
data_limit = 0
vocabulary_size = 400 
window_size = 5
max_sentence_lenght = 1000
max_word_count = 100
min_word_count = 5
max_sentence = 1000

data_info = {}
data_info['data_limit']= data_limit
data_info['vocabulary_size'] = vocabulary_size
data_info['window_size'] = window_size 
data_info['max_sentence_lenght'] = max_sentence_lenght
data_info['max_word_count'] = max_word_count 
data_info['min_word_count'] = min_word_count
data_info['max_sentence'] = max_sentence 

In [5]:
section_texts = df['Description'].apply(str)

In [6]:
# df.drop(['ARTICLE_ID', 'TITLE', 'SECTION_TITLE'], axis=1)
# section_texts = df['SECTION_TEXT'].apply(str)
# print('Total Aritcles ----->', len(section_texts))

In [7]:
if data_limit != 0:
    section_texts = section_texts[:data_limit]
print('Total section texts ----->', len(section_texts))

Total section texts -----> 120000


### Download `nltk` Resources for Data Processing 

In [8]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/ruman/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/ruman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# data = open("data.txt").readlines()

# sentences = []
# for texts in data:
#     for sentence in sent_tokenize(texts):
#         if len(sentence) < max_sentence_lenght:
#             sentences.append(sentence.lower())

### Convert Aritcles to Sentences

In [10]:
sentences = []
for texts in section_texts:
    for sentence in sent_tokenize(texts):
        if len(sentence) < max_sentence_lenght:
            sentences.append(sentence.lower())

In [11]:
total_sentences = len(sentences)
print('Total sentences', total_sentences)

Total sentences 154346


### Convert Sentences to Words and Create Vocabulary with Frequency

In [12]:
stop_words = set(stopwords.words('english'))
word_list = []
vocabulary_with_frequency = {}

for sentence in sentences:
    words = word_tokenize(sentence)
    words_without_stop_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) != 1]
    
    word_lenght = len(words_without_stop_words)
    if word_lenght <= max_word_count and word_lenght >= min_word_count:
        word_list.append(words_without_stop_words)
        
        for word in words_without_stop_words:
            if word not in vocabulary_with_frequency.keys():
                vocabulary_with_frequency[word] = 1
            else:
                vocabulary_with_frequency[word] += 1

### After Filtering Total Words and Vocabulary

In [13]:
total_sentences = len(word_list)

total_words = 0
for words in word_list:
    total_words += len(words)
print('Total words in corpus', total_words)
print('Vocabulary size', len(vocabulary_with_frequency))

Total words in corpus 2212850
Vocabulary size 55271


### Sort Vocabulary Based on Frequency Count

In [14]:
sorted_vocabulary_with_frequency = sorted(vocabulary_with_frequency.items(), key=lambda x: x[1], reverse=True)

### Remove Less Freuquent Word form Dictionary and Assign Unique Id to Each Word

<br> 
Create two Dictionaries.<br> 
<b>word_to_id:</b> to get word for a word id. <br>
<b>id_to_word:</b> to get the id for word. <br>
</br>


In [15]:
word_to_id = {}
word_to_id[0] = 'PAD'
id_to_word = {}
id_to_word['PAD'] = 0
word_id = 1

for word, _ in sorted_vocabulary_with_frequency:
    if word_id < vocabulary_size:
        word_to_id[word] = word_id
        id_to_word[word_id] = word
        word_id += 1
        
print(list(islice(word_to_id.items(), 15)))
print('-------------------------------------')
print(list(islice(id_to_word.items(), 15)))

[(0, 'PAD'), ('said', 1), ('new', 2), ('reuters', 3), ('quot', 4), ('us', 5), ('ap', 6), ('two', 7), ('gt', 8), ('lt', 9), ('first', 10), ('monday', 11), ('wednesday', 12), ('tuesday', 13), ('company', 14)]
-------------------------------------
[('PAD', 0), (1, 'said'), (2, 'new'), (3, 'reuters'), (4, 'quot'), (5, 'us'), (6, 'ap'), (7, 'two'), (8, 'gt'), (9, 'lt'), (10, 'first'), (11, 'monday'), (12, 'wednesday'), (13, 'tuesday'), (14, 'company')]


In [16]:
print('New dictionary size after removing less frequent words', vocabulary_size)

New dictionary size after removing less frequent words 400


### Convert `word_list` to `word_id_list` for Expressign the Words Vocabualry Id

<br> Remove the words which are not present in dictionary</br>

In [17]:
data_info['Total sentences before filtering'] = total_sentences
data_info['Total words before filtering'] = total_words

print('Total sentences before filtering', total_sentences)
print('Total words before filtering', total_words)

Total sentences before filtering 143642
Total words before filtering 2212850


In [18]:
sentece_word_ids = []
total_sentences = 0
total_words = 0

for words in word_list:
    filtered_words_ids = [word_to_id[word] for word in words if word in word_to_id.keys()]
    words_in_current_sentece = len(filtered_words_ids)
    
    if words_in_current_sentece >= min_word_count:
        total_sentences += 1
        total_words += words_in_current_sentece
        sentece_word_ids.append(filtered_words_ids)
        
print('--------------------------------------------')
print('Total sentences after filtering', total_sentences)
print('Total words after filtering', total_words)

data_info['Total sentences after filtering'] = total_sentences
data_info['Total words after filtering'] = total_words

--------------------------------------------
Total sentences after filtering 86235
Total words after filtering 689781


### Untill Now 
<br> 
<b>sentece_word_ids:</b> Sentence wise word's id. <br>
<b>word_to_id:</b> Dictionary for getting the <b>word_id</b> for a <b>word</b>.<br>
<b>id_to_word:</b> Dictionary for getting the <b>word</b> for a <b> word_id</b>.<br>
</br>

In [19]:
print('Total sentences', total_sentences)
print('Total words', total_words)
vocabulary_size = len(word_to_id)
print('Total unique words in dictionary', vocabulary_size)
data_info['Total unique words in dictionary'] = vocabulary_size

Total sentences 86235
Total words 689781
Total unique words in dictionary 400


### For loading the GPU of Macbook M1

In [20]:
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


### Helper Function Save and Load the Model and Informatino Data

In every cases name will be `pari` for pair-model, `skip` for skip-gram model and `cbow` for CBOW model.<br>

In [21]:
def get_path_name(name):
    if name == 'pair':
        return 'save/model_pair/'
    elif name == 'skip':
        return 'save/model_skip/'
    elif name == 'cbow':
        return 'save/model_cbow/'
    
def get_unique_file_name():
    return datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")


def save_model(name, model, file_name):
    path = get_path_name(name)+'model/'
    
    if not os.path.exists(path):
        os.makedirs(path)
        
    if name == 'cbow':
        model.save(path+file_name+'/')
    else:
         model.save(path+file_name+'.h5')
            
            
def load_model(name, file_name):
    path = get_path_name(name)+'model/'
    
    if name == 'cbow':
        print(os.getcwd()+path+file_name)
        return keras.models.load_model(
            os.getcwd()+'/'+path+'/'+file_name+'/', custom_objects={"Average": Average}
        )
    
    return tf.keras.models.load_model(path+file_name+'.h5')


def get_model_list(name):
    path = get_path_name(name)+'model/'
    
    if os.path.exists(path):
        return os.listdir(path)
    
def save_info(name, file_name, data):
    path = get_path_name(name)+'data_info/'
    
    if not os.path.exists(path):
        os.makedirs(path)
            
    with open(path+file_name+'.yaml', 'w') as file:
        documents = yaml.dump(data, file, sort_keys=False)
    
    
def get_info_list(name):
    path = get_path_name(name)+'data_info/'
    
    if os.path.exists(path):
        return os.listdir(path)
        

def get_info(name, file_name):
    path = get_path_name(name)+'data_info/'
    
    if os.path.exists(path):
            with open(path+file_name+'.yaml') as file:
                return yaml.load(file, Loader=yaml.Loader)

In [22]:
file_name = get_unique_file_name()

### Generate 2D `vocabulary_size` List for Getting the `one-hot` Vector for a Word Id

Purpose
<br>
If we want to get/create a one-hot vector for a word_id, than just we have to call the corrosponding row or column of from the one_hot matrix.
<br>
<br>
All the elements of the first row or column of the one_hot is zeror. So, If we need a zerro containing(All elemtns are zeror) one-hot vecotr, than just need to take the first row or the first column.



In [23]:
one_hot = np.zeros((vocabulary_size,vocabulary_size))

for index in range(1, vocabulary_size):
    one_hot[index][index] = 1

# Data Processing for the CBOW and SKIP-GRAM Model

In `CBOW` and `SKIP-GRAM` models, we will be needed the `target word` and their corresponding `context words`. 
For creating the context word for a target word, we need to consider total `window_size * 2` words. Among the `window_size * 2` words, `window_size` words will be on left side of the target word and `window_size` words will be on the right side. If the number of context words for target word is less than the `window_size * 2`, then I have use the zero padding. 


<br> This function creates the target words and their corrosponding context words.<br>
return:<br>
------<b>target_id:</b> of the target word.<br>
------<b>context_ids:</b> of the context words. I have also used the zero padding to make total number of `window_size * 2` target words.<br>
<br>

# Important note 
context_ids te target id er id onusare value assing kora hoiche. 1 ta target er jnno jotota context ache toto value oi index a assing kora hocihe

In [24]:
def get_target_contexts(word_ids, window_size):
    target_id = []
    context_ids = []
    
    for ids in word_ids:
        for index, word_id in enumerate(ids):
            if not word_id:
                continue
                
            window_start = max(0, index - window_size)
            window_end = min(len(ids), index + window_size + 1)
            
            target_id.append(one_hot[word_id])
            
            zero_context_ids = np.zeros(vocabulary_size)
            for window_index in range(window_start, window_end):
                if window_index != index:
                    zero_context_ids[ids[window_index]] += 1
            context_ids.append(zero_context_ids)
            del zero_context_ids
            
    return target_id, context_ids

In [25]:
target_id, context_ids =  get_target_contexts(sentece_word_ids, window_size)
data_info['Total target words'] = len(target_id)

In [26]:
X_train, X_test, Y_train, Y_test = train_test_split(target_id, context_ids, test_size=0.2, random_state=42)

X_train = np.array(X_train)
X_test = np.array(X_test)
Y_train = np.array(Y_train)
Y_test = np.array(Y_test)

# CBOW Model

### Calculate the Sum of the Input Vectors 

I have pre calculated the sum for reducing the training time.

In [27]:
data_info_cbow =  data_info.copy()

### Split Training Testing Data

`20%` data for testing.

### Extending Keras Layer for Taking the Average

In [28]:
class Average(keras.layers.Layer):
    def __init__(self, units=32, input_dim=32):
        super(Average, self).__init__()

    def call(self, inputs):
        return tf.math.divide(inputs, window_size*2)
    
    @classmethod
    def from_config(cls, config):
        return cls(**config)

In [29]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(300, name='w2v_embedding_cbow')(inp)
x = Average(200)(x)
x = Dense(vocabulary_size, activation='softmax')(x)

model_cbow = Model(inputs=inp, outputs=x)
model_cbow.summary()

Metal device set to: Apple M1
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 400)]             0         
_________________________________________________________________
w2v_embedding_cbow (Dense)   (None, 300)               120300    
_________________________________________________________________
average (Average)            (None, 300)               0         
_________________________________________________________________
dense (Dense)                (None, 400)               120400    
Total params: 240,700
Trainable params: 240,700
Non-trainable params: 0
_________________________________________________________________


2021-11-24 15:23:52.659402: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-24 15:23:52.662763: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


### Fit the CBOW model

In [30]:
def custom_loss_function(y_true, y_pred):
    return tf.reduce_mean(-tf.math.reduce_sum(y_true * tf.math.log(y_pred), axis=[1]))


training_started = datetime.datetime.now()
model_cbow.compile(loss=custom_loss_function, optimizer=tf.optimizers.SGD(learning_rate=0.01), metrics=['accuracy'])
history_cbow = model_cbow.fit(Y_train, X_train, epochs = 5, batch_size=50, verbose=1, shuffle=True)

2021-11-24 15:23:54.924237: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-24 15:23:54.927515: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-11-24 15:23:55.076485: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 1/5

KeyboardInterrupt: 

In [None]:
training_end = datetime.datetime.now()
data_info_cbow['training time'] =  training_end - training_started
data_info_cbow['cbow model training history'] = history_cbow.history

### Save CBOW Model

In [None]:
save_model('cbow', model_cbow, file_name)

### Testing CBOW Model

In [None]:
loss, accuracy = model_cbow.evaluate(X_test, Y_test, verbose=2)

In [None]:
data_info_cbow['cbow model evaluatation loss'] = loss
data_info_cbow['cbow model evaluatation accuracy'] = accuracy
print('Evaluated loss',loss, 'and accuracy', accuracy)

### Save CBOW model information

In [None]:
save_info('cbow',file_name, data_info_cbow)

In [None]:
weights_cbow = model_cbow.get_layer('w2v_embedding_cbow').get_weights()[0]

path_cbow = get_path_name('cbow') + 'vector_metadata/'


if not os.path.exists(path_cbow):
        os.makedirs(path_cbow)

out_v = io.open(path_cbow+file_name+'vectors.tsv', 'w', encoding='utf-8')
out_m = io.open(path_cbow+file_name+'metadata.tsv', 'w', encoding='utf-8')

for index in range(1, vocabulary_size):
    vec = weights_cbow[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(id_to_word[index] + "\n")
out_v.close()
out_m.close()

In [None]:
print('End of CBOW model')

In [None]:
# del X_contexts                     
# del X_test_cbow            
# del X_train_cbow           
# del Y_test_cbow            
# del Y_train_cbow          
# del accuracy               
# del data_info_cbow        
# del history_cbow                     
# del inp                   
# del loss                             
# del pad_sequences          
# del training_end           
# del training_started      
# del x   
# del model_cbow 
# del out_m
# del index                                  
# del out_v                  
# del path_cbow             
# del vec                   
# del weights_cbow           

In [None]:
# %whos

## SKIP-GRAMS

In [None]:
data_info_skip = data_info.copy()

### Create SKIP-GRAMS Model

In [None]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(500, name='w2v_embedding_skip', bias_initializer=tf.initializers.RandomNormal(stddev=1.0))(inp)
x = Dense(vocabulary_size, bias_initializer=tf.initializers.RandomNormal(stddev=1.0), activation='softmax')(x)
model_skip = Model(inputs=inp, outputs=x)
model_skip.summary()

### Fit skip-grams model

In [None]:
training_started = datetime.datetime.now()

In [None]:
def custom_loss_function(y_true, y_pred):
    return tf.reduce_mean(-tf.math.reduce_sum(y_true * tf.math.log(y_pred), axis=[1]))


model_skip.compile(loss=custom_loss_function, optimizer=tf.optimizers.SGD(learning_rate=0.01), metrics=['accuracy'])
history_skip = model_skip.fit(X_train, Y_train, epochs = 3, batch_size=50, verbose=1, shuffle=True)

In [None]:
training_end = datetime.datetime.now()
data_info_skip['training time'] =  training_end - training_started
data_info_skip['skip-grams model training history'] = history_skip.history

### Save SKIP_GRAM Model

In [None]:
save_model('skip', model_skip, file_name)

### Test SKIP_GRAM Model

In [None]:
loss, accuracy = model_skip.evaluate(X_test, Y_test, verbose=2)

In [None]:
data_info_skip['skip model evaluatation loss'] = loss
data_info_skip['skip model evaluatation accuracy'] = accuracy
print('Evaluated loss',loss, 'and accuracy', accuracy)

### Save Data info for Skip-Gram model


In [None]:
save_info('skip',file_name, data_info_skip)

### Save Vector and Metadata For SKIP-GRAM

In [None]:
weights_skip = model_skip.get_layer('w2v_embedding_skip').get_weights()[0]

path_skip = get_path_name('skip') + 'vector_metadata/'


if not os.path.exists(path_skip):
        os.makedirs(path_skip)

out_v = io.open(path_skip+file_name+'vectors.tsv', 'w', encoding='utf-8')
out_m = io.open(path_skip+file_name+'metadata.tsv', 'w', encoding='utf-8')

for index in range(1, vocabulary_size):
    vec = weights_skip[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(id_to_word[index] + "\n")
out_v.close()
out_m.close()

In [None]:
print('End Skip-Gram Model')

# Summary 

### CBOW Model

In [None]:
cbow = load_model('cbow', file_name)
print(cbow.summary())

In [None]:
get_info('cbow', file_name)

### Skip-Gram Model

In [None]:
skip = load_model('skip', file_name)
print(skip.summary())

In [None]:
get_info('skip', file_name)

In [None]:
print('THE END')