### Iimport libraries

In [1]:
import numpy as np
import pandas as pd
import sys
import yaml
import os
import re

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing import text

from itertools import islice

import matplotlib.pyplot as plt
%matplotlib inline

### Load data

In [2]:
df = pd.read_csv('enwiki.csv')
print('------------------Head------------------')
print(df.head())
print('------------------Tail------------------')
print(df.tail())

------------------Head------------------
   ARTICLE_ID      TITLE                 SECTION_TITLE  \
0           0  Anarchism                  Introduction   
1           0  Anarchism     Etymology and terminology   
2           0  Anarchism                       History   
3           0  Anarchism  Anarchist schools of thought   
4           0  Anarchism   Internal issues and debates   

                                        SECTION_TEXT  
0  \n\n\n\n\n\n'''Anarchism''' is a political phi...  
1  \n\nThe term ''anarchism'' is a compound word ...  
2  \n\n===Origins===\nWoodcut from a Diggers docu...  
3  \nPortrait of philosopher Pierre-Joseph Proudh...  
4  \nconsistent with anarchist values is a contro...  
------------------Tail------------------
        ARTICLE_ID              TITLE    SECTION_TITLE  \
265134       30475  Triboluminescence  Further reading   
265135       30475  Triboluminescence   External links   
265136       30476       Markov chain     Introduction   
265137 

### Remove unnecessary data from dataFrame

# Set constrains for filtering the data

In [3]:
data_limit = 5000 # 0 for all taking all data. There is exis at most 2,65,139 paragraphs 
max_sentence_lenght = 1000
max_word_count = 100
min_word_count = 5
max_sentence = 1000
vocabulary_size = 300
window_size = 4

In [4]:
df.drop(['ARTICLE_ID', 'TITLE', 'SECTION_TITLE'], axis=1)
section_texts = df['SECTION_TEXT'].apply(str)
print('Total section texts ----->', len(section_texts))

Total section texts -----> 265139


In [5]:
if data_limit != 0:
    section_texts = section_texts[:data_limit]
print('Total section texts ----->', len(section_texts))

Total section texts -----> 500


### Convert text to sentences

In [6]:
sentences = []
for texts in section_texts:
    for sentence in sent_tokenize(texts):
        if len(sentence) < max_sentence_lenght:
            sentences.append(sentence.lower())

In [7]:
total_sentences = len(sentences)
print('Total sentences', total_sentences)

Total sentences 10159


### Convert sentences to words and create vocabulary with frequency

In [8]:
stop_words = set(stopwords.words('english'))
word_list = []
vocabulary_with_frequency = {}

for sentence in sentences:
    words = word_tokenize(sentence)
    words_without_stop_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) != 1]
    
    word_lenght = len(words_without_stop_words)
    if word_lenght <= max_word_count and word_lenght >= min_word_count:
        word_list.append(words_without_stop_words)
        
        for word in words_without_stop_words:
            if word not in vocabulary_with_frequency.keys():
                vocabulary_with_frequency[word] = 1
            else:
                vocabulary_with_frequency[word] += 1

### Total words and vocabulary size

In [9]:
# After filtering the words
total_sentences = len(word_list)

total_words = 0
for words in word_list:
    total_words += len(words)
print('Total words in corpus', total_words)
print('Vocabulary size', len(vocabulary_with_frequency))

Total words in corpus 117846
Vocabulary size 20465


In [10]:
sorted_vocabulary_with_frequency = sorted(vocabulary_with_frequency.items(), key=lambda x: x[1], reverse=True)

### Remove less freuquent word form dictionary and assign unique id to vocabulary word


<br> 
Create two vocabulary.<br> 
<b>word_to_id</b> for gettig the word for an id. <br>
<b>id_to_word</b> for gettign the id for word. <br>
</br>


In [11]:
word_to_id = {}
word_to_id[0] = 'PAD'
id_to_word = {}
id_to_word['PAD'] = 0
word_id = 1

for word, _ in sorted_vocabulary_with_frequency:
    if word_id < vocabulary_size:
        word_to_id[word] = word_id
        id_to_word[word_id] = word
        word_id += 1
        
print(list(islice(word_to_id.items(), 15)))
print('-------------------------------------')
print(list(islice(id_to_word.items(), 15)))

[(0, 'PAD'), ('also', 1), ('one', 2), ('lincoln', 3), ('state', 4), ('first', 5), ('apollo', 6), ('many', 7), ('alaska', 8), ('time', 9), ('used', 10), ('may', 11), ('two', 12), ('new', 13), ('achilles', 14)]
-------------------------------------
[('PAD', 0), (1, 'also'), (2, 'one'), (3, 'lincoln'), (4, 'state'), (5, 'first'), (6, 'apollo'), (7, 'many'), (8, 'alaska'), (9, 'time'), (10, 'used'), (11, 'may'), (12, 'two'), (13, 'new'), (14, 'achilles')]


In [12]:
print('New dictionary size', vocabulary_size)

New dictionary size 50


### Convert word_list to word_id_list for expressign the words of a sentece using vocabulary id

<br> Remove the words which are not present in dictionary</br>

In [13]:
print('Before filtering total sentenc', total_sentences)
print('Before filtering total words', total_words)

sentece_word_ids = []
total_sentences = 0
total_words = 0

for words in word_list:
    filtered_words_ids = [word_to_id[word] for word in words if word in word_to_id.keys()]
    words_in_current_sentece = len(filtered_words_ids)
    
    if words_in_current_sentece >= min_word_count:
        total_sentences += 1
        total_words += words_in_current_sentece
        sentece_word_ids.append(filtered_words_ids)
        
print('--------------------------------------------')
print('After filtering total sentence', total_sentences)
print('After filtering total words', total_words)

Before filtering total sentenc 9070
Before filtering total words 117846
--------------------------------------------
After filtering total sentence 200
After filtering total words 1214


### Untill Now 
<br> 
<b>sentece_word_ids:</b> Sentence wise words's id. <br>
<b>word_to_id:</b> Vocabulary for getting the <b>word_id</b> for a <b>word</b>.<br>
<b>id_to_word:</b> Vocabulary for getting the <b>word</b> for a <b> word_id</b>.<br>
<b>positive_skip_grams:</b> skipgrams of target and context word pairs.
</br>

In [14]:
print('Total senteces', total_sentences)
print('Total words', total_words)
vocabulary_size = len(word_to_id)
print('Total unique words in vocabulary', vocabulary_size)

Total senteces 200
Total words 1214
Total unique words in vocabulary 50


### Save data for later use
<br> It will help use to skip the data processing step

In [15]:
def save_data(data, to_file):
    if os.path.exists(to_file+'.yaml'):
        os.remove(to_file+'.yaml')
    
    with open(to_file+'.yaml', 'w') as file:
        documents = yaml.dump(data, file, sort_keys=False)
    
def load_data(from_file):
    with open(from_file+'.yaml') as file:
        return yaml.load(file, Loader=yaml.FullLoader)

### Save

In [16]:
# save_data(word_to_id, 'word_to_id')
# save_data(id_to_word, 'id_to_word')
# save_data(sentece_word_ids, 'sentece_word_ids')
# # save_data(positive_skip_grams, 'positive_skip_grams')

# ⚠️⚠️⚠️ Reset Everything ⚠️⚠️⚠️

In [17]:
# %reset

# Start again 🏃‍♂️🏃‍♂️🏃‍♂️

In [18]:
import numpy as np
import pandas as pd
import sys
import yaml
import os
import io

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras

from tensorflow.keras import layers
from keras.preprocessing import text

from itertools import islice
from sklearn.model_selection import train_test_split

from tensorflow.keras.layers import Embedding, Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras import losses, optimizers
from tensorflow.keras.activations import softmax

In [19]:
def save_data(data, to_file):
    if os.path.exists(to_file+'.yaml'):
        os.remove('id_to_word.yaml')
    
    with open(to_file+'.yaml', 'w') as file:
        documents = yaml.dump(data, file, sort_keys=False)
    
def load_data(from_file):
    with open(from_file+'.yaml') as file:
        return yaml.load(file, Loader=yaml.FullLoader)

### Load data

In [20]:
# id_to_word = load_data('id_to_word')
# word_to_id = load_data('word_to_id')
# sentece_word_ids = load_data('sentece_word_ids')
# # positive_skip_grams = load_data('positive_skip_grams')
# vocabulary_size = len(word_to_id)

#### Example after loadign data

In [21]:
print(list(islice(id_to_word.items(), 10)))
print('------------------------------------------')
print(list(islice(word_to_id.items(), 10)))
print('------------------------------------------')
print(sentece_word_ids[:5])
print('------------------------------------------')

[('PAD', 0), (1, 'also'), (2, 'one'), (3, 'lincoln'), (4, 'state'), (5, 'first'), (6, 'apollo'), (7, 'many'), (8, 'alaska'), (9, 'time')]
------------------------------------------
[(0, 'PAD'), ('also', 1), ('one', 2), ('lincoln', 3), ('state', 4), ('first', 5), ('apollo', 6), ('many', 7), ('alaska', 8), ('time', 9)]
------------------------------------------
[[18, 7, 24, 19, 43, 27, 43], [32, 37, 10, 10, 33, 16, 27, 33, 16], [49, 5, 33, 30, 36], [4, 30, 43, 11, 46, 4, 5, 11], [4, 43, 30, 33, 16]]
------------------------------------------


# Function for getting the one hot encoded vector

In [22]:
def get_encoded_vector_for(word_id):
    lst = [0 for _ in range(vocabulary_size)]
    lst[word_id] = 1
    return lst

### Generate 2D `vocabulary_size` list for getting the one-hot vector for a index

In [30]:
one_hot = [[0 for i in range(vocabulary_size)] for j in range(vocabulary_size)]

for index in range(1, vocabulary_size):
    one_hot[index][index] = 1

### Create (taget, context) pair for all of the words and convert them into one-hot encoding

In [31]:
X = []
Y = []

for ids in sentece_word_ids:
    skip, _ = tf.keras.preprocessing.sequence.skipgrams(
        ids,
        vocabulary_size,
        window_size=window_size, negative_samples=0,
        shuffle=False,
        categorical=True)
    for target, context in skip:
        X.append(one_hot[target])
        Y.append(one_hot[context])
print(np.array(X).shape)
print(vocabulary_size)

(5712, 50)
50


### Split training testing 

In [32]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

### Set GPU

In [33]:
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [34]:
### Implement the model for 

In [35]:
inp = Input(shape=(vocabulary_size, ))
x = Dense(200, name='w2v_embedding')(inp)
x = Dense(vocabulary_size, activation='softmax')(x)

model = Model(inputs=inp, outputs=x)
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 50)]              0         
_________________________________________________________________
w2v_embedding (Dense)        (None, 200)               10200     
_________________________________________________________________
dense_2 (Dense)              (None, 50)                10050     
Total params: 20,250
Trainable params: 20,250
Non-trainable params: 0
_________________________________________________________________


In [36]:
model.compile(loss=losses.categorical_crossentropy, optimizer='adam', metrics=['accuracy'])
model.fit(X_train, Y_train, epochs = 3, batch_size=2, verbose=1, shuffle=True)

Epoch 1/3
  20/2285 [..............................] - ETA: 12s - loss: 3.8930 - accuracy: 0.0250    

2021-11-17 20:42:19.123353: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x2ca53d130>

In [None]:
weights = model.get_layer('w2v_embedding').get_weights()[0]

In [None]:
if os.path.exists('metadata.tsv'):
        os.remove('metadata.tsv')
        
if os.path.exists('vectors.tsv'):
        os.remove('vectors.tsv')

In [None]:
out_v = io.open('vectors.tsv', 'w', encoding='utf-8')
out_m = io.open('metadata.tsv', 'w', encoding='utf-8')

In [None]:
for index in range(1, vocabulary_size):
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(id_to_word[index] + "\n")
out_v.close()
out_m.close()
   

In [None]:
print('End')