### Iimport libraries

In [1]:
import numpy as np
import pandas as pd
import sys
import yaml
import os

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing import text

from itertools import islice

### Load data

In [2]:
df = pd.read_csv('enwiki.csv')
print('------------------Head------------------')
print(df.head())
print('------------------Tail------------------')
print(df.tail())

------------------Head------------------
   ARTICLE_ID      TITLE                 SECTION_TITLE  \
0           0  Anarchism                  Introduction   
1           0  Anarchism     Etymology and terminology   
2           0  Anarchism                       History   
3           0  Anarchism  Anarchist schools of thought   
4           0  Anarchism   Internal issues and debates   

                                        SECTION_TEXT  
0  \n\n\n\n\n\n'''Anarchism''' is a political phi...  
1  \n\nThe term ''anarchism'' is a compound word ...  
2  \n\n===Origins===\nWoodcut from a Diggers docu...  
3  \nPortrait of philosopher Pierre-Joseph Proudh...  
4  \nconsistent with anarchist values is a contro...  
------------------Tail------------------
        ARTICLE_ID              TITLE    SECTION_TITLE  \
265134       30475  Triboluminescence  Further reading   
265135       30475  Triboluminescence   External links   
265136       30476       Markov chain     Introduction   
265137 

### Remove unnecessary data from dataFrame

# Sete text limit for controlling the training and performance

In [3]:
# data_limit = 100

In [4]:
df.drop(['ARTICLE_ID', 'TITLE', 'SECTION_TITLE'], axis=1)
section_texts = df['SECTION_TEXT'].apply(str)
# section_texts = section_texts[:data_limit]
print('Total section texts ----->', len(section_texts))

Total section texts -----> 265139


### Set constrains for filtering the data

In [5]:
max_sentence_lenght = 1000
max_word_count = 100
min_woord_count = 5
max_sentence = 1000

### Convert text to sentences

In [6]:
sentences = []
for texts in section_texts:
    for sentence in sent_tokenize(texts):
        if len(sentence) < max_sentence_lenght:
            sentences.append(sentence)

In [7]:
total_sentences = len(sentences)
print('Total sentences', total_sentences)

Total sentences 3790493


In [16]:
print(sentences[:30000])



### Convert sentences to words and create vocabulary with frequency

In [8]:
stop_words = set(stopwords.words('english'))
word_list = []
vocabulary_with_frequency = {}

for sentence in sentences:
    words = word_tokenize(sentence)
    word_lenght = len(words)
    if word_lenght <= max_word_count and word_lenght >= min_woord_count:
        words = [word.lower() for word in words if word.isalpha() and word not in stop_words]
        word_list.append(words)
        
        for word in words:
            if word not in vocabulary_with_frequency.keys():
                vocabulary_with_frequency[word] = 1
            else:
                vocabulary_with_frequency[word] += 1

### Total words and vocabulary size

In [9]:
# After filtering the words
total_sentences = len(word_list)

total_words = 0
for words in word_list:
    total_words += len(words)
print('Total words', total_words)
print('Total unique words size', len(vocabulary_with_frequency))

Total words 46842818
Total unique words size 634539


### Sort the vocabulary based on their frequency

In [10]:
sorted_vocabulary_with_frequency  = sorted(vocabulary_with_frequency.items(), key=lambda x: x[1], reverse=True)
print(sorted_vocabulary_with_frequency[:10])
print('-----------------------------------------')
print(sorted_vocabulary_with_frequency[len(sorted_vocabulary_with_frequency)-10:])

[('the', 784090), ('in', 295815), ('also', 190665), ('one', 163906), ('first', 158683), ('new', 135533), ('a', 126570), ('used', 116096), ('two', 113851), ('may', 104353)]
-----------------------------------------
[('τρίβειν', 1), ('fractoluminescence', 1), ('piezoluminescent', 1), ('unpeeling', 1), ('electrization', 1), ('molotskii', 1), ('tudik', 1), ('poissions', 1), ('tribo', 1), ('memorylessness', 1)]


In [12]:
print(sorted_vocabulary_with_frequency[:30000])



### Assing unique id to vocabulary word
<br> 
Create two vocabulary.<br> 
<b>word_to_id</b> for gettig the word for an id. <br>
<b>id_to_word</b> for gettign the id for word. <br>
</br>

In [11]:
word_to_id = {}
word_to_id[0] = 'PAD'
id_to_word = {}
id_to_word['PAD'] = 0
word_id = 1

for word, _ in sorted_vocabulary_with_frequency:
    word_to_id[word] = word_id
    id_to_word[word_id] = word
    word_id += 1
    
print(list(islice(word_to_id.items(), 10)))
print('-------------------------------------')
print(list(islice(id_to_word.items(), 10)))

[(0, 'PAD'), ('the', 1), ('in', 2), ('also', 3), ('one', 4), ('first', 5), ('new', 6), ('a', 7), ('used', 8), ('two', 9)]
-------------------------------------
[('PAD', 0), (1, 'the'), (2, 'in'), (3, 'also'), (4, 'one'), (5, 'first'), (6, 'new'), (7, 'a'), (8, 'used'), (9, 'two')]


### Convert word_list to wor_id_list for expressign the words of a sentece using vocabulary id

In [12]:
sentece_word_ids = []
for words in word_list:
    sentece_word_ids.append([word_to_id[word] for word in words])

print(word_list[0])
print(sentece_word_ids[0])

['anarchism', 'political', 'philosophy', 'advocates', 'societies', 'based', 'voluntary', 'institutions']
[9019, 124, 862, 6342, 2938, 93, 6483, 1514]


### Generate positive target & context word
<br>
We will just save the id of the target and context word instead of the word. <br>
Also don't need to save the label as we are just taking the positive skipgrams.
</br>

In [13]:
window_size = 8
vocab_size = len(id_to_word)
positive_skip_grams = []

for word_ids in sentece_word_ids:
    skip_grams, labels = tf.keras.preprocessing.sequence.skipgrams(
        word_ids,
        vocabulary_size=vocab_size,
        window_size=window_size,
        negative_samples=0)
    positive_skip_grams.append(skip_grams)
total_positive_skip_grams = len(positive_skip_grams)
print('Total positive skip grams pairs', total_positive_skip_grams)

Total positive skip grams pairs 3681645


### Example of skipgrams

In [14]:
print('target_id, target word--->context_id, context word')
print('---------------------------------------------')
for skip_grams in positive_skip_grams[:1]:
    for target, context in skip_grams:
        print(target, id_to_word[target],'--->',context ,id_to_word[context])

target_id, target word--->context_id, context word
---------------------------------------------
1514 institutions ---> 2938 societies
2938 societies ---> 93 based
124 political ---> 6483 voluntary
6483 voluntary ---> 862 philosophy
2938 societies ---> 6342 advocates
1514 institutions ---> 6483 voluntary
6342 advocates ---> 9019 anarchism
1514 institutions ---> 9019 anarchism
862 philosophy ---> 9019 anarchism
1514 institutions ---> 862 philosophy
6483 voluntary ---> 9019 anarchism
6342 advocates ---> 1514 institutions
6342 advocates ---> 6483 voluntary
2938 societies ---> 6483 voluntary
862 philosophy ---> 6342 advocates
6342 advocates ---> 862 philosophy
9019 anarchism ---> 862 philosophy
93 based ---> 862 philosophy
124 political ---> 6342 advocates
6483 voluntary ---> 6342 advocates
862 philosophy ---> 93 based
9019 anarchism ---> 2938 societies
6342 advocates ---> 124 political
93 based ---> 9019 anarchism
862 philosophy ---> 6483 voluntary
2938 societies ---> 1514 institutions
86

### Untill Now
<br> 
<b>sentece_word_ids:</b> Sentence wise words's id. <br>
<b>word_to_id:</b> Vocabulary for getting the <b>word_id</b> for a <b>word</b>.<br>
<b>id_to_word:</b> Vocabulary for getting the <b>word</b> for a <b> word_id</b>.<br>
<b>positive_skip_grams:</b> skipgrams of target and context word pairs.
</br>

In [15]:
print('Total senteces', total_sentences)
print('Total words', total_words)
vocabulary_size = len(word_to_id)
print('Total unique words in vocabulary', vocabulary_size)
print('Total positive skipgrams pair', total_positive_skip_grams)

Total senteces 3681645
Total words 46842818
Total unique words in vocabulary 634540
Total positive skipgrams pair 3681645


### Save data for later use
<br> It will help use to skip the data processing step

In [16]:
def save_data(data, to_file):
    if os.path.exists(to_file+'.yaml'):
        os.remove(to_file+'.yaml')
    
    with open(to_file+'.yaml', 'w') as file:
        documents = yaml.dump(data, file, sort_keys=False)
    
def load_data(from_file):
    with open(from_file+'.yaml') as file:
        return yaml.load(file, Loader=yaml.FullLoader)

### Save

In [None]:
save_data(word_to_id, 'word_to_id')
save_data(id_to_word, 'id_to_word')
save_data(sentece_word_ids, 'sentece_word_ids')
save_data(positive_skip_grams, 'positive_skip_grams')

# ⚠️⚠️⚠️ Reset Everything ⚠️⚠️⚠️

In [None]:
%reset

# Start again 🏃‍♂️🏃‍♂️🏃‍♂️

In [None]:
import numpy as np
import pandas as pd
import sys
import yaml
import os

import nltk
from nltk.corpus import PlaintextCorpusReader
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.preprocessing import text

from itertools import islice

In [None]:
def save_data(data, to_file):
    if os.path.exists(to_file+'.yaml'):
        os.remove('id_to_word.yaml')
    
    with open(to_file+'.yaml', 'w') as file:
        documents = yaml.dump(data, file, sort_keys=False)
    
def load_data(from_file):
    with open(from_file+'.yaml') as file:
        return yaml.load(file, Loader=yaml.FullLoader)

### Load data

In [None]:
id_to_word = load_data('id_to_word')
word_to_id = load_data('word_to_id')
sentece_word_ids = load_data('sentece_word_ids')
positive_skip_grams = load_data('positive_skip_grams')

#### Example after loadign data

In [None]:
print(list(islice(id_to_word.items(), 10)))
print('------------------------------------------')
print(list(islice(word_to_id.items(), 10)))
print('------------------------------------------')
print(sentece_word_ids[:5])
print('------------------------------------------')
print('target_id, target word--->context_id, context word')
print('---------------------------------------------')
for skip_grams in positive_skip_grams[:1]:
    for target, context in skip_grams:
        print(target, id_to_word[target],'--->',context ,id_to_word[context])

# Function for getting the one hot encoded vector

In [None]:
def get_encoded_vector_for(word_id, vocab_size):
#     return np_utils.to_categorical(word_id, vocab_size)
    lst = [0] * vocab_size
    lst[word_id] = 1
    return lst
print(get_encoded_vector_for(10, 11))

In [None]:
physical_devices = tf.config.list_physical_devices('GPU')
print(physical_devices)
tf.config.experimental.set_memory_growth(physical_devices[0], True)

In [None]:
window_size = 8
vocab_size = len(id_to_word)
print()
positive_skip_grams, labels = tf.keras.preprocessing.sequence.skipgrams(
      sentece_word_ids[0],
      vocabulary_size=vocab_size,
      window_size=window_size,
      negative_samples=0)
print(len(positive_skip_grams))