# **Implement Skip-Gram Model**

### **Import the Libraries**

In [None]:
import re
import nltk
from nltk.corpus import gutenberg
from string import punctuation
import numpy as np

In [None]:
import keras.backend as K
from keras.layers import Concatenate
from keras.layers import Dense, Dot, Embedding, Lambda
from keras.layers.core import Reshape
from keras.models import Model, Sequential
from keras.preprocessing import sequence
from keras.preprocessing import text
from keras.preprocessing.sequence import skipgrams
from keras.utils import np_utils

In [None]:
# Fetch the NLTK data packages this notebook relies on: the Gutenberg corpus
# (source of 'bible-kjv.txt'), the English stop-word list, and the 'punkt'
# tokenizer models (presumably needed by gutenberg.sents -- confirm).
nltk.download('gutenberg')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Shared text-cleaning resources used by normalize_document: the English
# stop-word list to filter out, and the tokenizer that splits cleaned text.
stop_words = nltk.corpus.stopwords.words('english')
wpt = nltk.WordPunctTokenizer()


### **Prepare the Corpus and Generate Skip-Grams**

In [None]:
def normalize_document(doc):
  """Clean a single document and return it as a space-joined token string.

  Steps: drop every character that is not an ASCII letter or whitespace,
  lower-case, strip surrounding whitespace, tokenize with the module-level
  ``wpt`` tokenizer, and remove tokens found in the module-level
  ``stop_words`` collection.

  Args:
    doc: The raw document text (a string).

  Returns:
    The normalized document: remaining tokens joined by single spaces
    (an empty string when nothing survives the filtering).
  """
  # The flags must be passed by keyword: the fourth positional parameter of
  # re.sub is `count`, so passing them positionally capped the number of
  # substitutions instead of setting the matching flags.
  doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
  doc = doc.lower()
  doc = doc.strip()
  tokens = wpt.tokenize(doc)
  filtered_tokens = [token for token in tokens if token not in stop_words]
  return ' '.join(filtered_tokens)


In [None]:
# Load the King James Bible as tokenized sentences and prepare the helpers.
bible = gutenberg.sents('bible-kjv.txt')
remove_terms = punctuation + '0123456789'
normalize_corpus = np.vectorize(normalize_document)

# Lower-case each sentence, dropping tokens that occur inside `remove_terms`
# (a substring test, since `remove_terms` is a string), and re-join to text.
joined_sents = [
    ' '.join(word.lower() for word in sent if word not in remove_terms)
    for sent in bible
]

# Normalize every sentence, then keep only the non-empty ones that still
# contain more than two whitespace-separated tokens.
cleaned_sents = normalize_corpus(joined_sents)
norm_bible = [
    tok_sent for tok_sent in cleaned_sents
    if tok_sent and len(tok_sent.split()) > 2
]


In [None]:
# Size of each word-embedding vector.
embed_size = 100

# Fit a Keras tokenizer on the normalized sentences to obtain the vocabulary.
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(norm_bible)

# Forward (word -> id) and reverse (id -> word) lookup tables.
word2id = tokenizer.word_index
id2word = dict(zip(word2id.values(), word2id.keys()))

# One slot more than the number of known words (presumably because the
# tokenizer's ids start at 1, leaving index 0 unused -- confirm).
vocab_size = 1 + len(word2id)

In [None]:
# Encode every normalized sentence as the list of its word ids.
wids = [
    list(map(word2id.__getitem__, text.text_to_word_sequence(doc)))
    for doc in norm_bible
]
print('Vocabulary Size:', vocab_size)

Vocabulary Size: 12425


In [None]:
def generate_context_word_pairs(corpus, window_size, vocab_size):
  """Yield one (context, target) pair per word of every sentence in `corpus`.

  For each position in a sentence, the context is the list of words at most
  `window_size` positions to the left or right (excluding the word itself and
  positions outside the sentence). Not called by the other visible cells.

  Args:
    corpus: Iterable of sentences, each an indexable sequence of word ids.
    window_size: Number of neighbours taken on each side of the target.
    vocab_size: Number of classes passed to `np_utils.to_categorical`.

  Yields:
    Tuples `(x, y)`: `x` is the result of `sequence.pad_sequences` on the
    single context list with `maxlen=2 * window_size`; `y` is the result of
    `np_utils.to_categorical` on the single target word.
  """
  context_length = 2 * window_size
  for sentence in corpus:
    n_words = len(sentence)
    for position, target in enumerate(sentence):
      first = position - window_size
      last = position + window_size
      neighbours = []
      for j in range(first, last + 1):
        # Skip the target itself and anything beyond the sentence bounds.
        if j == position or j < 0 or j >= n_words:
          continue
        neighbours.append(sentence[j])
      x = sequence.pad_sequences([neighbours], maxlen=context_length)
      y = np_utils.to_categorical([target], vocab_size)
      yield (x, y)
      
# One skipgrams() result per encoded sentence, using a window of 10 words.
# Each result is later read as (word pairs, relevance labels).
skip_grams = [
    skipgrams(sentence_ids, vocabulary_size=vocab_size, window_size=10)
    for sentence_ids in wids
]


### **Build, Compile, and Train the Model**

In [None]:
# Two structurally identical branches, one for each word of a skip-gram pair:
# a single id is looked up in an embedding table and reshaped to a flat
# vector of length `embed_size`.
word_model = Sequential([
    Embedding(vocab_size, embed_size,
              embeddings_initializer="glorot_uniform",
              input_length=1),
    Reshape((embed_size,)),
])
context_model = Sequential([
    Embedding(vocab_size, embed_size,
              embeddings_initializer="glorot_uniform",
              input_length=1),
    Reshape((embed_size,)),
])


In [None]:
# Combine the two embedding branches into one trainable model.
# A Sequential model stacks layers linearly and cannot merge two input
# branches, and Concatenate's first positional argument is the axis rather
# than a list of models, so the merge is expressed with the functional API.
# The branches are merged with a dot product so that the predicted relevance
# depends on how the word and context embeddings interact.
merged = Dot(axes=1)([word_model.output, context_model.output])
relevance = Dense(1, kernel_initializer="glorot_uniform", activation="sigmoid")(merged)
model = Model(inputs=[word_model.input, context_model.input], outputs=relevance)
model.compile(loss="mean_squared_error", optimizer="rmsprop")
# Train for 5 epochs. In every epoch each sentence's skip-gram pairs are fed
# to the model as one batch and the batch losses are summed into `loss`.
for epoch in range(1, 6):
  loss = 0
  for i, elem in enumerate(skip_grams):
    pairs, relevance_labels = elem
    # A sentence that produced no pairs has nothing to train on.
    if not pairs:
      continue
    # Split the (first, second) pairs into two parallel id arrays.
    first_ids, second_ids = zip(*pairs)
    pair_first_elem = np.array(first_ids, dtype='int32')
    pair_second_elem = np.array(second_ids, dtype='int32')
    labels = np.array(relevance_labels, dtype='int32')
    X = [pair_first_elem, pair_second_elem]
    Y = labels
    if i % 10000 == 0:
      print('Processed {} (skip_first, skip_second, relevance) pairs'.format(i))
    # The actual weight update; its returned loss is accumulated per epoch.
    loss += model.train_on_batch(X, Y)

  print('Epoch:', epoch, 'Loss:', loss)
