In [None]:
!pip install tensorflow 

In [3]:
p=[]
with open("/kaggle/input/sherleck-books/a_study_in_scarlet.txt","r") as f:
    p=f.read()


In [4]:
import nltk
nltk.download('punkt')

text = p

tokens = nltk.word_tokenize(text.lower())

vocab = {word: idx for idx, word in enumerate(set(tokens))}
vocab_size = len(vocab)

#print("Vocabulary:", vocab)


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Generate (target, context) pairs for Skip-gram using a simple sliding window
window_size = 2
skip_gram_pairs = []

for i, target in enumerate(tokens):
    for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
        if i != j:
            skip_gram_pairs.append((vocab[target], vocab[tokens[j]]))

#print("Skip-gram pairs:", skip_gram_pairs)

# For CBOW, prepare (context, target) pairs:
cbow_pairs = []
for i, target in enumerate(tokens):
    context = []
    for j in range(max(0, i - window_size), min(len(tokens), i + window_size + 1)):
        if i != j:
            context.append(vocab[tokens[j]])
    if context:
        cbow_pairs.append((context, vocab[target]))

#print("CBOW pairs:", cbow_pairs)


In [None]:
import tensorflow as tf
from tensorflow.keras import layers, Model

embedding_dim = 50

# Define a simple Skip-gram model
class SkipGramModel(Model):
    def __init__(self, vocab_size, embedding_dim):
        super(SkipGramModel, self).__init__()
        self.embedding = layers.Embedding(vocab_size, embedding_dim, input_length=1)
        self.dense = layers.Dense(vocab_size, activation='softmax')
    
    def call(self, inputs):
        x = self.embedding(inputs)
        x = tf.reshape(x, (-1, embedding_dim))
        return self.dense(x)

# Create model
skipgram_model = SkipGramModel(vocab_size, embedding_dim)
skipgram_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Example training data for Skip-gram
import numpy as np
# Separate targets and contexts from pairs
targets = np.array([pair[0] for pair in skip_gram_pairs])
contexts = np.array([pair[1] for pair in skip_gram_pairs])

# Train the model (for demonstration, use a small number of epochs)
skipgram_model.fit(targets, contexts, epochs=10, batch_size=2)




Epoch 1/10
[1m111895/111895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m124s[0m 1ms/step - loss: 6.8623
Epoch 2/10
[1m  3790/111895[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m1:55[0m 1ms/step - loss: 6.5489

In [None]:
# Define a simple CBOW model: average context embeddings to predict the target word.
class CBOWModel(Model):
    def __init__(self, vocab_size, embedding_dim):
        super(CBOWModel, self).__init__()
        self.embedding = layers.Embedding(vocab_size, embedding_dim, input_length=None)
        self.dense = layers.Dense(vocab_size, activation='softmax')
    
    def call(self, inputs):
        # inputs shape: (batch_size, context_window)
        x = self.embedding(inputs)
        # Average over context words
        x = tf.reduce_mean(x, axis=1)
        return self.dense(x)

cbow_model = CBOWModel(vocab_size, embedding_dim)
cbow_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Prepare CBOW training data: for simplicity, pad context list to fixed size (here: window_size*2)
import numpy as np

max_context = window_size * 2
def pad_context(context_list, max_len):
    return context_list + [0]*(max_len - len(context_list))

contexts_cbow = np.array([pad_context(c, max_context) for c, _ in cbow_pairs])
targets_cbow = np.array([target for _, target in cbow_pairs])

cbow_model.fit(contexts_cbow, targets_cbow, epochs=10, batch_size=2)


In [None]:
import math

# Calculate perplexity for the Skip-gram model
skip_loss = skipgram_model.evaluate(targets, contexts, verbose=0)
skip_perplexity = math.exp(skip_loss)
print("Skip-gram Perplexity:", skip_perplexity)

# Calculate perplexity for the CBOW model
cbow_loss = cbow_model.evaluate(contexts_cbow, targets_cbow, verbose=0)
cbow_perplexity = math.exp(cbow_loss)
print("CBOW Perplexity:", cbow_perplexity)


In [None]:
import numpy as np

# Choose a sample target word and get its index
sample_word = "sofa"
sample_index = vocab[sample_word] if sample_word in vocab else list(vocab.values())[0]

# Predict context word probabilities from the sample target
predicted_probs = skipgram_model.predict(np.array([sample_index]))
predicted_context_index = np.argmax(predicted_probs, axis=-1)[0]

# Find the corresponding word from the vocabulary
predicted_context_word = [word for word, idx in vocab.items() if idx == predicted_context_index][0]
print("Skip-gram prediction - For target word '{}', predicted context word: '{}'".format(sample_word, predicted_context_word))


In [None]:
# Use the first CBOW pair as a sample
sample_context_indices, true_target_index = cbow_pairs[0]

# Pad the context to fixed length (max_context defined earlier)
sample_context_padded = pad_context(sample_context_indices, max_context)
sample_context_padded = np.array([sample_context_padded])

# Predict target word probabilities from the context
predicted_target_probs = cbow_model.predict(sample_context_padded)
predicted_target_index = np.argmax(predicted_target_probs, axis=-1)[0]

# Find the corresponding word from the vocabulary
predicted_target_word = [word for word, idx in vocab.items() if idx == predicted_target_index][0]
print("CBOW prediction - For context words {} , predicted target word: '{}'".format(sample_context_indices, predicted_target_word))
