In [1]:
import re

import numpy as np
from tensorflow.keras.layers import Dense, Embedding, Flatten, GlobalAveragePooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer

# Sample data
data = """Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised. Deep-learning architectures such as deep neural networks, deep belief networks, deep reinforcement learning, recurrent neural networks, convolutional neural networks and Transformers have been applied to fields including computer vision, speech recognition, natural language processing, machine translation, bioinformatics, drug design, medical image analysis, climate science, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance."""

# Data preparation
# The text is lower-cased and split into sentences on '.'. Fragments that are
# blank after stripping (e.g. the empty string after the final period) are
# skipped; every character that is not a letter or whitespace is removed from
# the rest, and surrounding whitespace is trimmed.
# NOTE: hyphens are deleted, not replaced by a space, so "semi-supervised"
# becomes the single word "semisupervised".
# The pattern is a raw string: '\s' inside a plain string literal is an
# invalid escape sequence and triggers a SyntaxWarning on Python 3.12+.
non_alpha_pattern = re.compile(r'[^a-zA-Z\s]')
sentences = data.lower().split('.')
clean_sent = [
    non_alpha_pattern.sub('', sentence).strip()
    for sentence in sentences
    if sentence.strip()
]

# Hyperparameters: width of each embedding vector, and how many context words
# are taken from each side of a target word.
emb_size = 10
context_size = 2

# Fit the tokenizer on the cleaned sentences, then encode every sentence as a
# list of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_sent)
sequences = tokenizer.texts_to_sequences(clean_sent)

# Reverse lookup (id -> word), built by inverting the tokenizer's
# word -> id mapping.
index_to_word = dict(
    (word_id, word) for word, word_id in tokenizer.word_index.items()
)
# One more than the number of distinct words; presumably because Keras word
# ids start at 1, leaving id 0 unused -- confirm against the Tokenizer docs.
vocab_size = 1 + len(tokenizer.word_index)

# Build the CBOW training pairs. Every word that has `context_size`
# neighbours on both sides becomes a target; its context is those neighbours,
# left-hand ones first and then right-hand ones, with the target itself
# left out.
contexts = []
targets = []

for sequence in sequences:
    # Centres run from the first word with a full left window up to (but not
    # including) the first word whose right window would fall off the end.
    stop = len(sequence) - context_size
    for center in range(context_size, stop):
        left = sequence[center - context_size:center]
        right = sequence[center + 1:center + context_size + 1]
        contexts.append([*left, *right])
        targets.append(sequence[center])

# Convert the collected pairs to numpy arrays for training.
X = np.array(contexts)
Y = np.array(targets)

# Model training
# CBOW: embed each context word, average the context embeddings into a single
# vector, and predict the target word with a softmax over the vocabulary.
model = Sequential()
# `input_length` is intentionally not passed: it is deprecated in Keras 3, and
# the averaging layer below does not need a fixed sequence length.
model.add(Embedding(input_dim=vocab_size, output_dim=emb_size))
# Average over the context axis instead of flattening. Flattening concatenates
# the embeddings and makes the prediction depend on word position, which is
# not a bag-of-words model.
model.add(GlobalAveragePooling1D())

model.add(Dense(vocab_size, activation='softmax'))

# Targets in Y are integer word ids, hence the sparse categorical loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, Y, epochs=2, verbose=1)

# Print context and target pairs after training
# Show at most the first 5 examples. Slicing (rather than indexing with
# range(5)) avoids an IndexError when fewer than five pairs were generated.
for context_ids, target_id in zip(contexts[:5], targets[:5]):
    words = [index_to_word[j] for j in context_ids]
    target = index_to_word[target_id]
    print(f"Context words: {words} -> Target word: {target}")

# Output
print("Model training completed.")




Epoch 1/2




3/3 ━━━━━━━━━━━━━━━━━━━━ 2s 11ms/step - accuracy: 0.0526 - loss: 4.3291
Epoch 2/2
3/3 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - accuracy: 0.0506 - loss: 4.3232
Context words: ['deep', 'learning', 'known', 'as'] -> Target word: also
Context words: ['learning', 'also', 'as', 'deep'] -> Target word: known
Context words: ['also', 'known', 'deep', 'structured'] -> Target word: as
Context words: ['known', 'as', 'structured', 'learning'] -> Target word: deep
Context words: ['as', 'deep', 'learning', 'is'] -> Target word: structured
Model training completed.
