In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from gensim.models import Word2Vec
import string

# Download the NLTK resources used below. 'punkt' is what older NLTK releases
# load for sent_tokenize/word_tokenize; recent releases (3.9+) look for
# 'punkt_tab' instead, so both are requested to work on either version.
# 'stopwords' provides the English stopword list.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Data preparation
def clean_and_tokenize(text):
    """Split *text* into sentences of lowercased, punctuation-free word tokens.

    Args:
        text: Raw input string.

    Returns:
        A list with one entry per sentence found by ``sent_tokenize``. Each
        entry is the list of that sentence's ``word_tokenize`` tokens,
        lowercased, with punctuation-only tokens removed.
    """
    punctuation = set(string.punctuation)
    cleaned = []
    for sentence in sent_tokenize(text):
        # A token is dropped when every one of its characters is punctuation.
        # Testing character by character also removes multi-character tokens
        # such as "..." or "--", which a substring test against
        # string.punctuation would let through.
        words = [
            token.lower()
            for token in word_tokenize(sentence)
            if not all(char in punctuation for char in token)
        ]
        cleaned.append(words)
    return cleaned

def remove_stopwords(tokens):
    """Drop English stopwords from every tokenized sentence.

    Args:
        tokens: List of sentences, each a list of word strings.

    Returns:
        A new list of sentences containing only the words whose lowercase
        form is not in NLTK's English stopword list.
    """
    english_stopwords = set(stopwords.words('english'))
    filtered = []
    for sentence in tokens:
        kept = [token for token in sentence if token.lower() not in english_stopwords]
        filtered.append(kept)
    return filtered

def preprocess_text(text):
    """Tokenize *text* and strip stopwords in one step.

    Args:
        text: Raw input string.

    Returns:
        The result of ``clean_and_tokenize(text)`` passed through
        ``remove_stopwords``.
    """
    return remove_stopwords(clean_and_tokenize(text))

# Generate training data
def generate_cbow_data(tokens, window_size=2):
    """Build CBOW-style training examples from tokenized sentences.

    Args:
        tokens: Iterable of sentences, each an indexable sequence of words.
        window_size: Number of neighbouring words taken on each side of the
            target word.

    Returns:
        A ``(data, vocabulary)`` tuple. ``data`` holds one list per word
        occurrence: the context words in sentence order followed by the
        target word. ``vocabulary`` is the sorted list of distinct words, so
        its order is the same on every run.
    """
    data = []
    vocabulary = set()

    for sentence in tokens:
        # Every word of the sentence is a target once, so adding the whole
        # sentence covers all contexts and targets.
        vocabulary.update(sentence)
        for i, target_word in enumerate(sentence):
            first = max(0, i - window_size)
            last = min(len(sentence), i + window_size + 1)
            context = [sentence[j] for j in range(first, last) if j != i]
            # Context and target are stored together as a single list.
            data.append(context + [target_word])

    # Sorting replaces the arbitrary iteration order of a set of strings.
    return data, sorted(vocabulary)

# Train model
def train_cbow_model(data, unique_words, embedding_size=100, epochs=100, window=2):
    """Train a gensim Word2Vec model in CBOW mode on *data*.

    Args:
        data: Training sentences, each a list of word strings.
        unique_words: Not used by the training; kept in the signature so
            existing callers keep working.
        embedding_size: Dimensionality of the word vectors, passed to
            Word2Vec as ``vector_size``.
        epochs: Number of training passes over *data*.
        window: Context window size passed to Word2Vec. Defaults to 2, the
            value that used to be hard-coded.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # sg=0 selects the CBOW architecture; min_count=1 keeps every word,
    # including those that occur only once.
    model = Word2Vec(vector_size=embedding_size, window=window, sg=0, min_count=1)
    model.build_vocab(data)
    model.train(data, total_examples=model.corpus_count, epochs=epochs)
    return model

# Output
def find_similar_words(model, word, topn=5, exclude_words=None):
    """Return up to *topn* words most similar to *word*, skipping exclusions.

    Args:
        model: Trained model exposing ``wv.most_similar(word, topn=...)``.
        word: Query word.
        topn: Maximum number of ``(word, similarity)`` pairs to return.
        exclude_words: Optional iterable of words to leave out of the result.
            ``None`` (the default) excludes nothing.

    Returns:
        A list of ``(word, similarity)`` pairs in the order given by
        ``most_similar``, with excluded words removed.
    """
    excluded = set(exclude_words) if exclude_words else set()
    # Over-fetch by the number of exclusions so that topn results can remain
    # after filtering.
    candidates = model.wv.most_similar(word, topn=topn + len(excluded))
    kept = [(w, s) for w, s in candidates if w not in excluded]
    return kept[:topn]

def predict_word(model, context_words, exclude_words=None):
    """Predict a word from the summed vectors of the given context words.

    Args:
        model: Trained model whose ``wv`` supports membership tests,
            indexing by word, ``len()`` and ``similar_by_vector``.
        context_words: Words whose vectors are summed to form the query.
        exclude_words: Optional iterable of words that are neither used as
            context nor returned. ``None`` (the default) excludes nothing.

    Returns:
        The first word returned by ``similar_by_vector`` for the summed
        vector that is neither excluded nor one of the context words, or
        ``None`` when no context word is usable, the summed vector is all
        zeros, or every candidate is filtered out.
    """
    context_words = list(context_words)
    excluded = set(exclude_words) if exclude_words else set()
    punctuation = set(string.punctuation)

    # Use only context words that are in the vocabulary, are not made up
    # purely of punctuation, and are not excluded.
    vectors = [
        model.wv[word]
        for word in context_words
        if word in model.wv
        and not all(char in punctuation for char in word)
        and word not in excluded
    ]
    # Without this guard sum([]) is the int 0, which has no .any().
    if not vectors:
        return None

    vector_sum = sum(vectors)
    if not vector_sum.any():
        return None

    # Ask for the whole vocabulary so filtering cannot exhaust the candidates.
    skipped = excluded.union(context_words)
    for candidate, _ in model.wv.similar_by_vector(vector_sum, topn=len(model.wv)):
        if candidate not in skipped:
            return candidate
    return None

# Demo inputs: sample corpus, query word, context words and tokens to ignore
text = "There was a lion, bird and rat in jungle. Monkey was sitting on tree. Rabbit was below the tree."
given_word = "jungle"
example_context_words = ['lion', 'bird', 'rat']
exclude_punctuation = list(string.punctuation)

# Preprocess the corpus and show the resulting sentences
tokens = preprocess_text(text)
print(tokens)
print("\n")

# Build the CBOW training examples and fit the model on them
cbow_data, unique_words = generate_cbow_data(tokens)
cbow_model = train_cbow_model(cbow_data, unique_words, epochs=100)

# Nearest neighbours of the query word
similar_words = find_similar_words(cbow_model, given_word, exclude_words=exclude_punctuation)
print(f"Similar words to '{given_word}': {similar_words}")

# Word predicted from the example context
predicted_word = predict_word(cbow_model, example_context_words, exclude_words=exclude_punctuation)
print(f"Predicted word from context words {example_context_words}: {predicted_word}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


[['lion', 'bird', 'rat', 'jungle'], ['monkey', 'sitting', 'tree'], ['rabbit', 'tree']]


Similar words to 'jungle': [('sitting', 0.13919983804225922), ('monkey', 0.13157564401626587), ('bird', 0.06499496847391129), ('lion', 0.020040670409798622), ('rat', 0.010431213304400444)]
Predicted word from context words ['lion', 'bird', 'rat']: sitting
