In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.probability import FreqDist
from nltk.corpus import wordnet
from nltk import pos_tag

In [2]:
def preprocess_text(text):
    """Split ``text`` into sentences of cleaned, lemmatized word tokens.

    The text is sentence-tokenized, then each sentence is word-tokenized.
    A token is kept only if it is purely alphabetic and its lowercase form
    is not an English stopword; kept tokens are lowercased and lemmatized.

    Args:
        text: Raw input string.

    Returns:
        A list with one entry per detected sentence, each entry being a list
        of lemmatized lowercase tokens. A sentence whose tokens are all
        filtered out yields an empty list.
    """
    # Build the stopword lookup once per call. The list is loop-invariant, and
    # a set gives constant-time membership tests instead of rebuilding and
    # scanning the whole list for every token.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    preprocessed_sentences = []
    for sentence in sent_tokenize(text):
        kept_tokens = []
        for word in word_tokenize(sentence):
            lowered = word.lower()
            # Drop punctuation/numbers (non-alphabetic tokens) and stopwords.
            if word.isalpha() and lowered not in stop_words:
                kept_tokens.append(lemmatizer.lemmatize(lowered))
        preprocessed_sentences.append(kept_tokens)
    return preprocessed_sentences


In [3]:
from gensim.models import Word2Vec

def train_word_embeddings(preprocessed_text, vector_size=100, window=5,
                          min_count=1, workers=4):
    """Train a Word2Vec model on tokenized sentences.

    Uses the ``vector_size`` keyword (the gensim 4.x name for the embedding
    dimensionality; the older ``size`` keyword is rejected by that API), which
    matches how the rest of this notebook constructs ``Word2Vec``.

    Args:
        preprocessed_text: Iterable of sentences, each a list of string tokens.
        vector_size: Dimensionality of the word vectors.
        window: Context window size passed to ``Word2Vec``.
        min_count: Minimum token frequency passed to ``Word2Vec``.
        workers: Number of worker threads passed to ``Word2Vec``.

    Returns:
        The trained ``Word2Vec`` model.
    """
    model = Word2Vec(
        sentences=preprocessed_text,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        workers=workers,
    )
    return model


In [4]:
def get_word_vector(word, model):
    """Look up the embedding for ``word`` in ``model.wv``.

    Args:
        word: Key to look up.
        model: Object exposing a ``wv`` attribute that supports ``wv[word]``.

    Returns:
        The value stored under ``word``, or ``None`` when the lookup raises
        ``KeyError`` (the word is not in the vocabulary).
    """
    try:
        embedding = model.wv[word]
    except KeyError:
        # Out-of-vocabulary word: signal absence instead of raising.
        embedding = None
    return embedding


In [10]:
from gensim.models import Word2Vec

def print_embeddings(title, w2v_model):
    """Print ``title`` followed by every vocabulary word and its vector."""
    print(title)
    # Only the words are needed here; the index values of key_to_index are
    # not used for the lookup.
    for vocab_word in w2v_model.wv.key_to_index:
        print(vocab_word, w2v_model.wv[vocab_word])


# Preprocess text and train initial embeddings
text = "This is an example sentence."
preprocessed_text = preprocess_text(text)
model = Word2Vec(sentences=preprocessed_text, vector_size=100, window=5, min_count=1, workers=4)

# Word -> index mapping of the initial vocabulary.
# NOTE(review): this is the model's own mapping object, not a snapshot of the
# vectors; confirm whether later cells expect it to stay frozen.
initial_embeddings = model.wv.key_to_index

# Print initial embeddings
print_embeddings("Initial Embeddings:", model)

# Update embeddings with new text: extend the vocabulary first, then continue
# training on the new sentences only.
new_text = "This is a new sentence."
new_preprocessed_text = preprocess_text(new_text)
model.build_vocab(new_preprocessed_text, update=True)
model.train(new_preprocessed_text, total_examples=model.corpus_count, epochs=model.epochs)

# Word -> index mapping after the vocabulary update
updated_embeddings = model.wv.key_to_index

# Print updated embeddings
print_embeddings("\nUpdated Embeddings:", model)


Initial Embeddings:
sentence [-5.3622725e-04  2.3643136e-04  5.1033497e-03  9.0092728e-03
 -9.3029495e-03 -7.1168090e-03  6.4588725e-03  8.9729885e-03
 -5.0154282e-03 -3.7633716e-03  7.3805046e-03 -1.5334714e-03
 -4.5366134e-03  6.5540518e-03 -4.8601604e-03 -1.8160177e-03
  2.8765798e-03  9.9187379e-04 -8.2852151e-03 -9.4488179e-03
  7.3117660e-03  5.0702621e-03  6.7576934e-03  7.6286553e-04
  6.3508903e-03 -3.4053659e-03 -9.4640139e-04  5.7685734e-03
 -7.5216377e-03 -3.9361035e-03 -7.5115822e-03 -9.3004224e-04
  9.5381187e-03 -7.3191668e-03 -2.3337686e-03 -1.9377411e-03
  8.0774371e-03 -5.9308959e-03  4.5162440e-05 -4.7537340e-03
 -9.6035507e-03  5.0072931e-03 -8.7595852e-03 -4.3918253e-03
 -3.5099984e-05 -2.9618145e-04 -7.6612402e-03  9.6147433e-03
  4.9820580e-03  9.2331432e-03 -8.1579173e-03  4.4957981e-03
 -4.1370760e-03  8.2453608e-04  8.4986202e-03 -4.4621765e-03
  4.5175003e-03 -6.7869602e-03 -3.5484887e-03  9.3985079e-03
 -1.5776526e-03  3.2137157e-04 -4.1406299e-03 -7.6826881