In [3]:
import gensim
from gensim.models import Word2Vec, fasttext, basemodel
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re,string


In [4]:
sentences = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is transforming the world we live in.",
    "Deep learning techniques have greatly improved image recognition.",
    "Natural language processing allows computers to understand human language.",
    "Data science combines statistics, computer science, and domain knowledge.",
    "The weather is nice today, perfect for a walk in the park.",
    "Cats are often seen as independent and curious creatures.",
    "The stock market fluctuates based on various economic indicators.",
    "Exploring new cuisines can be an exciting culinary adventure.",
    "Machine learning algorithms can learn from data and make predictions.",
]

In [5]:
cleaned_sentences = []
for sentence in sentences:
    sentence = sentence.lower()
    sentence = re.sub(r'[' + string.punctuation + ']', '', sentence)  # Remove punctuation
    words = sentence.split()
    cleaned_sentences.append(words)

print(cleaned_sentences) 

[['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'], ['artificial', 'intelligence', 'is', 'transforming', 'the', 'world', 'we', 'live', 'in'], ['deep', 'learning', 'techniques', 'have', 'greatly', 'improved', 'image', 'recognition'], ['natural', 'language', 'processing', 'allows', 'computers', 'to', 'understand', 'human', 'language'], ['data', 'science', 'combines', 'statistics', 'computer', 'science', 'and', 'domain', 'knowledge'], ['the', 'weather', 'is', 'nice', 'today', 'perfect', 'for', 'a', 'walk', 'in', 'the', 'park'], ['cats', 'are', 'often', 'seen', 'as', 'independent', 'and', 'curious', 'creatures'], ['the', 'stock', 'market', 'fluctuates', 'based', 'on', 'various', 'economic', 'indicators'], ['exploring', 'new', 'cuisines', 'can', 'be', 'an', 'exciting', 'culinary', 'adventure'], ['machine', 'learning', 'algorithms', 'can', 'learn', 'from', 'data', 'and', 'make', 'predictions']]


In [6]:
# Initialize the Word2Vec model
model = Word2Vec(sentences=cleaned_sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Train the model
model.train(sentences, total_words=len(sentences), epochs=10)  # Adjust epochs according to your needs

# Save the model
model.save('word2vec_model.model')


In [7]:
print(model)

Word2Vec<vocab=79, vector_size=100, alpha=0.025>


In [10]:
# Get word vectors
#print(model.wv['cats'])  # Example: Vector for 'cat'

# Find similar words
similar_words = model.wv.most_similar('learning', topn=5)
#print(similar_words)
for word, similarity in similar_words:
    print(f"{word}: {similarity}")

are: 0.2048470377922058
quick: 0.178854301571846
often: 0.16478683054447174
over: 0.1647077351808548
economic: 0.14939117431640625


In [11]:
from gensim.models import FastText
from gensim.utils import simple_preprocess


In [12]:
model_ft = FastText(
    sentences=cleaned_sentences,
    vector_size=100,  # Desired dimensionality of word vectors
    window=5,         # Context window size
    min_count=1,      # Ignore words with frequency lower than this
    epochs=10,        # Number of training epochs
    min_n=3,          # Minimum length of character n-grams
    max_n=6           # Maximum length of character n-grams
)

In [13]:
print("Training complete!")

# You can save the trained model for later use
model_ft.save("fasttext.model")

# Load a saved model
# model_ft = FastText.load("fasttext.model")

Training complete!


In [14]:
print(model_ft)

FastText<vocab=79, vector_size=100, alpha=0.025>


In [15]:
# Get word vectors
print(model_ft.wv['cats'])  # Example: Vector for 'cat'

# Find similar words
similar_words = model_ft.wv.most_similar('computer', topn=5)
#print(similar_words)
for word, similarity in similar_words:
    print(f"{word}: {similarity}")

[ 7.40063959e-04 -2.22702743e-03  1.66845959e-04 -1.64000870e-04
  1.65959343e-03  3.54314980e-04 -3.48753319e-03  1.06613315e-03
  7.31494336e-04  1.77278742e-03  3.04223904e-05  3.72497103e-04
  1.14309916e-03  1.37822842e-03  9.23539745e-04  1.01227791e-03
 -1.92596810e-03  1.67554687e-03  1.15786889e-03 -6.70962036e-04
 -6.20148086e-04  2.39434652e-03 -1.82943651e-03  8.85732297e-04
  7.80585571e-04 -2.92350003e-03 -1.45963405e-03 -2.91437586e-03
  4.74989152e-04  2.66354182e-03  1.97057589e-03  7.36803340e-04
  8.06871103e-04  1.70344440e-03  2.50757835e-03 -4.18425491e-03
 -3.95614450e-04 -2.22398108e-03 -1.00545911e-03 -8.91898701e-04
  5.23301831e-04 -1.25129125e-03 -3.09557217e-05  1.34247623e-03
 -1.92352501e-03  2.91019300e-04  1.28291396e-03  1.59886281e-03
 -1.15333503e-04 -2.20665243e-03  7.30930420e-04 -3.67990159e-03
  1.75600871e-05  2.17235205e-03 -1.32606074e-03  2.02091993e-03
  2.12951051e-03  1.74516335e-03  2.77338899e-03 -7.24400976e-04
  6.15922982e-05  4.83805

In [16]:
print(model_ft.wv['brown']) 

[ 1.2467726e-03  1.4294123e-03 -5.3756579e-04 -1.6706369e-03
  2.6824910e-04 -1.3063306e-03 -4.4282558e-04 -2.5285671e-03
  2.6834519e-03  1.5831494e-03 -5.9941708e-04  1.2274151e-04
 -8.8621564e-05 -3.4938713e-03 -1.0953621e-03  1.6996145e-03
 -1.0023414e-03  1.7837378e-03 -1.5263911e-03 -1.5787021e-03
  6.9531606e-04 -6.5853738e-04  6.5536186e-04 -1.3626592e-03
 -5.0497160e-04  6.0532318e-04 -3.0028724e-04 -1.9711591e-03
  5.6647381e-04 -6.7858701e-04  1.2829925e-03 -2.0296269e-03
 -3.7227219e-03 -3.0457885e-03 -4.7996014e-04 -1.1163530e-03
  1.8976571e-03  1.5967693e-04  3.2887158e-03  2.2398203e-03
  6.6891115e-04 -2.7861774e-03 -4.0591569e-04 -1.4465790e-03
  9.1799261e-04 -1.3588240e-03  2.1983203e-03 -1.1000640e-04
  2.8750710e-03  5.6421885e-04 -8.8894158e-04  7.3919469e-04
 -2.6106045e-03  8.1918691e-04  1.2076354e-03  2.0838410e-03
 -2.5062734e-04 -1.4046145e-03 -2.2669781e-03  2.3914550e-03
  1.0263665e-03 -1.7516873e-03 -1.0460396e-03  1.0417083e-03
  2.5432170e-03  1.28845

In [13]:
sentence1 = 'cat in the wall'
sentence2= 'the weather is cloudy today'
# Find the most similar words to 'sentence'
similar_words = model_ft.wv.most_similar(sentence2, topn=5)
print(f"Words most similar to 'sentence':\n{similar_words}\n")

# Find most similar words, even for OOV words
similar_oov = model_ft.wv.most_similar('learning computer for one year', topn=5)
print(f"Words most similar to OOV word 'anothersentenceexample':\n{similar_oov}\n")

Words most similar to 'sentence':
[('the', 0.3163280189037323), ('walk', 0.3010618984699249), ('today', 0.2275698035955429), ('dog', 0.20017848908901215), ('intelligence', 0.17871858179569244)]

Words most similar to OOV word 'anothersentenceexample':
[('computer', 0.4790094196796417), ('computers', 0.4531535804271698), ('learning', 0.4286929666996002), ('learn', 0.33162397146224976), ('today', 0.2413749247789383)]



In [16]:
sentence1 = 'cat in the wall'
# Calculate the similarity between two words
similarity = model_ft.wv.similarity(sentence1, 'rat')
print(f"Similarity between 'sentence' and 'word2vec': {similarity}\n")
sentence2= 'the weather is cloudy today'
similarity_oov = model_ft.wv.similarity(sentence2, 'weather')
print(f"Similarity between 'sentence' and OOV word 'totallynewword': {similarity_oov}\n")


Similarity between 'sentence' and 'word2vec': 0.02959563583135605

Similarity between 'sentence' and OOV word 'totallynewword': 0.1776554435491562



In [76]:
sim_score = model_ft.wv.similarity('man', 'human')

In [67]:
print(sim_score)

-0.103488185


### Code from internet using chatgpt

In [20]:
from gensim.models import FastText
from gensim.test.utils import common_texts  # Example corpus
from pprint import pprint

# Example corpus (you can use your own preprocessed text data)
print("Sample corpus:")
pprint(common_texts)

# Train FastText model
model = FastText(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4, sg=1)

# Save and load the model
model.save("fasttext.model")
model = FastText.load("fasttext.model")

# Example: Get vector for a word
word_vector = model.wv['computer']
print("\nVector for 'computer':\n", word_vector)

# Find similar words
similar_words = model.wv.most_similar('computer')
print("\nWords similar to 'computer':")
pprint(similar_words)

# Handle out-of-vocabulary (OOV) word
oov_vector = model.wv['compuuter']  # misspelled
print("\nVector for misspelled 'compuuter':\n", oov_vector)

Sample corpus:
[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

Vector for 'computer':
 [ 2.96936167e-04  3.31060466e-04 -8.77768325e-04  3.39444174e-04
 -5.01747418e-04 -2.04214524e-03 -1.24066719e-03 -1.94044539e-03
  1.34510931e-03 -2.41268426e-03  9.18505422e-04 -1.03151030e-03
 -7.63410062e-04  7.31222244e-05  1.38286629e-03  5.19435504e-04
 -2.98849802e-04 -1.19464763e-03 -1.17238448e-03 -6.08951552e-04
 -6.78338984e-04  3.92779708e-04  9.88251195e-05  8.12689308e-04
  5.81971311e-04  7.01953366e-04 -7.36806658e-04 -1.03962549e-03
 -6.25258312e-04 -2.40496884e-04 -1.19316357e-03 -2.65940849e-04
  7.36046524e-04 -7.21505727e-04 -1.27508014e-03  1.24231781e-04
  3.77583550e-04 -1.33155228e-03 -2.73441360e-03 -3.04829708e-04
  9