In [25]:
import nltk
import fasttext
import re
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import gutenberg

In [26]:
# Download NLTK resources (you may need to download NLTK data if not done previously)
nltk.download('punkt')
nltk.download('gutenberg')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [27]:
# Get some text data (using Gutenberg corpus as an example)
text_corpus = ""
for file_id in gutenberg.fileids():
    text_corpus += gutenberg.raw(file_id)

# Preprocessing function to clean the text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove non-alphanumeric characters and extra spaces
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(' +', ' ', text)
    return text

In [28]:
# Preprocess the text corpus
cleaned_text = preprocess_text(text_corpus)

In [29]:
# Tokenize sentences and words
sentences = sent_tokenize(cleaned_text)
tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]

In [30]:
# Save tokenized sentences to a text file
text_data_path = 'tokenized_text.txt'

In [31]:
with open(text_data_path, 'w') as text_file:
    for sentence in tokenized_sentences:
        text_file.write(' '.join(sentence) + '\n')

print(f"Tokenized text data has been written to {text_data_path}")

Tokenized text data has been written to tokenized_text.txt


In [33]:
# Training the FastText model
model = fasttext.train_unsupervised(text_data_path, model='skipgram')

# Save the trained model
model.save_model('trained_model.bin')
print("FastText model trained and saved")

FastText model trained and saved


In [34]:
# Example usage of the trained model
loaded_model = fasttext.load_model('trained_model.bin')



In [35]:
# Get the word vector for a specific word
word_vector = loaded_model.get_word_vector('word')

In [37]:
word_vector

array([ 0.14741686, -0.4035446 ,  0.10212318, -0.02559507, -0.2579022 ,
        0.4110144 ,  0.27578896, -0.12794673,  0.2722545 , -0.22372477,
       -0.12523542,  0.40178066, -0.60799015, -0.04848578, -0.29949635,
       -0.0416134 , -0.00498964,  0.04314872, -0.19523036,  0.37438512,
        0.18229593, -0.30549097,  0.34077477,  0.16452955, -0.05857192,
       -0.10769588, -0.5636373 , -0.07856709, -0.24399365, -0.20481552,
        0.00186683, -0.1715961 , -0.5540044 , -0.17661123, -0.21138766,
       -0.21934208, -0.26010048, -0.49465987,  0.0963107 ,  0.05410622,
        0.16340041, -0.0926865 ,  0.2543234 ,  0.20681542,  0.25434396,
       -0.3932816 ,  0.31264868, -0.03619537, -0.11160152, -0.02666578,
        0.6115323 ,  0.27803785, -0.37732777, -0.6420789 ,  0.0678767 ,
       -0.06797723, -0.3240654 , -0.3361098 ,  0.0506219 , -0.09752444,
       -0.31721026,  0.20631003,  0.17398131, -0.61781275,  0.10100478,
        0.36431026, -0.0202862 , -0.16360347,  0.16964242, -0.08

In [36]:
# Find the most similar words to a given word
similar_words = loaded_model.get_nearest_neighbors('word')
print("Similar words to 'word':", similar_words)


Similar words to 'word': [(0.7850816249847412, 'byword'), (0.7579824328422546, 'words'), (0.6917113065719604, 'syllable'), (0.6844764351844788, 'it'), (0.6827720403671265, 'spoke'), (0.6763255596160889, 'trick'), (0.6746193766593933, 'monologue'), (0.6743306517601013, 'spell'), (0.667864978313446, 'spokes'), (0.6638267636299133, 't')]
