In [1]:
!pip install FastText

Collecting FastText
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from FastText)
  Using cached pybind11-3.0.0-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.0-py3-none-any.whl (292 kB)
Building wheels for collected packages: FastText
  Building wheel for FastText (pyproject.toml) ... [?25l[?25hdone
  Created wheel for FastText: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4508437 sha256=009c1c12889218767018dc4ca3f37d88915b6170ea24821bd6ef0c60912b6950
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a513fa6b79451473ceb7713017823c3
Successfully built FastText
Installing collected packages: pybind11, FastText
Successf

In [5]:
# step 1: importing the libraries
import fasttext
import os
import joblib

In [4]:
# step 2: Creating Training Data
def create_sample_data(output_path="training_data.txt"):
  """Write the small demo corpus to a text file, one lowercased sentence per line.

  Args:
    output_path: Destination file. Defaults to "training_data.txt" in the
      current working directory. An existing file is overwritten.

  Returns:
    None. The function writes the file and prints a confirmation message.
  """
  # sample sentences for training
  sentences = [
      "The king rules the kingdom",
      "The queen helps the king",
      "Running is good for excercise",
      "Walking is healthy activity",
      "The walker walks slowly",
      "Reading books is fun",
      "The reader reads daily"
  ]

  # Explicit UTF-8 so the corpus bytes do not depend on the platform's
  # default locale encoding.
  with open(output_path, "w", encoding="utf-8") as f:
    f.writelines(sentence.lower() + "\n" for sentence in sentences)

  print(f"Training data created in '{output_path}'")

create_sample_data()

Training data created in 'training_data.txt'


In [10]:
# step 3: Training a basic fast text model
def train_simple_model(input_path="training_data.txt",
                       output_path="word_vectors.bin",
                       dim=50, epoch=30, min_count=1, minn=3, maxn=6):
  """Train an unsupervised skipgram fastText model and save it to disk.

  Args:
    input_path: Text file passed to fasttext.train_unsupervised.
    output_path: File the trained model is saved to via save_model.
    dim: Passed as `dim` (embedding dimension).
    epoch: Passed as `epoch` (number of training epochs).
    min_count: Passed as `minCount` (minimum word frequency to keep a word).
    minn: Passed as `minn` (minimum character n-gram length).
    maxn: Passed as `maxn` (maximum character n-gram length).

  Returns:
    The model object returned by fasttext.train_unsupervised.
  """
  # Defaults reproduce the original hard-coded setup:
  # skipgram, dim=50, epoch=30, minCount=1, n-grams of length 3..6.
  model = fasttext.train_unsupervised(
      input_path,
      model="skipgram",
      dim=dim,
      epoch=epoch,
      minCount=min_count,
      minn=minn,
      maxn=maxn,
  )

  model.save_model(output_path)
  print(f"Model trained and saved as '{output_path}'")
  return model

train_simple_model()

Model trained and saved as 'word_vectors.bin'


<fasttext.FastText._FastText at 0x7bbb4de77a10>

In [12]:
model = train_simple_model()

Model trained and saved as 'word_vectors.bin'


In [19]:
# step 4: getting word vectors
def get_word_vectors(model):
  """Print a preview of the embeddings for 'king' and 'kingdom'.

  For each of the two words, in that order, the vector is looked up with
  model.get_word_vector, then its first five components and its `.shape`
  are printed.

  Args:
    model: Object exposing get_word_vector(word); the returned vector must
      support slicing and have a `shape` attribute.
  """
  for token in ("king", "kingdom"):
    embedding = model.get_word_vector(token)
    print(f"vector for '{token}': {embedding[:5]}")
    print(f"vector shape: {embedding.shape}")

get_word_vectors(model)

vector for 'king': [-0.00018306 -0.0003302   0.00042938  0.00089004 -0.00164623]
vector shape: (50,)
vector for 'kingdom': [-0.00059493  0.00241587 -0.00059253 -0.0007504  -0.0007329 ]
vector shape: (50,)


In [24]:
# step 5: finding similar words
def find_similar_words(model, word, k=3):
  """Print the k nearest neighbours of `word`, one per line with a 0-based rank.

  Args:
    model: Object exposing get_nearest_neighbors(word, k), which yields
      (similarity, neighbour_word) pairs.
    word: Query word.
    k: Number of neighbours requested from the model (default 3).

  Any exception raised while fetching or printing the neighbours is caught
  and reported as an "error: ..." line instead of propagating.
  """
  print(f"\n words similar to '{word}'")
  try:
    rank = 0
    for score, neighbour in model.get_nearest_neighbors(word, k):
      print(f"{rank}. {neighbour} : {score}")
      rank += 1
  except Exception as e:
    print(f"error: {e}")
find_similar_words(model, 'king')
find_similar_words(model, "kingdom")


 words similar to 'king'
0. walks : 0.26921141147613525
1. healthy : 0.23639050126075745
2. rules : 0.23427143692970276

 words similar to 'kingdom'
0. good : 0.23384441435337067
1. rules : 0.22116202116012573
2. is : 0.21689122915267944


In [None]:
# # step 6 : text classification implementation
# def create_classification_data():
#   reviews=[
#       ("This movie is amazing and fun", ),
#       (),
#       (),
#   ]