In [1]:
!pip install FastText

Collecting FastText
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from FastText)
  Using cached pybind11-3.0.0-py3-none-any.whl.metadata (10.0 kB)
Using cached pybind11-3.0.0-py3-none-any.whl (292 kB)
Building wheels for collected packages: FastText
  Building wheel for FastText (pyproject.toml) ... [?25l[?25hdone
  Created wheel for FastText: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4508434 sha256=6e9044be4445a38e49054737e2a17dcd05ace93f0c437da0dca175bfef14fb81
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a513

In [2]:
# step 1: importing the libraries
import fasttext
import os
import joblib

In [5]:
# step 2: Creating Training Data
def create_sample_data():
  """Interactively build the unsupervised training corpus.

  Asks how many extra sentences to add, reads that many sentences from
  standard input, and writes the two built-in seed sentences followed by the
  non-blank user sentences to 'training_data.txt'. Every sentence is
  lower-cased and written on its own line; an existing file is overwritten.

  Raises:
    ValueError: if the count typed at the first prompt is not an integer
      (raised by ``int``).
  """
  number = int(input("enter the number of the strings you want to take as input: "))
  sentences = [
      "The king rules the kingdom",
      "The queen helps the king"
  ]
  for i in range(number):
    # Put the separator after the running index; the previous prompt rendered
    # as "enter the sentence: 1" with the typed text glued to the number.
    sentence = input(f"enter the sentence {i + 1}: ").strip()
    if sentence:  # do not write empty lines into the corpus
      sentences.append(sentence)

  # Explicit encoding so non-ASCII sentences are stored the same everywhere.
  with open("training_data.txt", "w", encoding="utf-8") as f:
    for sentence in sentences:
      f.write(sentence.lower() + "\n")

  print("Training data created in 'training_data.txt'")

create_sample_data()


enter the number of the strings you want to take as input: 3
enter the sentence: 1The cat is playing basketball
enter the sentence: 2i am eating dinner
enter the sentence: 3Can you give me football?
Training data created in 'training_data.txt'


In [7]:
def create_dummy_data():
  """Append four fixed example sentences to 'training_data.txt'.

  Each sentence is lower-cased and written on its own line. The file is
  opened in append mode, so every call adds the same four lines again.
  """
  extra_sentences = (
      "A cat is smiling",
      "I am playing basketball",
      "My mom is cooking food",
      "My brother is saying rubbish things",
  )

  with open("training_data.txt", "a") as out_file:
    out_file.writelines(text.lower() + '\n' for text in extra_sentences)

  print("Training data appended in training_data.txt")

create_dummy_data()

Training data created in training_data.txt


In [11]:
# step 3: Training a basic fast text model
def train_skipgram_model():
  """Train a skip-gram fastText model on 'training_data.txt' and save it.

  Settings passed to ``fasttext.train_unsupervised``:
    model    = "skipgram"
    dim      = 50  (embedding dimension)
    epoch    = 30  (number of training epochs)
    minCount = 1   (minimum word frequency for a word to get an embedding)
    minn     = 3   (minimum character n-gram length)
    maxn     = 6   (maximum character n-gram length)

  Returns:
    The trained model object; it is also saved to 'word_vectors.bin'.
  """
  model = fasttext.train_unsupervised(
      "training_data.txt",
      model="skipgram",
      dim=50,
      epoch=30,
      minCount=1,
      minn=3,
      maxn=6,
  )
  model.save_model("word_vectors.bin")
  print("Model trained and saved as 'word_vectors.bin'")
  return model

train_skipgram_model()

Model trained and saved as 'word_vectors.bin'


<fasttext.FastText._FastText at 0x7bae4535b150>

In [10]:
def train_cbow_model():
  """Train a CBOW fastText model on 'training_data.txt' and save it.

  This is the CBOW counterpart of the skip-gram trainer, so the two
  embedding models can be compared.

  Settings passed to ``fasttext.train_unsupervised``:
    model    = "cbow"
    dim      = 50  (embedding dimension)
    minCount = 1   (minimum word frequency)
    minn     = 3   (minimum character n-gram length)
    maxn     = 6   (maximum character n-gram length)

  No ``epoch`` is passed, so the library default applies here, whereas the
  skip-gram trainer passes ``epoch=30``.

  Returns:
    The trained model object; it is also saved to 'word_vectors_cbow.bin'.
  """
  model = fasttext.train_unsupervised(
      "training_data.txt",
      model="cbow",
      dim=50,
      minCount=1,
      minn=3,
      maxn=6,
  )
  model.save_model("word_vectors_cbow.bin")
  # Report the file that was actually written (the old message named the
  # skip-gram file).
  print("Model trained and saved as 'word_vectors_cbow.bin'")
  return model

train_cbow_model()

Model trainied and saved as 'word_vectors.bin'


<fasttext.FastText._FastText at 0x7bae282b6750>

In [12]:
model = train_skipgram_model()

Model trained and saved as 'word_vectors.bin'


In [14]:
model_cbow = train_cbow_model()

Model trainied and saved as 'word_vectors.bin'


In [17]:
# step 4: getting word vectors
def get_word_vectors(model, words=("king", "kingdom")):
  """Print a five-value preview and the shape of each word's vector.

  Args:
    model: object exposing ``get_word_vector(word)``. The returned vector is
      sliced with ``[:5]`` and its ``.shape`` attribute is printed.
    words: iterable of words to look up. Defaults to "king" and "kingdom",
      the pair this function originally reported.
  """
  for word in words:
    vector = model.get_word_vector(word)
    print(f"vector for '{word}': {vector[:5]}")
    print(f"vector shape: {vector.shape}")

# Report the same two word vectors from each trained model so the skip-gram
# (`model`) and CBOW (`model_cbow`) embeddings can be compared side by side.
print(f"the word vector for king and kingdom by using the skipgram model")
get_word_vectors(model)
print("\n")  # visual gap between the two reports
print(f"the word vector for king and kingdom by using the cbow model")
get_word_vectors(model_cbow)

the word vector for king and kingdom by using the skipgram model
vector for 'king': [-0.00019264 -0.00032311  0.00042608  0.00088622 -0.00164325]
vector shape: (50,)
vector for 'kingdom': [-0.00020329 -0.00094665  0.00032624 -0.00194728 -0.00075324]
vector shape: (50,)


the word vector for king and kingdom by using the cbow model
vector for 'king': [-0.0001826  -0.00033079  0.0004302   0.00088911 -0.00164602]
vector shape: (50,)
vector for 'kingdom': [-0.00019653 -0.00095493  0.00033112 -0.00194544 -0.00075633]
vector shape: (50,)


In [19]:
# step 5: finding similar words
def find_similar_words(model, word, k=3):
  """Print the nearest neighbours of ``word`` according to ``model``.

  Args:
    model: object exposing ``get_nearest_neighbors(word, k)`` that yields
      ``(similarity, neighbour)`` pairs.
    word: query word.
    k: number of neighbours to request (default 3).

  Each neighbour is printed as ``<index>. <neighbour> : <similarity>`` with a
  zero-based index. Any exception raised during the lookup or while printing
  is caught and reported as ``error: <message>`` instead of propagating.
  """
  print(f"\n words similar to '{word}'")
  try:
    neighbours = model.get_nearest_neighbors(word, k)
    position = 0
    for score, token in neighbours:
      print(f"{position}. {token} : {score}")
      position += 1
  except Exception as err:
    print(f"error: {err}")

# Query both models with words that do not appear in the hard-coded training
# sentences ("engineer", "pochinki", "hello", "cats").
# NOTE(review): presumably this is meant to show fastText composing vectors
# for unseen words from character n-grams — confirm the intent.
find_similar_words(model, 'engineer')
print("\n")
find_similar_words(model, "pochinki")
print("\n")
find_similar_words(model_cbow, "hello")
print("\n")
find_similar_words(model_cbow, "cats")


 words similar to 'engineer'
0. mom : 0.3215050995349884
1. smiling : 0.26361843943595886
2. king : 0.15681423246860504



 words similar to 'pochinki'
0. brother : 0.24800024926662445
1. playing : 0.2303360104560852
2. dinner : 0.21316547691822052



 words similar to 'hello'
0. helps : 0.38231831789016724
1. cat : 0.18260356783866882
2. saying : 0.16983535885810852



 words similar to 'cats'
0. eating : 0.2832822799682617
1. smiling : 0.14695900678634644
2. am : 0.14695431292057037


In [22]:
# step 6 : text classification implementation
def create_classification_data():
  """Write a small labelled sentiment corpus to 'movie_reviews.txt'.

  Each of the six reviews becomes one line of the form
  ``__label__<label> <text>``, with both the label and the text lower-cased
  so the two classes are named consistently ("positive" / "negative").
  An existing file is overwritten.
  """
  reviews = [
      ("This movie is amazing and fun", "positive"),
      ("I had really great time in meeting new people", "positive"),
      ("Excellent film with good plot", "positive"),
      ("Terrible movie very boring", "negative"),
      ("Bad acting and poor story", "negative"),
      ("Boring and predictable plot", "negative")
  ]

  with open('movie_reviews.txt', 'w', encoding='utf-8') as f:
    for text, label in reviews:
      # Normalise the label case and avoid a trailing space before the newline.
      f.write(f"__label__{label.lower()} {text.lower()}\n")

  print("Classification data created 'movie_reviews.txt'")

create_classification_data()

Classification data created 'movie_reviews.txt'


In [34]:
# training the text classifier
def train_text_classifier():
    """Train a supervised fastText classifier on 'movie_reviews.txt' and save it.

    Settings passed to ``fasttext.train_supervised``:
        epoch      = 25
        lr         = 0.1  (learning rate: the size of each update applied to
                           the model weights during training; values below
                           1.0 are the usual choice)
        wordNgrams = 2
        verbose    = 2

    Returns:
        The trained classifier; it is also saved to 'text_classifier.bin'.
    """
    classifier = fasttext.train_supervised(
        'movie_reviews.txt',
        epoch=25,
        lr=0.1,
        wordNgrams=2,
        verbose=2
    )

    classifier.save_model('text_classifier.bin')
    print("Classifier trained and saved")
    return classifier

classifier = train_text_classifier()



Classifier trained and saved


In [35]:
# making predictions using text_classifier
def test_classifier(classifier):
    test_sentences = [
        "This is a fantastic movie",
        "Boring and terrible film",
        "Great story and acting",
        "Worst movie I have seen"
    ]

    print("\nClassification Results:")
    print("-" * 40)

    for sentence in test_sentences:
        labels, probabilities = classifier.predict(sentence, k=1)
        predicted_label = labels[0].replace('__label__', '')
        confidence = probabilities[0]
        print(f"Text: '{sentence}'")
        print(f"Prediction: {predicted_label} (confidence: {confidence:.4f})\n")

test_classifier(classifier)


Classification Results:
----------------------------------------


ValueError: Unable to avoid copy while creating an array as requested.
If using `np.array(obj, copy=False)` replace it with `np.asarray(obj)` to allow a copy when needed (no behavior change in NumPy 1.x).
For more details, see https://numpy.org/devdocs/numpy_2_0_migration_guide.html#adapting-to-changes-in-the-copy-keyword.