## Building fastText based models


### Importing the libraries and data


In [1]:
from gensim.models import FastText
from gensim.test.utils import common_texts

In [2]:
common_texts


[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

## Building a basic model


In [3]:
model = FastText(vector_size=5, window=3, min_count=1)


In [4]:
model.build_vocab(common_texts)
model.train(common_texts, total_examples=len(common_texts), epochs=10)

(36, 290)

## Check the vocabulary


In [5]:
model.wv.key_to_index 


{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11}

In [6]:
model.wv['human']


array([-0.03166137,  0.02326731,  0.01241683,  0.00036033,  0.02841445],
      dtype=float32)

In [7]:
model.wv.most_similar(positive=['computer', 'interface'], negative=['human'])


[('user', 0.7968785166740417),
 ('system', 0.17462188005447388),
 ('response', 0.104334257543087),
 ('survey', 0.009604745544493198),
 ('trees', -0.07640466839075089),
 ('time', -0.1330047994852066),
 ('minors', -0.13927175104618073),
 ('eps', -0.24093686044216156),
 ('graph', -0.291752427816391)]

## min_n and max_n parameters


In [8]:
model = FastText(vector_size=5, window=3, min_count=1, min_n=1, max_n=5)


In [9]:
model.build_vocab(common_texts)
model.train(common_texts, total_examples=len(common_texts), epochs=10)

(36, 290)

## Let's try and fetch a representation for an out of vocabulary word


In [10]:
model.wv['rubber']


array([ 0.01833104, -0.02146881,  0.00600105, -0.03445042, -0.0165866 ],
      dtype=float32)

### Checkout the most similar feature using an Out of Vocab term


In [11]:
model.wv.most_similar(positive=['computer', 'human'], negative=['rubber'])


[('trees', 0.795038104057312),
 ('eps', 0.7793108820915222),
 ('minors', 0.2440604418516159),
 ('time', 0.1623203307390213),
 ('user', -0.04820729047060013),
 ('graph', -0.1567206084728241),
 ('survey', -0.20417772233486176),
 ('interface', -0.39214828610420227),
 ('response', -0.6897355914115906),
 ('system', -0.8435077667236328)]

### Extending the built model to incorporate words from new sentences


In [12]:
sentences_to_be_added = [["I", "am", "learning", "Natural", "Language", "Processing"],
                         ["Natural", "Language", "Processing", "is", "cool"]]


In [13]:
model.build_vocab(sentences_to_be_added, update=True)
model.train(common_texts, total_examples=len(sentences_to_be_added), epochs=10)

(43, 290)

In [14]:
model.wv.key_to_index 


{'system': 0,
 'graph': 1,
 'trees': 2,
 'user': 3,
 'minors': 4,
 'eps': 5,
 'time': 6,
 'response': 7,
 'survey': 8,
 'computer': 9,
 'interface': 10,
 'human': 11,
 'I': 12,
 'am': 13,
 'learning': 14,
 'Natural': 15,
 'Language': 16,
 'Processing': 17,
 'is': 18,
 'cool': 19}

## Summary

In this assignment we learn to create a basic fasttext model from scratch. This module allows training word embeddings from a training corpus with the additional ability to obtain word vectors for out-of-vocabulary words.
