In [None]:
import fasttext

In [2]:
!wget https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz

--2022-06-28 14:51:45--  https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 172.67.9.4, 104.22.75.142, 104.22.74.142
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|172.67.9.4|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 457609 (447K) [application/x-tar]
Saving to: ‘cooking.stackexchange.tar.gz’


2022-06-28 14:51:45 (3.52 MB/s) - ‘cooking.stackexchange.tar.gz’ saved [457609/457609]



In [3]:
# Unpack the gzip-compressed archive into the current directory; the v flag
# lists every extracted file.
!tar xvzf cooking.stackexchange.tar.gz

x cooking.stackexchange.id
x cooking.stackexchange.txt
x readme.txt


In [4]:
!head cooking.stackexchange.txt
# each line is a single example in the training data, with the labels – there can be more than 1 label – in the form __label__X, followed by a line of text at the end. For example, the first line has the content “How much does potato starch affect a cheese sauce recipe?” and is labeled as both “sauce” and “cheese”.

__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?
__label__restaurant Michelin Three Star Restaurant; but if the chef is not there
__label__knife-skills __label__dicing Without knife skills, how can I quickly and accurately dice vegetables?
__label__storage-method __label__equipment __label__bread What's the purpose of a bread box?
__label__baking __label__food-safety __label__substitutions __label__peanuts how to seperate peanut oil from roasted peanuts at home?
__label__chocolate American equivalent for British chocolate terms
__label__baking __label__oven __label__convection Fan bake vs bake
__label__sauce __label__storage-lifetime __label__acidity __label__mayonnaise Regulation and balancing of readymade packed mayonnaise and other sauces


In [5]:
# fastText training
!head -12404 cooking.stackexchange.txt > cooking.train
!tail -3000 cooking.stackexchange.txt > cooking.test

In [17]:
import fasttext

# Train a supervised text classifier on cooking.train; no hyperparameters are
# passed, so fastText uses its defaults.
model = fasttext.train_supervised(input="cooking.train")

# Sanity check on a single question: the result is a (labels, scores) pair
# holding the top predicted label and its score.
model.predict("easy recipe for sourdough bread ?")

Read 0M words
Number of words:  14543
Number of labels: 735
Progress: 100.0% words/sec/thread:   60903 lr:  0.000000 avg.loss:  9.907210 ETA:   0h 0m 0s


(('__label__baking',), array([0.12350871]))

In [18]:
## More predictions
# Run a handful of single-question predictions and print each result in turn.
for question in (
    "easy recipe for trouble?",
    "easy recipe for baking soda?",
    "easy recipe for catfish?",
    "easy recipe for frying a fish?",
):
    print(model.predict(question))

(('__label__substitutions',), array([0.1686313]))
(('__label__baking',), array([0.10162595]))
(('__label__substitutions',), array([0.09758869]))
(('__label__baking',), array([0.07138634]))


In [19]:
# Evaluate the current classifier on the held-out test file
model.test("cooking.test")

# The result is a tuple (N, P@1, R@1): number of test examples, precision at 1, and recall at 1

(3000, 0.14133333333333334, 0.06112152227187545)

In [20]:
# Evaluate again, this time asking for the top 5 predicted labels per example
model.test("cooking.test", 5)

# Recall goes up because every example now receives more predicted labels, but
# precision drops: the second through fifth predictions are less likely to be
# correct than the first one.

(3000, 0.0676, 0.14617269713132477)

## Model Tuning

In [22]:
## Tokenization / Preprocessing
# Normalize the raw text before training: sed puts spaces around punctuation
# such as . ! ? , ' / ( ) and tr converts everything to lowercase, which
# results in a different tokenization.

!cat cooking.stackexchange.txt | sed -e "s/\([.\!?,'/()]\)/ \1 /g" | tr "[:upper:]" "[:lower:]" > cooking.preprocessed.txt
# Rebuild the train/test split from the preprocessed file. This overwrites the
# cooking.train and cooking.test files written from the raw text earlier.
!head -12404 cooking.preprocessed.txt > cooking.train
!tail -3000 cooking.preprocessed.txt > cooking.test

In [24]:
# Inspect the first ten lines of the training file to check the preprocessing result
!head -n 10 cooking.train

__label__sauce __label__cheese how much does potato starch affect a cheese sauce recipe ? 
__label__food-safety __label__acidity dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove how do i cover up the white spots on my cast iron stove ? 
__label__restaurant michelin three star restaurant; but if the chef is not there
__label__knife-skills __label__dicing without knife skills ,  how can i quickly and accurately dice vegetables ? 
__label__storage-method __label__equipment __label__bread what ' s the purpose of a bread box ? 
__label__baking __label__food-safety __label__substitutions __label__peanuts how to seperate peanut oil from roasted peanuts at home ? 
__label__chocolate american equivalent for british chocolate terms
__label__baking __label__oven __label__convection fan bake vs bake
__label__sauce __label__storage-lifetime __label__acidity __label__mayonnaise regulation and balancing of readymade packed mayonnaise and other s

In [25]:
# Retrain with default settings on the current cooking.train and evaluate on cooking.test
model = fasttext.train_supervised(input="cooking.train")
model.test('cooking.test')

# We at least see some improvement in precision and recall (in the recorded
# run, P@1 rose from about 0.141 to 0.164 and R@1 from about 0.061 to 0.071).

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   75870 lr:  0.000000 avg.loss:  9.810714 ETA:   0h 0m 0s


(3000, 0.16433333333333333, 0.07106818509442121)

In [26]:
# By default, fastText performs 5 epochs. Since we don’t have that much training data, let’s try increasing the number of epochs to 25:
model = fasttext.train_supervised(input="cooking.train", epoch=25)
model.test("cooking.test")

# The model is not perfect yet, but a precision above 50% is at least starting to look useful.

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   77361 lr:  0.000000 avg.loss:  7.184034 ETA:   0h 0m 0s


(3000, 0.52, 0.22488107250973044)

In [27]:
# Push the number of epochs further, from 25 to 100, and evaluate again
model = fasttext.train_supervised(input="cooking.train", epoch=100)
model.test("cooking.test")

# P@1 improves only slightly (about 0.52 to 0.55 in the recorded run): we are
# clearly facing diminishing returns. Increasing the number of epochs can only get you so far.

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   68347 lr:  0.000000 avg.loss:  3.202423 ETA:   0h 0m 0s  0h 0m 3s


(3000, 0.5493333333333333, 0.2375666714718178)

In [28]:
# Go back to 25 epochs and raise the learning rate to 1.0, then evaluate
model = fasttext.train_supervised(input="cooking.train", epoch=25, lr=1.0)
model.test("cooking.test")

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   80304 lr:  0.000000 avg.loss:  4.499678 ETA:   0h 0m 0s


(3000, 0.5816666666666667, 0.2515496612368459)

In [29]:
# For comparison, keep 25 epochs but use a much lower learning rate of 0.01
model = fasttext.train_supervised(input="cooking.train", epoch=25, lr=0.01)
model.test("cooking.test")

# Clearly a higher learning rate is better here – not surprising, given that we have a small amount of data.

Read 0M words
Number of words:  8952
Number of labels: 735
Progress: 100.0% words/sec/thread:   82303 lr:  0.000000 avg.loss: 10.715544 ETA:   0h 0m 0s


(3000, 0.096, 0.04151650569410408)

In [None]:
# Account for word bigrams by passing wordNgrams=2 (the keyword-argument form
# of the command-line option “-wordNgrams 2”).
# Retrain with 25 epochs, bigrams, and a learning rate of 1.0, then evaluate again.
# NOTE(review): this cell has no execution count or recorded output, so it was
# apparently not run in the saved notebook — confirm before relying on it.
model = fasttext.train_supervised(input="cooking.train", lr=1.0, epoch=25, wordNgrams=2)
model.test("cooking.test")

In [30]:
# Print the built-in help for the fasttext.FastText module (its functions and their docstrings)
help(fasttext.FastText)

Help on module fasttext.FastText in fasttext:

NAME
    fasttext.FastText

DESCRIPTION
    # Copyright (c) 2017-present, Facebook, Inc.
    # All rights reserved.
    #
    # This source code is licensed under the MIT license found in the
    # LICENSE file in the root directory of this source tree.

FUNCTIONS
    cbow(*kargs, **kwargs)
    
    eprint(*args, **kwargs)
    
    load_model(path)
        Load a model given a filepath and return a model object.
    
    read_args(arg_list, arg_dict, arg_names, default_values)
    
    skipgram(*kargs, **kwargs)
    
    supervised(*kargs, **kwargs)
    
    tokenize(text)
        Given a string of text, tokenize it and return a list of tokens
    
    train_supervised(*kargs, **kwargs)
        Train a supervised model and return a model object.
        
        input must be a filepath. The input text does not need to be tokenized
        as per the tokenize function, but it must be preprocessed and encoded
        as UTF-8. You might wan

In [31]:
# Repeat the single-question sanity check with the current `model`.
# NOTE(review): `model` is whichever classifier was trained most recently;
# judging by the execution counts that is presumably the lr=0.01 run — confirm.
model.predict("easy recipe for sourdough bread ?")

(('__label__baking',), array([0.08652446]))

## Documentation: https://fasttext.cc/docs/en/python-module.html

## NLTK

In [32]:
import nltk

# Fetch the resources needed below: the POS tagger model and the punkt tokenizer.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Named `sentence` rather than `str` so the builtin `str` type is not shadowed
# for the rest of the kernel session.
sentence = "Cats eat raw fish."
tokens = nltk.word_tokenize(sentence)

# Tag every token with its part of speech; the result is a list of
# (token, tag) pairs.
nltk.pos_tag(tokens)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sengopal/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package punkt to /Users/sengopal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('Cats', 'NNS'), ('eat', 'VBP'), ('raw', 'JJ'), ('fish', 'NN'), ('.', '.')]

### POS
> “NNS” denotes a plural noun, “VBP” denotes a present-tense verb that is not third-person singular, “JJ” denotes an adjective, and “NN” denotes a singular noun.
>
> ![](penn_treebank_pos.png)

In [33]:
# Fetch the extra resources the named-entity chunker needs.
nltk.download('words')
nltk.download('maxent_ne_chunker')

# Named `sentence` rather than `str` so the builtin `str` type is not shadowed.
sentence = "Barack Obama served as the 44th President of the United States."
tokens = nltk.word_tokenize(sentence)

# Tokenize -> POS-tag -> chunk named entities. The result is an nltk Tree whose
# subtrees carry entity types. Its rich notebook display needs the optional
# `svgling` package; without it only the plain repr is shown.
nltk.ne_chunk(nltk.pos_tag(tokens))

[nltk_data] Downloading package words to /Users/sengopal/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/sengopal/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


ModuleNotFoundError: No module named 'svgling'

Tree('S', [Tree('PERSON', [('Barack', 'NNP')]), Tree('PERSON', [('Obama', 'NNP')]), ('served', 'VBD'), ('as', 'IN'), ('the', 'DT'), ('44th', 'CD'), ('President', 'NNP'), ('of', 'IN'), ('the', 'DT'), Tree('GPE', [('United', 'NNP'), ('States', 'NNPS')]), ('.', '.')])

In [34]:
# Named `sentence` rather than `str` so the builtin `str` type is not shadowed.
sentence = "Mr. Ed is a zebra."
tokens = nltk.word_tokenize(sentence)

# Named-entity chunking on a sentence containing an abbreviation ("Mr.").
nltk.ne_chunk(nltk.pos_tag(tokens))

ModuleNotFoundError: No module named 'svgling'

Tree('S', [Tree('PERSON', [('Mr.', 'NNP'), ('Ed', 'NNP')]), ('is', 'VBZ'), ('a', 'DT'), ('zebra', 'NN'), ('.', '.')])

In [35]:
# Named `sentence` rather than `str` so the builtin `str` type is not shadowed.
sentence = "Pi is not 3.14?"
tokens = nltk.word_tokenize(sentence)

# Named-entity chunking on a sentence with a decimal number and a question mark.
nltk.ne_chunk(nltk.pos_tag(tokens))

ModuleNotFoundError: No module named 'svgling'

Tree('S', [Tree('GPE', [('Pi', 'NNP')]), ('is', 'VBZ'), ('not', 'RB'), ('3.14', 'CD'), ('?', '.')])

## Synonyms
https://fasttext.cc/docs/en/unsupervised-tutorial.html

In [38]:
import os

# Skipgram model :
#  fasttext skipgram -input /workspace/search_with_machine_learning_course/data/wiki_sample.txt -output wiki -maxn 0
# The corpus location can be overridden with the WIKI_SAMPLE_PATH environment
# variable, so the notebook also runs on machines where the original author's
# absolute path does not exist. When the variable is unset, the original path
# is used unchanged.
wiki_sample_path = os.environ.get(
    "WIKI_SAMPLE_PATH",
    "/Users/sengopal/build/my-git/search_with_machine_learning_course/data/wiki_sample.txt",
)

# NOTE: this rebinds `model`, replacing the supervised classifier trained
# earlier with an unsupervised skipgram word-embedding model.
model = fasttext.train_unsupervised(wiki_sample_path, model='skipgram', maxn=0)
model.save_model("wiki.bin")

Read 0M words
Number of words:  9871
Number of labels: 0
Progress: 100.0% words/sec/thread:  180559 lr:  0.000000 avg.loss:  2.461266 ETA:   0h 0m 0s


In [40]:
# Look up the embedding vector the current model holds for the word "politics"
model.get_word_vector("politics")

array([ 4.51803803e-01,  4.05361801e-02, -1.86746687e-01,  2.75188148e-01,
        8.90553668e-02,  1.13700092e-01, -3.35877031e-01,  1.13159738e-01,
        2.17511162e-01,  7.48647423e-03, -7.51596913e-02,  3.44507862e-04,
       -4.31794047e-01, -3.18042487e-02, -1.11480244e-01,  1.90409020e-01,
        1.43702447e-01,  2.06547499e-01,  1.85254565e-03, -2.23737568e-01,
        2.12700382e-01,  5.51540792e-01, -1.80349410e-01,  3.93031649e-02,
        7.40892068e-02,  4.47227918e-02,  1.59627840e-01, -1.38234645e-02,
       -3.65492702e-01, -1.67179585e-01,  1.95508525e-01,  7.33919740e-02,
       -3.78228799e-02, -4.64942195e-02, -6.63160160e-02,  2.52171308e-01,
        1.75371543e-01, -1.05553761e-01, -2.09394693e-01, -7.66221434e-02,
       -2.52725959e-01, -7.97243137e-03, -4.71099764e-02,  3.01281065e-01,
        4.85131025e-01,  2.80443817e-01,  7.68438503e-02,  6.80727735e-02,
        9.35588703e-02, -9.07346383e-02, -1.76610559e-01, -3.66963521e-02,
       -1.73832610e-01,  

In [41]:
# List the nearest neighbours of "politics" in the embedding space as (score, word) pairs
model.get_nearest_neighbors('politics')

[(0.8555908203125, 'privy'),
 (0.8484763503074646, 'governors'),
 (0.8414998054504395, 'documents'),
 (0.8396217226982117, 'politicians'),
 (0.8302420377731323, 'ministers'),
 (0.8212708234786987, 'senate'),
 (0.8194386959075928, 'governed'),
 (0.819216251373291, 'parliamentary'),
 (0.8184235692024231, 'register'),
 (0.8164151906967163, 'commonwealth')]

In [42]:
# List the nearest neighbours of "linux" as (score, word) pairs.
# NOTE(review): in the recorded output, unrelated words such as 'dna' rank
# almost as high as 'unix' — presumably an artifact of the small sample
# corpus; verify before drawing conclusions.
model.get_nearest_neighbors('linux')

[(0.9506422281265259, 'unix'),
 (0.94535893201828, 'dna'),
 (0.9450774192810059, 'kernel'),
 (0.943400502204895, 'implementation'),
 (0.942667543888092, 'files'),
 (0.9385414719581604, 'functional'),
 (0.93833988904953, 'microsoft'),
 (0.937720537185669, 'interface'),
 (0.9318779110908508, 'binding'),
 (0.9311510920524597, 'inputs')]