<a href="https://colab.research.google.com/github/sibteali786/500-AI-Machine-learning-Deep-learning-Computer-vision-NLP-Projects-with-code/blob/main/17_fasttext/fasttext_indian_food.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<h3>Download and explore pre-trained models</h3>

##### (1) Explore English Model

In [2]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/73.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.6-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.6-py3-none-any.whl (243 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.3-cp311-cp311-linux_x86_64.whl size=4313505 sha256=8c4e4c761729575c5fc21720f08c8bf5c0f5fd6e5e32e4934ef1c0dd62ab81d2
  Stored in directory: /root/.cache/pip/wheels/65/4f/35/5057db0249224e9ab55a51

In [5]:
import fasttext
import requests
import os
import gzip  # Reads the downloaded .gz archive
import shutil  # Copies the decompressed stream to disk in fixed-size chunks

# Source URL of the pre-trained English vectors and the local file names used for
# the compressed download and the decompressed model.
model_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz"
local_compressed_model_path = "cc.en.300.bin.gz"  # Path for the downloaded .gz file
local_uncompressed_model_path = "cc.en.300.bin"  # Path for the uncompressed .bin file

# Check if the compressed file already exists to avoid re-downloading
if not os.path.exists(local_compressed_model_path):
    print(f"Downloading model from {model_url}...")
    # Stream into a temporary ".part" file and rename it only once the transfer has
    # finished. An interrupted download therefore never leaves a truncated archive at
    # the final path, where the existence check above would mistake it for a complete one.
    partial_download_path = local_compressed_model_path + ".part"
    # timeout=(connect, read) keeps the cell from hanging forever on a stalled connection.
    with requests.get(model_url, stream=True, timeout=(10, 60)) as response:
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        with open(partial_download_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
    os.replace(partial_download_path, local_compressed_model_path)
    print("Download complete.")
else:
    print(f"Compressed model already exists at {local_compressed_model_path}. Skipping download.")

# Check if the uncompressed file exists, if not, decompress the .gz file
if not os.path.exists(local_uncompressed_model_path):
    print(f"Decompressing {local_compressed_model_path}...")
    # Same write-then-rename approach as for the download, so a half-written model
    # file is never picked up by a later run.
    partial_model_path = local_uncompressed_model_path + ".part"
    with gzip.open(local_compressed_model_path, 'rb') as f_in:
        with open(partial_model_path, 'wb') as f_out:
            # Chunked binary copy; iterating the stream "line by line" would split
            # the binary data on arbitrary newline bytes.
            shutil.copyfileobj(f_in, f_out)
    os.replace(partial_model_path, local_uncompressed_model_path)
    print("Decompression complete.")
else:
    print(f"Uncompressed model already exists at {local_uncompressed_model_path}. Skipping decompression.")


# Load the model from the local uncompressed file path
model_en = fasttext.load_model(local_uncompressed_model_path)

Compressed model already exists at cc.en.300.bin.gz. Skipping download.
Decompressing cc.en.300.bin.gz...
Decompression complete.


In [6]:
dir(model_en)

['__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_labels',
 '_words',
 'f',
 'get_analogies',
 'get_dimension',
 'get_input_matrix',
 'get_input_vector',
 'get_label_id',
 'get_labels',
 'get_line',
 'get_meter',
 'get_nearest_neighbors',
 'get_output_matrix',
 'get_sentence_vector',
 'get_subword_id',
 'get_subwords',
 'get_word_id',
 'get_word_vector',
 'get_words',
 'is_quantized',
 'labels',
 'predict',
 'quantize',
 'save_model',
 'set_args',
 'set_matrices',
 'test',
 'test_label',
 'words']

In [7]:
model_en.get_word_vector("good").shape

(300,)

In [8]:
model_en.get_analogies("berlin","germany","france")

[(0.7303731441497803, 'paris'),
 (0.6408537030220032, 'france.'),
 (0.6393311023712158, 'avignon'),
 (0.6316676139831543, 'paris.'),
 (0.5895596742630005, 'montpellier'),
 (0.5884554386138916, 'rennes'),
 (0.5850598812103271, 'grenoble'),
 (0.5832924246788025, 'london'),
 (0.5806092619895935, 'strasbourg'),
 (0.574320375919342, 'Paris.')]

In [9]:
model_en.get_analogies("berlin","germany","india")

[(0.7148876190185547, 'delhi'),
 (0.6974374055862427, 'mumbai'),
 (0.648612916469574, 'jaipur'),
 (0.6349966526031494, 'kolkata'),
 (0.6279922723770142, 'pune'),
 (0.6277596354484558, 'bangalore'),
 (0.6044078469276428, 'hyderabad'),
 (0.6021745800971985, 'noida'),
 (0.6018899083137512, 'bhubaneswar'),
 (0.599077582359314, 'nashik')]

In [10]:
model_en.get_analogies("driving","car","phone")

[(0.610385537147522, 'texting'),
 (0.5203558802604675, 'phone-calling'),
 (0.5153835415840149, 'cellphone'),
 (0.5135326981544495, 'cell-phone'),
 (0.5117910504341125, 'dialing'),
 (0.5087355971336365, 'texing'),
 (0.5079342722892761, 'text-messaging'),
 (0.500900387763977, 'txting'),
 (0.4960441589355469, 'texting.'),
 (0.4951859414577484, 'Texting')]

In [11]:
model_en.get_analogies("driving","car","book")

[(0.5302355885505676, 'reading'),
 (0.517051637172699, 'book.I'),
 (0.5137901306152344, 'book--and'),
 (0.5090512633323669, 'book.That'),
 (0.5005884766578674, 'book--it'),
 (0.49395182728767395, 'book--I'),
 (0.49293914437294006, 're-reading'),
 (0.49156999588012695, 'book.This'),
 (0.49107635021209717, 'reading--and'),
 (0.48960915207862854, 'book--the')]

In [12]:
model_en.get_nearest_neighbors("chutney")
# Author's note: the results are considered weak here, presumably because the
# general-purpose English model has seen little context for this food term.

[(0.8078702092170715, 'chutneys'),
 (0.7138292789459229, 'thokku'),
 (0.701572060585022, 'Chutney'),
 (0.6875490546226501, 'achaar'),
 (0.684525728225708, 'piccalilli'),
 (0.6737173199653625, 'raita'),
 (0.6715506911277771, 'chatni'),
 (0.6610829830169678, 'chutney.'),
 (0.6505922675132751, 'gojju'),
 (0.6398508548736572, 'kasundi')]

In [15]:
model_en.get_nearest_neighbors("halwa")

[(0.8563978672027588, 'kheer'),
 (0.8392286896705627, 'burfi'),
 (0.8193163871765137, 'Halwa'),
 (0.7894062995910645, 'kesari'),
 (0.778471827507019, 'payasam'),
 (0.7706475853919983, 'burfis'),
 (0.7590622901916504, 'laddoo'),
 (0.7504664659500122, 'ladoo'),
 (0.7471016645431519, 'rabdi'),
 (0.7396334409713745, 'laddu')]

In [14]:
model_en.get_nearest_neighbors("saragva", k=3)

[(0.5384978652000427,
  'ReportsTabloidCrimeYakuzaTokyoGinzaIkebukuroKabukichoRoppongiShibuyaShimbashiShinjukuUenoJapanChibaFukuokaKobeKyotoNagoyaOkinawaOsakaSaitamaYokohamaSportsBaseballHorse'),
 (0.5373231768608093,
  'NoidaVaranasiBareillyMathuraAligarhMoradabadSaharanpurBijnorJaunpurGorakhpurMuzaffarnagarSultanpurDehradunHaridwarNainitalRoorkeeGarhwalBardhamanMurshidabadHooghlyMedinipurNorth'),
 (0.5331498980522156,
  'NagarBhiwaniKarnalKurukshetraMahendragarhSirsaPanipatJindJhajjarRewariSolanShimlaKangraHamirpurMandiJammuSrinagarRanchiJamshedpurMangaloreMysoreBelgaumGulbargaTumkurBijapurDavanagereDharwadShimogaUdupiHassanBidarHubliKolarBagalkotKannadaChitradurgaMandyaGadagBellaryRaichurThiruvananthapuramThrissurErnakulamMalappuramKochiKottayamKannurKozhikodeKollamPalakkadPathanamthittaCalicutTrivandrumAlappuzhaKasaragodBhopalIndoreGwaliorJabalpurUjjainSagarChhatarpurPuneNagpurAurangabadNashikKolhapurAhmed')]

##### (2) Explore Hindi Model

In [None]:
# Fetch the pre-trained Hindi vectors into the working directory (skipped when the
# file is already present) instead of relying on a machine-specific absolute path.
import fasttext.util

fasttext.util.download_model('hi', if_exists='ignore')
model_hi = fasttext.load_model('cc.hi.300.bin')



In [None]:
model_hi.get_nearest_neighbors("अच्छा")

[(0.6697985529899597, 'बुरा'),
 (0.6132625341415405, 'अच्छे'),
 (0.608695387840271, 'अच्चा'),
 (0.6058669090270996, 'अच्छाखासा'),
 (0.5848375558853149, 'कीअच्छा'),
 (0.5826330184936523, 'औरअच्छा'),
 (0.5811230540275574, 'हो.अच्छा'),
 (0.5805407762527466, 'हीअच्छा'),
 (0.5795978307723999, 'लगता'),
 (0.5777745246887207, '58अच्छा')]

In [None]:
model_hi.get_nearest_neighbors("गाय")

[(0.6485272645950317, 'गायों'),
 (0.6403631567955017, 'गोमाता'),
 (0.6264104247093201, 'बछड़े'),
 (0.6045769453048706, 'बछडे'),
 (0.6030024886131287, 'दुधारु'),
 (0.5880257487297058, 'भेंस'),
 (0.5822192430496216, 'भैंस'),
 (0.5819100737571716, 'दुधारू'),
 (0.5773836970329285, 'गौमाता'),
 (0.5771614909172058, 'गायें')]

## Custom train word embeddings on Indian food recipes 😋

dataset credits: https://www.kaggle.com/datasets/sooryaprakash12/cleaned-indian-recipes-dataset

In [16]:
import pandas as pd
import requests
import io

# Raw CSV of the cleaned Indian recipes dataset.
url = "https://raw.githubusercontent.com/codebasics/nlp-tutorials/refs/heads/main/17_fasttext/Cleaned_Indian_Food_Dataset.csv"

# Fail loudly on an HTTP error: without this check an error page would be handed
# to the CSV parser as if it were data. The timeout prevents an indefinite hang.
csv_response = requests.get(url, timeout=30)
csv_response.raise_for_status()
response_text = csv_response.text
df = pd.read_csv(io.StringIO(response_text))
print(df.shape)
df.head(3)

(5938, 9)


Unnamed: 0,TranslatedRecipeName,TranslatedIngredients,TotalTimeInMins,Cuisine,TranslatedInstructions,URL,Cleaned-Ingredients,image-url,Ingredient-count
0,Masala Karela Recipe,"1 tablespoon Red Chilli powder,3 tablespoon Gr...",45,Indian,"To begin making the Masala Karela Recipe,de-se...",https://www.archanaskitchen.com/masala-karela-...,"salt,amchur (dry mango powder),karela (bitter ...",https://www.archanaskitchen.com/images/archana...,10
1,Spicy Tomato Rice (Recipe),"2 teaspoon cashew - or peanuts, 1/2 Teaspoon ...",15,South Indian Recipes,"To make tomato puliogere, first cut the tomato...",https://www.archanaskitchen.com/spicy-tomato-r...,"tomato,salt,chickpea lentils,green chilli,rice...",https://www.archanaskitchen.com/images/archana...,12
2,Ragi Semiya Upma Recipe - Ragi Millet Vermicel...,"1 Onion - sliced,1 teaspoon White Urad Dal (Sp...",50,South Indian Recipes,"To begin making the Ragi Vermicelli Recipe, fi...",https://www.archanaskitchen.com/ragi-vermicell...,"salt,rice vermicelli noodles (thin),asafoetida...",https://www.archanaskitchen.com/images/archana...,12


In [18]:
df.TranslatedInstructions[0]

'To begin making the Masala Karela Recipe,de-seed the karela and slice.\nDo not remove the skin as the skin has all the nutrients.\nAdd the karela to the pressure cooker with 3 tablespoon of water, salt and turmeric powder and pressure cook for three whistles.\nRelease the pressure immediately and open the lids.\nKeep aside.Heat oil in a heavy bottomed pan or a kadhai.\nAdd cumin seeds and let it sizzle.Once the cumin seeds have sizzled, add onions and saute them till it turns golden brown in color.Add the karela, red chilli powder, amchur powder, coriander powder and besan.\nStir to combine the masalas into the karela.Drizzle a little extra oil on the top and mix again.\nCover the pan and simmer Masala Karela stirring occasionally until everything comes together well.\nTurn off the heat.Transfer Masala Karela into a serving bowl and serve.Serve Masala Karela along with Panchmel Dal and Phulka for a weekday meal with your family.\n'

In [19]:
import re

text = 'To begin making the Masala Karela Recipe,de-seed the karela and slice.\nDo not remove the skin as the skin has all the nutrients.\nAdd the karela to the pressure cooker with 3 tablespoon of water, salt and turmeric powder and pressure cook for three whistles.\nRelease the pressure immediately and open the lids.\nKeep aside.Heat oil in a heavy bottomed pan or a kadhai.\nAdd cumin seeds and let it sizzle.Once the cumin seeds have sizzled, add onions and saute them till it turns golden brown in color.Add the karela, red chilli powder, amchur powder, coriander powder and besan.\nStir to combine the masalas into the karela.Drizzle a little extra oil on the top and mix again.\nCover the pan and simmer Masala Karela stirring occasionally until everything comes together well.\nTurn off the heat.Transfer Masala Karela into a serving bowl and serve.Serve Masala Karela along with Panchmel Dal and Phulka for a weekday meal with your family.\n'

# Replace every character that is neither a word character nor whitespace with a space.
# No flags are passed: re.MULTILINE only changes how ^ and $ match, and this pattern has neither.
re.sub(r"[^\w\s]", " ", text)

'To begin making the Masala Karela Recipe de seed the karela and slice \nDo not remove the skin as the skin has all the nutrients \nAdd the karela to the pressure cooker with 3 tablespoon of water  salt and turmeric powder and pressure cook for three whistles \nRelease the pressure immediately and open the lids \nKeep aside Heat oil in a heavy bottomed pan or a kadhai \nAdd cumin seeds and let it sizzle Once the cumin seeds have sizzled  add onions and saute them till it turns golden brown in color Add the karela  red chilli powder  amchur powder  coriander powder and besan \nStir to combine the masalas into the karela Drizzle a little extra oil on the top and mix again \nCover the pan and simmer Masala Karela stirring occasionally until everything comes together well \nTurn off the heat Transfer Masala Karela into a serving bowl and serve Serve Masala Karela along with Panchmel Dal and Phulka for a weekday meal with your family \n'

In [20]:
def preprocess(text):
    """Normalize a text into a single lowercase, space-separated line.

    Every character other than word characters, whitespace and apostrophes is
    replaced by a space, runs of whitespace are collapsed into one space, and
    the result is stripped and lowercased.

    Args:
        text: The raw text to clean (a ``str``).

    Returns:
        The cleaned text.
    """
    # Apostrophes are kept so that tokens such as "dosa's" stay intact.
    text = re.sub(r'[^\w\s\']', ' ', text)
    # Collapse all whitespace, including tabs and the "\r" of Windows line
    # endings, not only spaces and "\n", so the result is always one clean line.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

In [21]:
text = 'To begin making the Masala Karela Recipe,de-seed the karela and slice.\nDo not remove the skin as the skin has all the nutrients.\nAdd the karela to the pressure cooker with 3 tablespoon of water, salt and turmeric powder and pressure cook for three whistles.\nRelease the pressure immediately and open the lids.\nKeep aside.Heat oil in a heavy bottomed pan or a kadhai.\nAdd cumin seeds and let it sizzle.Once the cumin seeds have sizzled, add onions and saute them till it turns golden brown in color.Add the karela, red chilli powder, amchur powder, coriander powder and besan.\nStir to combine the masalas into the karela.Drizzle a little extra oil on the top and mix again.\nCover the pan and simmer Masala Karela stirring occasionally until everything comes together well.\nTurn off the heat.Transfer Masala Karela into a serving bowl and serve.Serve Masala Karela along with Panchmel Dal and Phulka for a weekday meal with your family.\n'

preprocess(text)

'to begin making the masala karela recipe de seed the karela and slice do not remove the skin as the skin has all the nutrients add the karela to the pressure cooker with 3 tablespoon of water salt and turmeric powder and pressure cook for three whistles release the pressure immediately and open the lids keep aside heat oil in a heavy bottomed pan or a kadhai add cumin seeds and let it sizzle once the cumin seeds have sizzled add onions and saute them till it turns golden brown in color add the karela red chilli powder amchur powder coriander powder and besan stir to combine the masalas into the karela drizzle a little extra oil on the top and mix again cover the pan and simmer masala karela stirring occasionally until everything comes together well turn off the heat transfer masala karela into a serving bowl and serve serve masala karela along with panchmel dal and phulka for a weekday meal with your family'

In [22]:
# Run every recipe's instructions through preprocess and store the result back in the same column.
df["TranslatedInstructions"] = df["TranslatedInstructions"].map(preprocess)

In [23]:
df.TranslatedInstructions[0]

'to begin making the masala karela recipe de seed the karela and slice do not remove the skin as the skin has all the nutrients add the karela to the pressure cooker with 3 tablespoon of water salt and turmeric powder and pressure cook for three whistles release the pressure immediately and open the lids keep aside heat oil in a heavy bottomed pan or a kadhai add cumin seeds and let it sizzle once the cumin seeds have sizzled add onions and saute them till it turns golden brown in color add the karela red chilli powder amchur powder coriander powder and besan stir to combine the masalas into the karela drizzle a little extra oil on the top and mix again cover the pan and simmer masala karela stirring occasionally until everything comes together well turn off the heat transfer masala karela into a serving bowl and serve serve masala karela along with panchmel dal and phulka for a weekday meal with your family'

In [24]:
# Write only the cleaned instructions column, without header or index, to the text
# file used as the training corpus. header=False is the documented way to omit the
# header row (the parameter expects a bool or a list of column names).
df.to_csv("food_receipes.txt", columns=["TranslatedInstructions"], header=False, index=False)

In [25]:
import fasttext

# Train unsupervised word embeddings on the recipe corpus in "food_receipes.txt".
# No hyperparameters are passed, so the library defaults apply.
model = fasttext.train_unsupervised("food_receipes.txt")

In [26]:
model.get_nearest_neighbors("paneer")

[(0.7046746611595154, 'tikka'),
 (0.6630706191062927, 'tikkas'),
 (0.6622522473335266, 'tandoori'),
 (0.6518504619598389, 'bhurji'),
 (0.6466901302337646, 'reshmi'),
 (0.6369193196296692, 'nawabi'),
 (0.6190375685691833, 'makhanwala'),
 (0.6179590821266174, 'hariyali'),
 (0.6143130660057068, 'makhani'),
 (0.5987952351570129, 'malai')]

In [27]:
model.get_nearest_neighbors("chutney")

[(0.9275704622268677, 'chutneys'),
 (0.7463748455047607, 'dhaniya'),
 (0.7132056951522827, 'imli'),
 (0.7042087316513062, 'khajur'),
 (0.6639349460601807, 'kanchipuram'),
 (0.6590506434440613, 'pudina'),
 (0.6549491286277771, 'gothsu'),
 (0.6544407606124878, 'chammanthi'),
 (0.6525646448135376, 'south'),
 (0.6511055827140808, 'madurai')]

In [28]:
model.get_nearest_neighbors("halwa")

[(0.7467402815818787, 'khoya'),
 (0.7186369299888611, 'burfi'),
 (0.7104381322860718, 'rabri'),
 (0.6857462525367737, 'mawa'),
 (0.6752265095710754, 'badam'),
 (0.672613799571991, 'sheera'),
 (0.6673717498779297, 'kheer'),
 (0.6628114581108093, 'mohan'),
 (0.6588674187660217, 'basundi'),
 (0.6500763297080994, 'doodh')]

In [29]:
model.get_nearest_neighbors("dosa")

[(0.8473756909370422, 'dosai'),
 (0.8177902698516846, 'dosas'),
 (0.7941131591796875, "dosa's"),
 (0.7563254237174988, 'uthappam'),
 (0.7445687055587769, 'uttapam'),
 (0.7228896021842957, 'kanchipuram'),
 (0.7157869338989258, 'dose'),
 (0.7090448141098022, 'neer'),
 (0.7085314393043518, 'pesarattu'),
 (0.7060192227363586, 'chembaruthi')]

In [30]:
model.get_nearest_neighbors("moong")

[(0.7587465047836304, 'sprouted'),
 (0.7246676683425903, 'moth'),
 (0.7035178542137146, 'horse'),
 (0.6858246326446533, 'mooga'),
 (0.6832014918327332, 'horsegram'),
 (0.6807715892791748, 'dal'),
 (0.6562758684158325, 'moolangi'),
 (0.6540908217430115, 'sprout'),
 (0.6464773416519165, 'moongphali'),
 (0.6430541276931763, 'tuvar')]

In [31]:
model.get_word_vector("dosa").shape

(100,)

In [32]:
model.get_nearest_neighbors("saragva")

[(0.8921212553977966, 'fansi'),
 (0.8687644004821777, 'bhoplya'),
 (0.8530213832855225, 'phanu'),
 (0.8452553749084473, 'agathi'),
 (0.8449345827102661, 'saagu'),
 (0.8402233719825745, 'sookhi'),
 (0.8388527631759644, 'vaangi'),
 (0.8383015394210815, 'bhuga'),
 (0.8374607563018799, 'sukhi'),
 (0.8336049318313599, 'phalguni')]

See https://fasttext.cc/docs/en/unsupervised-tutorial.html for details on the parameters of the `train_unsupervised` function. Depending on the need, the following parameters can be used for fine-tuning:

1. epoch = Number of times training loops over the same dataset. Default value is 5.
2. lr = Learning rate
3. thread = Number of threads for the training