In [1]:
#!pip install fasttext

In [2]:
#!git clone https://github.com/mkonicek/nlp.git
#%cd nlp  # use %cd here: `!cd` runs in a subshell and does not change the notebook's working directory

In [3]:
# To keep words such as 'तिच्या', 'तिला', 'माझ्या' in the vocabulary,
# comment out this line in load.py:
#     words = remove_stop_words(words)

In [4]:
#!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.mr.300.vec.gz
#!gunzip cc.mr.300.vec.gz

In [5]:
#! head -n 50001 cc.mr.300.vec | tail -n 50000 > /tmp/marathi_partial.vec # partial model: lines 2-50001 of the file (line 1 is skipped)

In [6]:
#!cp cc.mr.300.vec /tmp/marathi_full.vec # full model

In [1]:
from typing import Any, Iterable, List, Optional, Set, Tuple

from load import load_words
import math
import vectors as v
from vectors import Vector
from word import Word

In [2]:
def most_similar(base_vector: Vector, words: List[Word]) -> List[Tuple[float, Word]]:
    """Rank every entry of ``words`` by similarity to ``base_vector``, best match first.

    Each word is scored with
    ``v.cosine_similarity_normalized(base_vector, word.vector)``.

    Args:
        base_vector: The vector every word is compared against.
        words: Candidate words; each must expose a ``vector`` attribute.

    Returns:
        One ``(score, word)`` tuple per input word, ordered by descending
        score. The list is not truncated; callers slice off as many results
        as they need. Words with equal scores keep their input order.
    """
    scored = []
    for candidate in words:
        score = v.cosine_similarity_normalized(base_vector, candidate.vector)
        scored.append((score, candidate))
    # A larger cosine similarity (closer to 1) means a closer match, so the
    # highest scores go first.
    scored.sort(key=lambda pair: pair[0], reverse=True)
    return scored

In [3]:
def print_most_similar(words: List[Word], text: str, count: int = 10) -> None:
    """Print the words most similar to ``text``.

    Looks ``text`` up in ``words`` with ``find_word``. If there is no match,
    prints an "Unknown word" message and returns. Otherwise prints a header
    line followed by a comma-separated list of the closest words, as ranked
    by ``most_similar``.

    Args:
        words: The vocabulary to search and rank.
        text: The query word; it must match an entry's ``text`` exactly.
        count: Maximum number of related words to print (default 10).
    """
    base_word = find_word(text, words)
    # find_word returns None for a miss; test for that explicitly instead of
    # relying on the truthiness of a Word object.
    if base_word is None:
        print(f"Unknown word: {text}")
        return
    print(f"Words related to {base_word.text}:")
    # Lower-case the query once; the comparison below is case-insensitive so
    # that case variants of the query itself are not listed as "related".
    base_lower = base_word.text.lower()
    related = [
        word.text
        for (_, word) in most_similar(base_word.vector, words)
        if word.text.lower() != base_lower
    ]
    print(', '.join(related[:count]))

In [4]:
def read_word() -> str:
    """Prompt on standard input with "Type a word: " and return the line entered."""
    return input("Type a word: ")

In [5]:
def find_word(text: str, words: List[Word]) -> Optional[Word]:
    """Return the first entry of ``words`` whose ``text`` equals ``text``.

    The match is exact (case-sensitive). The list is scanned front to back,
    so the earliest matching entry wins.

    Returns:
        The matching word, or ``None`` when no entry matches.
    """
    for candidate in words:
        if text == candidate.text:
            return candidate
    return None

In [6]:
def closest_analogies(
    left2: str, left1: str, right2: str, words: List[Word]
) -> List[Tuple[float, Word]]:
    """Suggest completions for the analogy "left2 is to left1 as right2 is to ?".

    The target vector is ``left1 - left2 + right2``. The ten words ranked
    closest to it by ``most_similar`` are taken, and then any candidate whose
    text contains one of the three input words (case-insensitively) is
    dropped. Filtering happens after the cut to ten, so fewer than ten
    results, or none, may come back.

    Returns:
        ``(score, word)`` tuples in ``most_similar`` order, or an empty list
        when any of the three input words is not found in ``words``.
    """
    word_left1 = find_word(left1, words)
    word_left2 = find_word(left2, words)
    word_right2 = find_word(right2, words)
    if not (word_left1 and word_left2 and word_right2):
        return []

    offset = v.sub(word_left1.vector, word_left2.vector)
    target = v.add(offset, word_right2.vector)
    top_candidates = most_similar(target, words)[:10]

    # Sometimes the two left vectors are so close that the answer is e.g.
    # "shirt-clothing is like phone-phones". Skip such near-copies of the
    # inputs so the next, possibly more interesting, suggestion surfaces.
    kept = []
    for score, candidate in top_candidates:
        candidate_lower = candidate.text.lower()
        if any(term.lower() in candidate_lower for term in (left1, left2, right2)):
            continue
        kept.append((score, candidate))
    return kept

In [7]:
def print_analogy(left2: str, left1: str, right2: str, words: List[Word]) -> None:
    """Print one line of the form "left2-left1 is like right2-<answer>".

    The answer is the text of the first candidate returned by
    ``closest_analogies``. When that function returns no candidates, a
    literal ``?`` is printed in place of the answer.
    """
    candidates = closest_analogies(left2, left1, right2, words)
    if not candidates:
        print(f"{left2}-{left1} is like {right2}-?")
        return
    # Only the top-ranked candidate is shown; its score is not printed.
    _, w = candidates[0]
    print(f"{left2}-{left1} is like {right2}-{w.text}")

In [14]:
# Load the word list from the partial vector file. Every query cell below
# reads this `words` variable.
words = load_words('/tmp/marathi_partial.vec')

Loading /tmp/marathi_partial.vec...
Loaded 50000 words.
Removed stop words, 27603 remain.
Removed duplicates, 23387 remain.


In [15]:
# Nearest-neighbour lookups: for each query word, print up to ten words from
# `words` ranked closest to it.
print_most_similar(words, 'अपयश')
print_most_similar(words, 'संगीत')

Words related to अपयश:
मिळवण्यात, यश, नैराश्य, उपचारातील, राखण्यात, आलंय, हाताळण्यात, पकडण्यात, करण्यात, घवघवीत
Words related to संगीत:
लोकसंगीत, नाट्यसंगीत, संगीतकार, संगीतावर, गायक, संगीतज्ञ, संगीतविषयक, बहारदार, श्रवणीय, नाटक


In [16]:
# More nearest-neighbour lookups. A query with no exact match in `words`
# prints an unknown-word message instead of a neighbour list.
print_most_similar(words, 'अधिकार')
print_most_similar(words, 'श्रेणी')
print_most_similar(words, 'आनंद')
print_most_similar(words, 'प्रचंड')

Words related to अधिकार:
आधिकार, सर्वाधिकार, हक्क, विशेषाधिकार, अधिकारात, एकाधिकार, अधिकारारूढ, मताधिकार, घटनात्मक, प्रदान
Uknown word: श्रेणी
Words related to आनंद:
आनंदच, समाधान, अवर्णनीय, आनंदोत्सव, आनंदात, मनमुराद, उत्साह, हर्ष, अनुभव, लुटत
Words related to प्रचंड:
खूप, अफाट, अपरिमित, अतोनात, अतिशय, फार, जबरदस्त, प्रमाणात, भयंकर, भरपूर


In [17]:
# Pick the query words by their position in `words` instead of typing them out.
# NOTE(review): the indices look like arbitrary samples, and which words they
# select depends on the order load_words returns — confirm before relying on them.
print_most_similar(words, words[190].text)
print_most_similar(words, words[230].text)
print_most_similar(words, words[330].text)
print_most_similar(words, words[430].text)

Words related to २०१२:
२०११, २०१३, २०१०, २०१४, २००९, २००८, २००७, २०१६, २०१५, जून
Words related to निकाल:
निकालावर, राखून, निकालानंतर, सामने१, निकालात, सामने२, सामने३, ईंडीझ, won, धावफलक
Words related to प्रचंड:
खूप, अफाट, अपरिमित, अतोनात, अतिशय, फार, जबरदस्त, प्रमाणात, भयंकर, भरपूर
Words related to पवार:
शरद, अजित, पवारांवर, उमेदवार२, चव्हाण, जयसिंगराव, उमेदवार१, तळेकर, पाटील, चिखलीकर


In [18]:
# Analogy query: print_analogy(a, b, c, words) prints "a-b is like c-<answer>",
# with "?" as the answer when closest_analogies returns no candidate.
print_analogy('पवार', 'शरद' , 'मोदी', words)

पवार-शरद is like मोदी-?


In [19]:
# Analogy query; a "?" answer means closest_analogies returned no candidate.
# NOTE(review): these are the words the setup comment near the top says need
# remove_stop_words disabled in load.py — confirm they are present in `words`.
print_analogy('तिच्या', 'तिला' , 'माझ्या', words)

तिच्या-तिला is like माझ्या-?


In [20]:
# A batch of analogy queries; each call prints one line, with "?" as the
# answer when closest_analogies returns no candidate.
print_analogy('महाराष्ट्र', 'मुंबई', 'गुजरात', words)
print_analogy('महाराष्ट्र', 'मुंबई', 'बिहार', words)
print_analogy('पक्षी', 'मोर', 'प्राणी', words)
print_analogy('मुंबई', 'पुणे' , 'नागपूर', words)
print_analogy('सुख', 'दु:ख', 'यश', words)

महाराष्ट्र-मुंबई is like गुजरात-अहमदाबाद
महाराष्ट्र-मुंबई is like बिहार-?
पक्षी-मोर is like प्राणी-?
मुंबई-पुणे is like नागपूर-?
सुख-दु:ख is like यश-पण


In [21]:
# One more analogy query, in the same "a-b is like c-?" form as the cells above.
print_analogy('राजा', 'राणी', 'नर', words)

राजा-राणी is like नर-?
