In [1]:
import numpy as np
import pandas as pd
import spacy
from bidi.algorithm import get_display

from googletrans import Translator
# Create a googletrans client at notebook level; the translation cells further down
# instantiate their own Translator again before using it.
translator = Translator()

# Test word vectors

In [3]:
# Directory that spacy.load() reads the Hebrew vector pipeline from.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
vec_path = '/Users/pabloherrero/Documents/ManHatTan/mht/data/processed/he_vectors'
# Load the pipeline from disk; the resulting `nlp` object is what the vector cells call.
nlp = spacy.load(vec_path)

In [4]:
# Sanity check: push a short Hebrew sentence through the pipeline and list, for each
# token, its text, whether it reports a vector, and the norm of that vector.
doc = nlp("זהו טקסט בעברית.")
for token in doc:
    token_report = (token.text, token.has_vector, token.vector_norm)
    print(*token_report)

זהו True 1.0681664
טקסט True 0.8047567
בעברית True 0.40524793
. True 1.5317025


In [50]:
# Second probe sentence, same per-token report (text, has_vector flag, vector norm).
# NOTE(review): in the recorded output one token reports has_vector=True with a norm
# of 0.0, so has_vector alone may not guarantee a usable vector — confirm.
doc = nlp("חברה שלי פיפו לא אדום.")
for token in doc:
    token_report = (token.text, token.has_vector, token.vector_norm)
    print(*token_report)

חברה True 0.77420473
שלי True 1.0418928
פיפו True 0.0
לא True 1.9327171
אדום True 0.83313257
. True 1.5317025


# Search for similar words

## Arbitrary word

In [5]:
# Nearest neighbours of a single word, straight from the vector table:
# string -> key, key -> vector row, then the n=5 closest entries.
adom_id = nlp.vocab.strings['אדום']
adom_vec = nlp.vocab.vectors[adom_id]
query_batch = np.asarray([adom_vec])  # wrap the single vector as a one-row batch
most_similar_words = nlp.vocab.vectors.most_similar(query_batch, n=5)
neighbour_keys = most_similar_words[0][0]  # first result array, entry for the only query
words = []
for key in neighbour_keys:
    # Map each returned key back to its string.
    words.append(nlp.vocab.strings[key])
words


['אדום', 'כחול', 'צהוב', 'ירוק', 'שחור']

In [6]:
# Same nearest-neighbour lookup for a verb form, this time asking for n=8 entries.
# NOTE(review): the `adom_*` names are carried over from the previous cell and no
# longer describe the word being queried.
adom_id = nlp.vocab.strings['אמרתי']
adom_vec = nlp.vocab.vectors[adom_id]
query_batch = np.asarray([adom_vec])  # wrap the single vector as a one-row batch
most_similar_words = nlp.vocab.vectors.most_similar(query_batch, n=8)
neighbour_keys = most_similar_words[0][0]  # first result array, entry for the only query
words = list(map(lambda key: nlp.vocab.strings[key], neighbour_keys))
words

['אמרתי', 'ואמרתי', 'אגיד', 'חשבתי', 'ידעתי', 'שאמרתי', 'אומר', 'אמר']

## Test on LIP

In [7]:
# Read the LIP word table and index it by the learning-language word, keeping
# `word_ll` available as a regular column as well (drop=False).
lippath = '/Users/pabloherrero/Documents/ManHatTan/mht/data/processed/LIPSTICK/hebrew_db.lip'
lip = pd.read_csv(lippath).set_index('word_ll', drop=False)

In [8]:
vec_path = '/Users/pabloherrero/Documents/ManHatTan/mht/data/processed/he_vectors'
# Reload the spaCy pipeline that carries the Hebrew vectors.
nlp = spacy.load(vec_path)

# Word whose neighbours we want; only the first token of the parsed text is used.
target_word = "עכביש"
target_token = nlp(target_word)[0]

# Candidate pool: every learning-language word stored in the LIP table.
other_words = lip.word_ll.values.tolist()

# Score each candidate against the target; a pair where either side lacks a vector gets 0.0.
similarities = []
for word in other_words:
    token = nlp(word)[0]
    comparable = target_token.has_vector and token.has_vector
    sim = target_token.similarity(token) if comparable else 0.0
    similarities.append((word, sim))

# Highest similarity first.
similarities = sorted(similarities, key=lambda pair: pair[1], reverse=True)

# Show the ten best matches together with the `word_ul` entry stored for each word.
for word, score in similarities[:10]:
    print(f"{word}: {score:.3f}, {lip.loc[word, 'word_ul']}")


עכביש: 1.000, spider
קוף: 0.344, monkey
מפלצת: 0.316, monster
נחש: 0.286, snake
יצור: 0.260, creature
מכשפה: 0.259, witch
קוסם: 0.257, magician
מיקרופון: 0.254, microphone
טיפש: 0.251, Stupid
שריון: 0.251, armor


## Write word vectors

In [9]:
# Collect the vector of every LIP entry the pipeline has a vector for.
# `vect_word[i]` is the word whose vector is stored at `vectors[i]`.
all_entries = lip.word_ll.values.tolist()
vect_word = []
vectors = []
for word in all_entries:
    # Non-string values (e.g. NaN from missing CSV fields) cannot be parsed by nlp().
    if not isinstance(word, str):
        continue
    entry_doc = nlp(word)
    # An empty string produces an empty Doc; indexing [0] would raise IndexError.
    if len(entry_doc) == 0:
        continue
    # Only the first token is used, so a multi-token entry is represented by the
    # vector of its first token.
    token = entry_doc[0]
    # Keep the entry only when this token has a vector. The previous version also
    # required `target_token.has_vector`, a leftover from the similarity cell that
    # made the export depend on an unrelated word.
    if token.has_vector:
        vect_word.append(word)
        vectors.append(token.vector)

In [11]:
# Destination archive for the collected words and vectors.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
pathout = '/Users/pabloherrero/Documents/ManHatTan/mht/data/processed/vectors_lip/vectors_heb_lip.npz'

# Write both lists into a single .npz archive under the keys 'tokens' and 'vectors'.
np.savez(pathout, tokens=vect_word, vectors=vectors)

# Load word vectors from npz file

In [128]:
import asyncio
from googletrans import LANGUAGES

from googletrans import Translator
# Re-create the googletrans client after the import above.
translator = Translator()

In [19]:
translator = Translator()

# possible_words = sample(filtered_words, 5)
translations = await translator.translate(filtered_words, dest='en')
translated_words = [t.text for t in translations]
lang_detection = await translator.detect(translated_words)
detected_lang = [lang.lang for lang in lang_detection]

gota = pd.DataFrame(columns=['word_ll', 'word_ul', 'ui_language', 'learning_language'])
filtered_translations = []

for i, (word_ll, word_ul, detected_lang) in enumerate(zip(filtered_words, translated_words, detected_lang)):
    print(f"{i+1}")
    if detected_lang == translations[0].src:
        print(f"Failed to translate: {word_ll} -> {word_ul}")
        filtered_words.remove(word_ll)
        translated_words.remove(word_ul)
    else:
        print(f"Translation: {word_ll} -> {word_ul}")
        filtered_translations.append(word_ul)
gota['word_ll'] = filtered_words
gota['word_ul'] = filtered_translations
gota['ui_language'] = 'en'
gota['learning_language'] = 'iw'
gota

1
Translation: מפלצות -> Monsters
2
Translation: חתולה -> cat
3
Translation: חתול -> cat
4
Translation: מכשפה -> witch
5
Translation: דמות -> character
6
Translation: האמא -> The mom
7
Translation: רובוט -> robot
8
Translation: ילדה -> girl
9
Translation: מכונית -> car
10
Translation: תחפושת -> disguise
11
Translation: Monster -> Monster
12
Translation: בדיחה -> joke
13
Translation: משוגע -> crazy
14
Translation: דרקון -> dragon
15
Translation: משאית -> truck
16
Translation: נסיכה -> princess
17
Translation: טירוף -> Madness
18
Translation: ציפור -> A bird
19
Translation: נאצי -> Nazi
20
Translation: נערה -> girl
21
Translation: גיהנום -> hell
22
Translation: אגדה -> legend
23
Translation: אמא -> mother
24
Translation: סבתא -> grandmother
25
Translation: ילדת -> A girl
26
Translation: חרא -> Shit
27
Translation: גיבור -> Hero
28
Translation: קופסה -> box


Unnamed: 0,word_ll,word_ul,ui_language,learning_language
0,מפלצות,Monsters,en,iw
1,חתולה,cat,en,iw
2,חתול,cat,en,iw
3,מכשפה,witch,en,iw
4,דמות,character,en,iw
5,האמא,The mom,en,iw
6,רובוט,robot,en,iw
7,ילדה,girl,en,iw
8,מכונית,car,en,iw
9,תחפושת,disguise,en,iw


In [75]:
from googletrans import LANGUAGES, LANGCODES

In [77]:
# Look up the language code googletrans registers under the name 'hebrew'.
LANGCODES['hebrew']

'iw'

In [21]:
async def translate_text(src: pd.DataFrame, dest_lang: str = None):
    """Translate the ``word_ll`` column of ``src`` and print the translated texts.

    Parameters
    ----------
    src : pd.DataFrame
        Must provide the columns ``word_ll`` (texts to translate),
        ``learning_language`` (source language code, taken from the first row)
        and ``ui_language`` (fallback destination code, taken from the first row).
    dest_lang : str, optional
        Destination language code. When omitted, the ``ui_language`` value of
        the first row is used. (This argument used to be ignored.)

    Returns
    -------
    list of str
        The translated texts, in the order of ``src.word_ll``. The same list
        is printed, as before.
    """
    src_array = src.word_ll.tolist()
    if not src_array:
        # Empty frame: .iloc[0] below would raise IndexError, and there is
        # nothing to send to the translation service anyway.
        print([])
        return []

    if dest_lang is None:
        dest_lang = src.ui_language.iloc[0]

    async with Translator() as translator:
        result = await translator.translate(
            src_array, src=src.learning_language.iloc[0], dest=dest_lang
        )

    translated = [t.text for t in result]
    print(translated)
    return translated

In [22]:
async def bulk_translate(src: pd.DataFrame, dest_lang: str = None):
    """Translate every ``word_ll`` entry of ``src`` in one googletrans call.

    Parameters
    ----------
    src : pd.DataFrame
        Must provide the columns ``word_ll`` (texts to translate),
        ``learning_language`` (source language code, taken from the first row)
        and ``ui_language`` (fallback destination code, taken from the first row).
    dest_lang : str, optional
        Destination language code. When omitted, the ``ui_language`` value of
        the first row is used.

    Returns
    -------
    pd.DataFrame
        A single column named after the destination language that holds the
        translated texts, indexed by the original words.
    """
    src_list = src.word_ll.tolist()
    if not src_list:
        # Empty frame: .iloc[0] below would raise IndexError; return an empty
        # result (with the requested column, when one was given) instead.
        return pd.DataFrame(columns=[] if dest_lang is None else [dest_lang])

    if dest_lang is None:
        # The fallback value used to be evaluated and thrown away, so None was
        # passed on as the destination and used as the column name.
        dest_lang = src.ui_language.iloc[0]

    async with Translator() as translator:
        result = await translator.translate(
            src_list, src=src.learning_language.iloc[0], dest=dest_lang
        )

    translated_words = [t.text for t in result]
    dest = pd.DataFrame({dest_lang: translated_words}, index=src_list)
    return dest

In [23]:
# Translate the words held in `gota` with destination code 'en'; the returned frame
# is the cell's displayed output.
await bulk_translate(gota, 'en')

Unnamed: 0,en
מפלצות,Monsters
חתולה,cat
חתול,cat
מכשפה,witch
דמות,character
האמא,The mom
רובוט,robot
ילדה,girl
מכונית,car
תחפושת,disguise
