**Work Done**
- Able to compare cosine distance and Euclidean distance between two sentences using embeddings generated by MuRIL
- Used NLTK and WordNet to find words close in meaning to a given input word (not currently used with the model)
- Used spaCy to randomly select adjectives, nouns, or verbs from a given sentence
- Using nlpaug, we can either replace the entire sentence with an augmented form or replace a random word with a "similar" one.

**Used the Google Translate model to translate between Hindi and English, to test which idioms, when translated, yield their actual (intended) meaning rather than a literal one.**
- Currently continuing further experimentation related to this idea.


**Idioms which work**
- in hot water
- both go hand in hand
- dont sweat it
- beat around the bush
- call it a day
- easy as pie
- do not sweat it, dont sweat it

In [1]:
import tensorflow as tf

In [2]:
import tensorflow_text as text

In [None]:
from transformers import bert
from bert import bert_tokenization
import numpy as np
from scipy.spatial import distance

In [None]:
#function definition
def get_model(model_url, max_seq_length):
  """Build a Keras model mapping BERT-style inputs to MuRIL's pooled output.

  Args:
    model_url: TF Hub handle (URL or path) of the encoder to load.
    max_seq_length: Fixed length of each of the three int32 input tensors.

  Returns:
    A ``(model, muril_layer)`` tuple. ``model`` is a ``tf.keras.Model`` whose
    output is the encoder's ``pooled_output``; ``muril_layer`` is the
    underlying ``hub.KerasLayer``, returned so callers can reach the assets
    stored on it (the tokenizer cell reads its vocab file and casing flag).

  Raises:
    AssertionError: If the loaded layer does not expose one of the expected
      output keys.
  """
  # No cell in this notebook imports tensorflow_hub, so `hub` would be an
  # undefined name on a fresh kernel; import it where it is needed.
  import tensorflow_hub as hub

  def _int32_input():
    return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32)

  inputs = dict(
    input_word_ids=_int32_input(),
    input_mask=_int32_input(),
    input_type_ids=_int32_input(),
  )
  # NOTE(review): trainable=True is kept from the original; the visible cells
  # only run inference, so confirm whether fine-tuning is actually intended.
  muril_layer = hub.KerasLayer(model_url, trainable=True)
  outputs = muril_layer(inputs)
  # Fail early if the hub module does not provide the expected outputs.
  for expected_key in ('sequence_output', 'pooled_output', 'encoder_outputs', 'default'):
    assert expected_key in outputs, "missing model output: " + expected_key
  return tf.keras.Model(inputs=inputs,outputs=outputs["pooled_output"]), muril_layer
#function call
# Every model input row has exactly this many token positions.
max_seq_length = 128

# Hub handle of the MuRIL encoder used for the sentence embeddings.
MURIL_HUB_URL = "https://tfhub.dev/google/MuRIL/1"
muril_model, muril_layer = get_model(MURIL_HUB_URL, max_seq_length)

In [None]:
#converts input into BERT-acceptable format (preprocessing)
# The vocab file path and lower-casing flag are read from the loaded hub
# layer, so the tokenizer is configured from the same assets as the encoder.
_muril_assets = muril_layer.resolved_object
vocab_file = _muril_assets.vocab_file.asset_path.numpy()
do_lower_case = _muril_assets.do_lower_case.numpy()
tokenizer = bert_tokenization.FullTokenizer(
    vocab_file=vocab_file,
    do_lower_case=do_lower_case,
)

In [None]:
#the 3 input arrays BERT expects: token ids, attention mask, segment (type) ids
def create_input(input_strings, tokenizer, max_seq_length):
  """Convert raw strings into fixed-length BERT input arrays.

  Each string is tokenized, wrapped as ``[CLS] ... [SEP]``, converted to ids
  and then padded with 0 (or truncated) to exactly ``max_seq_length``.

  Args:
    input_strings: Iterable of strings. A single ``str`` is treated as one
      sentence instead of being iterated character by character.
    tokenizer: Object providing ``tokenize(text)`` and
      ``convert_tokens_to_ids(tokens)``.
    max_seq_length: Number of positions in every output row.

  Returns:
    Tuple ``(input_ids, input_mask, input_type_ids)`` of int32 arrays, each of
    shape ``(number_of_strings, max_seq_length)``. The mask is 1 on real
    tokens and 0 on padding; the type ids are all 0.
  """
  if isinstance(input_strings, str):
    input_strings = [input_strings]

  # Reserve two positions for [CLS] and [SEP] so that truncating a long
  # sentence never cuts off the closing [SEP] marker.
  max_body_tokens = max(max_seq_length - 2, 0)

  input_ids_all, input_mask_all, input_type_ids_all = [], [], []
  for input_string in input_strings:
    body_tokens = tokenizer.tokenize(input_string)[:max_body_tokens]
    input_tokens = ["[CLS]"] + body_tokens + ["[SEP]"]
    # The extra slice only has an effect in the degenerate max_seq_length < 2
    # case; it keeps every row at exactly max_seq_length entries.
    input_ids = list(tokenizer.convert_tokens_to_ids(input_tokens))[:max_seq_length]
    sequence_length = len(input_ids)
    padding = [0] * (max_seq_length - sequence_length)
    input_ids_all.append(input_ids + padding)
    input_mask_all.append([1] * sequence_length + padding)
    input_type_ids_all.append([0] * max_seq_length)

  def _as_batch(rows):
    # int32 matches the dtype declared on the model inputs; the explicit
    # reshape keeps a 2-D result even when there are no input strings.
    return np.array(rows, dtype=np.int32).reshape(len(rows), max_seq_length)

  return _as_batch(input_ids_all), _as_batch(input_mask_all), _as_batch(input_type_ids_all)

In [None]:
def encode(input_text):
  """Run the MuRIL model on ``input_text`` and return its output.

  The text is converted to id / mask / type-id arrays with ``create_input``
  (using the notebook-level ``tokenizer`` and ``max_seq_length``) and the
  arrays are passed to ``muril_model`` as a keyed dict.
  """
  word_ids, mask, type_ids = create_input(input_text, tokenizer, max_seq_length)
  return muril_model({
      "input_word_ids": word_ids,
      "input_mask": mask,
      "input_type_ids": type_ids,
  })

In [None]:
import nlpaug.augmenter.word as naw
import spacy
# spaCy English pipeline (the "lg" model package must be installed).
# `nlp(text)` is used by later cells for part-of-speech tags and for
# document similarity.
nlp = spacy.load("en_core_web_lg")
import random

In [None]:
def select_random(sentence):
    """Return the text of one randomly chosen verb, adjective or noun.

    The sentence is parsed with the notebook-level spaCy pipeline ``nlp``.
    If no token carries one of those part-of-speech tags, the text of the
    first token is returned instead.
    """
    content_pos = ('VERB', 'ADJ', 'NOUN')
    parsed = nlp(sentence)

    candidates = []
    for tok in parsed:
        if tok.pos_ in content_pos:
            candidates.append(tok.text)

    if not candidates:
        # Nothing eligible: fall back to the first token.
        return parsed[0].text

    return candidates[random.randrange(len(candidates))]

In [None]:
# WordNet-backed synonym augmenter used by replace_w_aug to swap a single
# word for a synonym.
aug = naw.SynonymAug(
    aug_src='wordnet',
    model_path=None,
    name='Synonym_Aug',
    aug_min=1,
    aug_max=10,
    aug_p=0.1,  # per the nlpaug docs: fraction of words to augment (bounded by aug_min/aug_max); it does not control how close the synonyms are
    lang='eng',
    stopwords=None,
    tokenizer=None,
    reverse_tokenizer=None,
    stopwords_regex=None,
    force_reload=False,
    verbose=0
)


In [None]:
def replace_w_aug(sentence):
    """Replace one randomly selected word of ``sentence`` with a synonym.

    The word is chosen by ``select_random`` and augmented with the
    notebook-level ``aug`` augmenter. The sentence is split on whitespace,
    the first matching token is replaced, and the tokens are re-joined with
    single spaces.

    If the selected word is not a whitespace-delimited token on its own
    (for example it has punctuation attached, as in ``"water."``), the first
    token containing it has that substring replaced. If it cannot be found
    at all, the sentence is returned with only its whitespace normalised.

    Args:
        sentence: Input sentence.

    Returns:
        The sentence with at most one word replaced.
    """
    word = select_random(sentence)
    augmented = aug.augment(word)

    # Depending on the nlpaug version, augment() returns a str or a list of
    # str; indexing a str with [0] would yield only its first character.
    if isinstance(augmented, str):
        replacement = augmented
    elif augmented:
        replacement = augmented[0]
    else:
        replacement = word

    tokens = sentence.split()
    if word in tokens:
        tokens[tokens.index(word)] = replacement
    elif word:
        # spaCy tokens do not always line up with whitespace tokens.
        for position, token in enumerate(tokens):
            if word in token:
                tokens[position] = token.replace(word, replacement, 1)
                break
    return ' '.join(tokens)

In [None]:
def euclid_dist(embeddings,x,y,labels=None):
    """Print the Euclidean distance between two embeddings.

    Args:
        embeddings: Indexable collection of 1-D vectors.
        x: Index of the first embedding.
        y: Index of the second embedding.
        labels: Optional indexable collection of display strings aligned
            with ``embeddings``. When omitted, the notebook-level
            ``sentences`` list is used, as before.

    Returns:
        None. The result is printed, followed by a blank line.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook global.
        labels = sentences
    dst_1 = distance.euclidean(np.asarray(embeddings[x]), np.asarray(embeddings[y]))
    print("Euclidian Distance between '{}' & '{}' is {}".format(labels[x],labels[y],dst_1))
    print()

In [None]:
from scipy.spatial.distance import cosine
def cosine_dist(embeddings,x,y,labels=None):
    """Print the cosine similarity and cosine distance of two embeddings.

    Args:
        embeddings: Indexable collection of 1-D vectors.
        x: Index of the first embedding.
        y: Index of the second embedding.
        labels: Optional indexable collection of display strings aligned
            with ``embeddings``. When omitted, the notebook-level
            ``sentences`` list is used, as before.

    Returns:
        None. Both values are printed, followed by a blank line.
    """
    if labels is None:
        # Backward-compatible fallback to the notebook global.
        labels = sentences
    # scipy's `cosine` already returns the distance; derive the similarity
    # from it rather than computing 1 - (1 - distance), which loses precision
    # for nearly identical vectors.
    cosine_distance1 = cosine(embeddings[x], embeddings[y])
    cosine_similarity1 = 1 - cosine_distance1
    print("Cosine Similarity for '{}' & '{}' is {}".format(labels[x],labels[y],cosine_similarity1))
    print("Cosine Distance between '{}' & '{}' is {}".format(labels[x],labels[y],cosine_distance1))
    print()

In [None]:
# Show the part-of-speech tag spaCy assigns to each token of the idiom.
# Relies on `idiom` having been defined by the phrase-setup cell.
doc = nlp(idiom)
for token in doc:
    print(token.text, token.pos_)

in ADP
hot ADJ
water NOUN


In [None]:
# Compare sentence 0 against sentences 1 and 2: Euclidean first, then cosine.
# `embeddings` must already exist in the kernel; the only visible assignment
# (`embeddings = encode(sentences)`) is currently commented out.
for report in (euclid_dist, cosine_dist):
    for other in (1, 2):
        report(embeddings, 0, other)

Euclidian Distance between 'dont worry about it' & 'dont sweat it' is 0.0038874142337590456

Euclidian Distance between 'dont worry about it' & 'dont perspiration it' is 0.00786435417830944

Cosine Similarity for 'dont worry about it' & 'dont sweat it' is 0.9999198317527771
Cosine Distance between 'dont worry about it' & 'dont sweat it' is 8.016824722290039e-05

Cosine Similarity for 'dont worry about it' & 'dont perspiration it' is 0.9996719360351562
Cosine Distance between 'dont worry about it' & 'dont perspiration it' is 0.00032806396484375



In [18]:
from translate import Translator

# Translator with Hindi ("hi") as the target language; the source language is
# left at the library default (the phrases translated below are English).
translator = Translator(to_lang="hi")


In [22]:
# Phrases under test: `real` is presumably the plain meaning of the idiom,
# `idiom` is the idiom itself, and the third entry is the idiom with one word
# swapped. `idiom` and `sentences` are read by other cells.
real="in trouble"
idiom="in hot water"
sentences = [real,idiom,"in hot coffee"]
# Embedding step is currently disabled; the distance cells that read
# `embeddings` need it re-enabled (or `embeddings` defined elsewhere).
# embeddings = encode(sentences)
# embeddings

In [23]:
# Print each phrase's translation next to the original phrase.
for source_text in sentences:
    print(translator.translate(source_text), source_text)

मुसीबत में in trouble
गर्म पानी में in hot water
हॉट कॉफ़ी में in hot coffee


In [None]:
# Quick check of spaCy's built-in document similarity on two sentences.
# The first document was previously bound to `word`, so `sentence1` was
# undefined and the similarity call raised NameError.
sentence1 = nlp("This is a sentence.")
sentence2 = nlp("This is another sentence.")

similarity = sentence1.similarity(sentence2)
print("Similarity:", similarity)