In [1]:
!pip install -q sentence_transformers
!pip install -q mtranslate

[K     |████████████████████████████████| 81kB 9.5MB/s 
[K     |████████████████████████████████| 2.5MB 29.7MB/s 
[K     |████████████████████████████████| 1.2MB 42.9MB/s 
[K     |████████████████████████████████| 3.3MB 39.7MB/s 
[K     |████████████████████████████████| 901kB 52.8MB/s 
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Building wheel for mtranslate (setup.py) ... [?25l[?25hdone


# How I use text embeddings

## filtered translation with multilingual Sentence Embbeding

In [18]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
import torch
from tqdm.autonotebook import tqdm
from mtranslate import translate

In [3]:
class SentenceSimilarityMultiLang():
    
    def __init__(self, model_name='stsb-xlm-r-multilingual'):
        # add device
        self.model = SentenceTransformer(model_name)

    def __call__(self, text):
        # tokenization step
        sentence_embeddings = self.model.encode(text, convert_to_tensor=True)
        return sentence_embeddings.unsqueeze(1)

    def cosine_similarity(self, a, b):
        a, b = self([a, b])
        return torch.cosine_similarity(a, b).item()

In [4]:
ssml = SentenceSimilarityMultiLang()

HBox(children=(FloatProgress(value=0.0, max=1013951149.0), HTML(value='')))




In [5]:
in_persian = 'چگونه می توانم به شما کمک کنم؟'
in_english = 'How can I help you?'

ssml.cosine_similarity(in_persian, in_english)

0.9940208196640015

In [6]:
in_persian = 'میتونم به شما کمک کنم؟'
in_english = 'How can I help you?'

ssml.cosine_similarity(in_persian, in_english)

0.9143402576446533

In [10]:
in_persian = 'نحوه ای کمک به دیگران را بیان کنید؟'
in_english = 'How can I help you?'

ssml.cosine_similarity(in_persian, in_english)

0.7608310580253601

In [11]:
class TransWithSimilarityCheck():
    def __init__(self, languages=None, min_score=.9, similar_model_name='stsb-xlm-r-multilingual'):
        self.languages = languages
        self.min_score = min_score
        self.sentence_similar = SentenceSimilarityMultiLang(
            model_name=similar_model_name)

    def _translator(self, sentence):
        return translate(sentence, from_language='en', to_language='fa')

    def __call__(self, sentences):
        augmented = []
        for i, s in tqdm(enumerate(sentences), total=len(sentences)):
            aug = self._translator(s)
            score = self.sentence_similar.cosine_similarity(s, aug)
            if score >= self.min_score:
                augmented.append({'id': i, 'aug': aug, 'score': score})

        return augmented

In [12]:
augmenter = TransWithSimilarityCheck(languages=['en', 'fa'], min_score=.9)
augmented = augmenter(['How can I help you?'])
print(augmented)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


[{'id': 0, 'aug': 'چگونه می توانم به شما کمک کنم؟', 'score': 0.9940208196640015}]


In [17]:
augmenter = TransWithSimilarityCheck(languages=['en', 'fa'], min_score=.5)
augmented = augmenter(['easy peasy let me squeezy'])
print(augmented)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


[{'id': 0, 'aug': 'راحت اجازه دهید من فشار بدهم', 'score': 0.6430428624153137}]


## filtered back translation with similar Sentence Embbeding

In [19]:
class GoogleBackTranslator():
    def __init__(self, n_diff=1):
        self.n_diff = n_diff

    def __call__(self, sentence, languages):
        # any languages from fa .... 
        for i, lang in enumerate(languages[:-1]):
            sentence = translate(sentence, from_language=lang, to_language=languages[i+1])
        # last back
        back_translated = translate(sentence, from_language=languages[i+1], to_language=languages[0])

        tokens = set(back_translated.split(' '))
        if len(tokens.intersection(sentence.split(' '))) >= len(tokens)-self.n_diff:
            return '[||]' # return the SAME token
        return back_translated

- good back translation

In [22]:
bk = GoogleBackTranslator(n_diff=2)
bk('امروز چند شنبس؟', ['fa', 'en'])

'امروز چند شنبه است؟'

- bad back translation

In [25]:
bk = GoogleBackTranslator(n_diff=2)
bk('چجوری میشه از سایت شما خرید کرد؟', ['fa', 'ru'])

'چگونه از سایت خود خرید کنیم؟'

In [26]:
class SentenceSimilarity():
    def __init__(self, model_name='m3hrdadfi/bert-fa-base-uncased-wikitriplet-mean-tokens', max_len=16, device='cpu'):
        self.model_name = model_name
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name).eval()
        self.max_len = max_len
        self.device = device

    def __call__(self, text):
        # tokenization step
        tokens = self.tokenizer(text, truncation=True, padding='max_length', 
                                max_length=self.max_len, return_tensors='pt')

        # model.forward step
        with torch.no_grad():
            embeddings = self.model(**tokens).last_hidden_state
        # Create masked embeddings (just expend size)
        mask = tokens['attention_mask'].unsqueeze(-1).expand(embeddings.shape).float()
        # create sentence embedding (sum embs / sum mask)
        sentence_embeddings = torch.sum(embeddings * mask, dim=1) / torch.clamp(mask.sum(1), min=1e-9) 
        # expand dim for each embedding (helpful for cosine similarity)
        return sentence_embeddings.unsqueeze(1)

    def cosine_similarity(self, a, b):
        a, b = self([a, b])
        return torch.cosine_similarity(a, b).item()

In [None]:
ss = SentenceSimilarity(max_len=32)

- positive example

In [None]:
ss.cosine_similarity(a='برای ترک کامل سیگار چه باید کرد؟', b='برای ترک کامل سیگار چه کاری باید انجام دهید؟')

- negative example

In [28]:
ss.cosine_similarity(a='برای ترک کامل ورزش چه باید کرد؟', b='برای ترک کامل سیگار چه کاری باید انجام دهید؟')

0.6295580267906189

In [29]:
class FilteredBackTranslation():
      # TODO: Parrallel BackTranslator
    def __init__(self, min_score=.8, n_diff=1, similar_model_name='m3hrdadfi/bert-fa-base-uncased-wikitriplet-mean-tokens'):
        self.min_score = min_score
        self.back_translator = GoogleBackTranslator(n_diff=n_diff)
        self.sentence_similar = SentenceSimilarity(model_name=similar_model_name)
        # best languages I find work well for Persian BackTranslation
        self.languages = [['fa', 'en'], ['fa', 'ru'], ['fa', 'ar'], ['fa', 'fr']]

    def __call__(self, sentences, top_chain=2):
        augmented = []
        for i, s in tqdm(enumerate(sentences), total=len(sentences)):
            paraphrazes = []
            scores = []
            # 1:57~30ms 2:85~1m, 3:101~1.4m 4:114~2.1m
            for langs in self.languages[:top_chain]:
                aug = self.back_translator(s, languages=langs) 
                if aug not in paraphrazes:
                    score = self.sentence_similar.cosine_similarity(s, aug)
                    if score >= self.min_score:
                        scores.append(score)
                        paraphrazes.append(aug)

        if len(scores)>0:
            augmented.append({'id': i, 'org': s, 'aug': paraphrazes, 'score': scores})

        return augmented

In [30]:
augmenter = FilteredBackTranslation(min_score=.9)
sentences = ['برای ترک کامل سیگار باید چی کار کرد؟']

augmenter(sentences, top_chain=4)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




[{'aug': ['برای ترک کامل سیگار چه باید کرد؟',
   'برای ترک کامل سیگار چه کاری باید انجام دهید؟'],
  'id': 0,
  'org': 'برای ترک کامل سیگار باید چی کار کرد؟',
  'score': [0.9557236433029175, 0.9505437612533569]}]

In [33]:
sentences = ['چه جوری میتونم وزنم رو کم کنم؟']

augmenter(sentences, top_chain=4)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




[]

In [34]:
sentences = ['راه های درمان خودشیفتگی را بیان کنید؟']

augmenter(sentences, top_chain=4)

HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




[{'aug': ['روش های درمان خودشیفتگی را توصیف کنید؟'],
  'id': 0,
  'org': 'راه های درمان خودشیفتگی را بیان کنید؟',
  'score': [0.907180905342102]}]