In [1]:
from simalign import SentenceAligner
import matplotlib.pyplot as plt
import numpy as np
from typing import List, Text, Tuple
import string

In [2]:
# Show the characters treated as punctuation; clean_text removes exactly these.
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [3]:
# Notebook-wide aligner instance; get_sim_align_scores reads this global.
# With matching_methods="mai" the result carries the "mwmf", "inter" and "itermax"
# alignments (see the alignment_dict dump further down).
# NOTE(review): presumably return_dict=True is what makes get_word_aligns return the
# {"alignments", "similarity_matrix"} dict used below - confirm against the installed simalign version.
myaligner = SentenceAligner(model="bert", token_type="word", matching_methods="mai", layer = 12, return_dict=True)

2020-10-20 14:37:33,342 - simalign.simalign - INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased
INFO - Initialized the EmbeddingLoader with model: bert-base-multilingual-cased


In [4]:
def clean_text(words):
    """Remove ASCII punctuation characters from every token.

    Args:
        words: Iterable of string tokens (e.g. the result of ``str.split()``).

    Returns:
        A new list with one entry per input token, in the same order, where
        every character contained in ``string.punctuation`` has been deleted.
        A token made up only of punctuation comes back as an empty string.
    """
    # Mapping a character to None in a translation table deletes it.
    punctuation_remover = str.maketrans(dict.fromkeys(string.punctuation))
    cleaned = []
    for token in words:
        cleaned.append(token.translate(punctuation_remover))
    return cleaned

In [5]:
# Sanity check: punctuation is stripped from each whitespace-separated token.
print(clean_text("En este cuadro, hay cuatro personas: mi padre, mi madre, mi hermano y mi hermana.".split()))

['En', 'este', 'cuadro', 'hay', 'cuatro', 'personas', 'mi', 'padre', 'mi', 'madre', 'mi', 'hermano', 'y', 'mi', 'hermana']


In [6]:
# Same check as the previous cell (exact duplicate; safe to delete).
print(clean_text("En este cuadro, hay cuatro personas: mi padre, mi madre, mi hermano y mi hermana.".split()))

['En', 'este', 'cuadro', 'hay', 'cuatro', 'personas', 'mi', 'padre', 'mi', 'madre', 'mi', 'hermano', 'y', 'mi', 'hermana']


In [7]:
def reconstruct_alignment_matrix(matchings: List[Tuple[int, int]], n: int, m: int) -> np.ndarray:
    """Build a binary alignment mask from a list of index pairs.

    Args:
        matchings: ``(i, j)`` index pairs; each pair marks cell ``[i, j]``.
            An empty list yields an all-zero matrix.
        n: Number of rows of the resulting matrix.
        m: Number of columns of the resulting matrix.

    Returns:
        A float array of shape ``(n, m)`` holding 1 at every listed
        ``(i, j)`` position and 0 everywhere else.

    Raises:
        IndexError: If a pair lies outside the ``(n, m)`` bounds.
    """
    alignment_matrix = np.zeros((n, m))
    for row_idx, col_idx in matchings:
        alignment_matrix[row_idx, col_idx] = 1
    return alignment_matrix

In [8]:
def print_matchings(matchings,src_sentence,trg_sentence):
    """Print every aligned token pair as ``source->target``, one per line.

    Args:
        matchings: Iterable of ``(i, j)`` pairs, where ``i`` indexes the
            source tokens and ``j`` indexes the target tokens.
        src_sentence: Source sentence, either a string (split on whitespace
            here) or an already tokenized sequence.
        trg_sentence: Target sentence, same conventions as ``src_sentence``.
    """
    # Accept raw strings as well as pre-tokenized input.
    src_tokens = src_sentence.split() if isinstance(src_sentence, str) else src_sentence
    trg_tokens = trg_sentence.split() if isinstance(trg_sentence, str) else trg_sentence
    for src_idx, trg_idx in matchings:
        print(f'{src_tokens[src_idx]}->{trg_tokens[trg_idx]}')

In [23]:
def get_sim_align_scores(src_sentence, trg_sentence, matching_method="itermax", aligner=None):
    """Score each target token by the mean similarity of its aligned source tokens.

    The sentences are aligned, the matched pairs are printed via
    ``print_matchings``, and for every column of the similarity matrix the
    similarities of the aligned pairs in that column are averaged.

    Args:
        src_sentence: Source sentence (string or token list), passed to the aligner as is.
        trg_sentence: Target sentence (string or token list), passed to the aligner as is.
        matching_method: Key into the aligner's ``"alignments"`` dict selecting
            which alignment to score. Defaults to ``"itermax"``.
        aligner: Object exposing ``get_word_aligns(src, trg)`` that returns a dict
            with ``"alignments"`` and ``"similarity_matrix"``. Defaults to the
            notebook-level ``myaligner``.

    Returns:
        A list of floats with one entry per column of the similarity matrix;
        columns without any aligned pair get ``0.0``.

    Raises:
        KeyError: If ``matching_method`` is not among the alignments returned
            by the aligner.
    """
    if aligner is None:
        aligner = myaligner  # notebook-level SentenceAligner instance
    alignment_dict = aligner.get_word_aligns(src_sentence, trg_sentence)

    alignments = alignment_dict["alignments"]
    if matching_method not in alignments:
        raise KeyError(
            f"matching method {matching_method!r} not in aligner output; "
            f"available: {sorted(alignments)}"
        )
    matchings = alignments[matching_method]
    print_matchings(matchings, src_sentence, trg_sentence)

    similarity_matrix = np.asarray(alignment_dict["similarity_matrix"])
    n, m = similarity_matrix.shape
    alignment_matrix = reconstruct_alignment_matrix(matchings, n, m)

    # Sum the similarities of aligned pairs per column.
    score_sums = (similarity_matrix * alignment_matrix).sum(0)
    # Count matches from the mask itself, not from the non-zero scores:
    # an aligned pair whose similarity is exactly 0 must still be counted.
    match_counts = alignment_matrix.sum(0)

    # `out` is required together with `where`; otherwise the entries that are
    # skipped by the mask would be left uninitialized.
    sim_align_scores = np.divide(
        score_sums,
        match_counts,
        out=np.zeros(m, dtype=float),
        where=match_counts > 0,
    )
    return sim_align_scores.tolist()


In [20]:
# Raw strings are passed here; print_matchings splits them on whitespace for display.
# NOTE(review): the recorded KeyError: 'iter' does not match the "itermax" key that
# get_sim_align_scores uses - presumably stale output from an earlier version of the
# function (execution counts are out of order); re-run to confirm.
src_sentence = "Football is a sport I love playing"
trg_sentence = "I love to play football"
get_sim_align_scores(src_sentence,trg_sentence)



KeyError: 'iter'

In [16]:
# English sentence pair, punctuation stripped and tokenized first.
# Prints the matched token pairs, then one score per target token
# (0 where a target token has no match).
src_sentence = clean_text("If you help the needy, God will reward you.".split())
trg_sentence = clean_text("Giving money to the poor has good consequences.".split())
sim_align_scores = get_sim_align_scores(src_sentence,trg_sentence)
print(sim_align_scores)

the->the
needy->poor
will->has
reward->to
you->money
[0, 0.7328535318374634, 0.761725902557373, 0.8023812174797058, 0.7176111936569214, 0.7320346236228943, 0, 0]




In [24]:
# Control pair with no obvious semantic overlap; in the recorded output every
# target token is still matched, with scores of roughly 0.67-0.76.
src_sentence = "Language coverage is identical to multilingual BERT"
trg_sentence = "I love to play football"
sim_align_scores = get_sim_align_scores(src_sentence,trg_sentence)
print(sim_align_scores)

is->I
identical->love
to->to
multilingual->football
BERT->play
[0.6834468841552734, 0.6979613304138184, 0.7619485259056091, 0.667186975479126, 0.6814118027687073]




In [49]:
# Degenerate source (the same word repeated) against a Spanish sentence,
# both without punctuation.
src_sentence = clean_text("No no no no no no no no no no no no no no no no no no no no".split())
trg_sentence = clean_text("En este cuadro hay cuatro personas mi padre mi madre mi hermano y mi hermana".split())
sim_align_scores = get_sim_align_scores(src_sentence,trg_sentence)
print(sim_align_scores)

No. of layers: 13
no->este
no->En
no->hay
no->cuadro
no->personas
no->mi
no->padre
no->mi
no->madre
no->hermano
no->cuatro
no->mi
no->mi
no->y
no->hermana
[0.6392083764076233, 0.6480591297149658, 0.6561990976333618, 0.6487113237380981, 0.6359125375747681, 0.6601014137268066, 0.6517370343208313, 0.668590247631073, 0.6579039692878723, 0.6772670745849609, 0.6628153920173645, 0.677024245262146, 0.6642996072769165, 0.6669628024101257, 0.6628788113594055]


In [29]:
# Same experiment with punctuation kept as separate tokens (raw strings, no clean_text).
# In the recorded output only punctuation marks end up matched.
src_sentence = "No , no , no , no , no , no , no , no , no , no , no , no , no , no , no , no , no , no , no , no ."
trg_sentence = "En este cuadro , hay cuatro personas : mi padre , mi madre , mi hermano y mi hermana."
sim_align_scores = get_sim_align_scores(src_sentence,trg_sentence)
print(sim_align_scores)

No. of layers: 13
,->,
,->,
.->:
[0, 0, 0, 0, 0, 0, 0, 0.7086302042007446, 0, 0, 0.7581214308738708, 0, 0, 0.7559026479721069, 0, 0, 0, 0, 0]


In [9]:
# Keep the raw aligner output for step-by-step inspection.
# Uses whatever src_sentence / trg_sentence were assigned last (hidden notebook state).
alignment_dict = myaligner.get_word_aligns(src_sentence, trg_sentence)

No. of layers: 13


In [10]:
# Display the raw result: the alignments per matching method plus the similarity matrix.
alignment_dict

{'alignments': {'mwmf': [(0, 0),
   (1, 3),
   (2, 2),
   (3, 7),
   (4, 6),
   (5, 8),
   (6, 9),
   (9, 11),
   (12, 5),
   (13, 4),
   (15, 10),
   (16, 1),
   (17, 13),
   (18, 12),
   (19, 14)],
  'inter': [(2, 7), (19, 5)],
  'itermax': [(2, 7), (4, 9), (19, 5)]},
 'similarity_matrix': array([[0.64001304, 0.64697236, 0.696739  , 0.65915066, 0.6650571 ,
         0.700886  , 0.67814094, 0.72961426, 0.67755795, 0.7241428 ,
         0.67745215, 0.6591057 , 0.7131208 , 0.6676848 , 0.6699865 ],
        [0.6352058 , 0.65165544, 0.693285  , 0.657238  , 0.6663618 ,
         0.69544566, 0.67926884, 0.73262787, 0.68343484, 0.73000735,
         0.6832223 , 0.66337407, 0.7130358 , 0.6712997 , 0.66694117],
        [0.6343672 , 0.6510731 , 0.69280434, 0.65658873, 0.6679187 ,
         0.6952791 , 0.67987597, 0.73329306, 0.6854879 , 0.73163176,
         0.68466413, 0.66424155, 0.7119039 , 0.67251307, 0.66762334],
        [0.6338898 , 0.6500083 , 0.69205016, 0.65571356, 0.6681889 ,
         0.6956

In [54]:
# Shape of the similarity matrix; presumably (source units, target units) - TODO confirm.
alignment_dict["similarity_matrix"].shape

(37, 15)

In [55]:
# Replay the scoring steps of get_sim_align_scores by hand, here with the "inter" alignments.
n,m=alignment_dict["similarity_matrix"].shape
# Binary mask with 1 at every aligned (row, column) pair.
alignment_matrix = reconstruct_alignment_matrix(alignment_dict["alignments"]["inter"],n,m)
# The element-wise product keeps only the similarities of aligned pairs.
alignment_scores = alignment_dict["similarity_matrix"]*alignment_matrix

In [56]:
# Inspect the binary alignment mask.
alignment_matrix

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0.

In [57]:
# Inspect the masked similarity scores (zero wherever there is no alignment).
alignment_scores

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 

In [58]:
# NOTE: `where=` without `out=` leaves the unselected entries uninitialized, so the
# values shown below for columns without a match are arbitrary memory contents, not
# results. Pass `out=` (e.g. a zero array) to get well-defined values there.
np.divide(alignment_scores.sum(0),alignment_scores.astype(bool).sum(0),where=alignment_scores.astype(bool).sum(0).astype(bool))

array([0.76631784, 0.7336641 , 0.75985754, 0.72677809, 0.71257454,
       0.75589621, 0.75527006, 0.80278468, 0.75795823, 0.81723225,
       0.76679623, 0.76473534, 0.77462155, 0.78129834, 0.87100923])

In [59]:
# Column-wise sum of the aligned similarities.
alignment_scores.sum(0)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.81723225,
       0.        , 0.        , 0.        , 0.        , 0.87100923])

In [60]:
# Number of non-zero (aligned) entries per column.
alignment_scores.astype(bool).sum(0)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1])

In [61]:
# Boolean mask: True for columns that have at least one non-zero entry.
alignment_scores.astype(bool).sum(0).astype(bool)

array([False, False, False, False, False, False, False, False, False,
        True, False, False, False, False,  True])

In [62]:
# Plain division yields nan (0/0) for every column without an aligned entry.
alignment_scores.sum(0)/alignment_scores.astype(bool).sum(0)

array([       nan,        nan,        nan,        nan,        nan,
              nan,        nan,        nan,        nan, 0.81723225,
              nan,        nan,        nan,        nan, 0.87100923])

In [63]:
# Mean aligned similarity per column; columns without any match are 0.
# `out=` is required alongside `where=`: without it the skipped entries are
# uninitialized memory and only happen to be 0 some of the time.
np.divide(
    alignment_scores.sum(0),
    alignment_scores.astype(bool).sum(0),
    out=np.zeros(alignment_scores.shape[1]),
    where=alignment_scores.astype(bool).sum(0).astype(bool),
)

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.81723225,
       0.        , 0.        , 0.        , 0.        , 0.87100923])