In [19]:
import os
import sys
import numpy as np

import nltk
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate import bleu
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
detokenizer = TreebankWordDetokenizer()
nltk.download('punkt')

from scipy import spatial
import gensim
from gensim.models import Word2Vec
from utils import *
from laserembeddings import Laser
from sentence_transformers import SentenceTransformer
from mincostflow import MCFGraph

# global variables
from bert_score import score
def load_word2vec_model(model_path):
    return gensim.models.KeyedVectors.load_word2vec_format(model_path, unicode_errors='ignore', limit=500000)
word2vec_zh_model = load_word2vec_model('/mnt/zamia/song/lrec/model/word2vec/zh/model.txt') 
word2vec_en_model = load_word2vec_model('/mnt/zamia/song/lrec/model/word2vec/en/model.txt') 
word2vec_ja_model = load_word2vec_model('/mnt/zamia/song/lrec/model/word2vec/ja/model.txt') 
laser_model = Laser()
sentbert_model = SentenceTransformer('all-mpnet-base-v2')
model_dict = {'zh': word2vec_zh_model, 'en': word2vec_en_model, 'ja': word2vec_ja_model, 'laser': laser_model, 'sentbert': sentbert_model}

[nltk_data] Downloading package punkt to /home/song/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to /home/song/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
def word2vec_sentence2vector(sentence, lang, word2vec_lang1_model, word2vec_lang2_model, config):
    if (lang == config['lang1']):
        model = word2vec_lang1_model
    elif (lang == config['lang2']):
        model = word2vec_lang2_model
    vecs = []
    for word in sentence.split():
        try:
            word = word.lower()
            vecs.append(model[word])
        except:
            #print ("no word in model: ", word)
            continue
    return vecs

def laser_sentence2vector(sentence, lang, model):
    return model.embed_sentences(detokenize(sentence, detokenizer, lang), lang)

def sentbert_sentence2vector(sentence, lang, model):
    return [model.encode(detokenize(sentence, detokenizer, lang))]

def sentences2vectors(sentences, lang, config):
    if (config['model'] == 'laser'):
        vecs = [laser_sentence2vector(sentence, lang, laser_model) for sentence in sentences]
    elif (config['model'] == 'sentbert'):
        vecs = [sentbert_sentence2vector(sentence, lang, sentbert_model) for sentence in sentences]
    elif (config['model'] == 'word2vec'):
        vecs = [word2vec_sentence2vector(sentence, lang, word2vec_lang1_model, word2vec_lang2_model, config) for sentence in sentences]
    return vecs

def connect_vec_list(vec_list1, vec_lists2):
    return (vec_list1 + vec_lists2)


In [21]:
def recover_predict_ids_from_path(path, n1, n2):
    predict_ids = []
    i = 0
    j = 0
    while (i<n1 and j<n2):
        nexti, nextj = path[(i,j)]
        predict_ids.append((list(range(i, nexti+1)), list(range(j, nextj+1))))
        i = nexti+1
        j = nextj+1
    return predict_ids

def dp_sub(i, j, n1, n2, maxcombine, checked, result, path, f, threshold=0):
    if (i>=n1 or j>=n2): return 0
    if (checked.get((i,j)) == True): return result[(i, j)]
    checked[(i,j)] = True 
    result[(i,j)] = 0
    # zero match
    zero_match_1 = dp_sub(i+1, j, n1, n2, maxcombine, checked, result, path, f, threshold)
    zero_match_2 =dp_sub(i+1, j, n1, n2, maxcombine, checked, result, path, f, threshold) 
    if (zero_match_1>result[(i,j)]):
        result[(i,j)] = zero_match_1
        path[(i,j)] = (i+1,j)
    if (zero_match_2>result[(i,j)]):
        result[(i,j)] = zero_match_2
        path[(i,j)] = (i,j+1)
    #dp_sub = max(zero_match_1, zero_match_2)

    # non-zero match
    for k in range(maxcombine):
        if (i+k>=n1): break
        for l in range(maxcombine):
            if (j+l>=n2): break
            if (f[(i, k, j, l)] > threshold):
                tmp_res = dp_sub(i+k+1, j+l+1, n1, n2, maxcombine, checked, result, path, f, threshold) + f[(i, k, j, l)]
                if (tmp_res > result[(i, j)]):
                    path[(i, j)] = (i+k, j+l)
                    result[(i, j)] = tmp_res
    return result[(i, j)]

def dp(f, lang_len, lang_trans_len, config):
    # consider zero match
    checked = {}
    result = {}
    path = {}
    n = int(config['max_combine']) # how many sentences combination is permitted
    threshold = float(config['threshold'])
    dp_sub(0, 0, lang_len, lang_trans_len, n, checked, result, path, f, threshold)
    predict_ids = recover_predict_ids_from_path(path, lang_len, lang_trans_len)
    return predict_ids

def maxcostflow(f, lang_len, lang_trans_len, config):
    s_point = 0
    t_point = lang_len+lang_trans_len+1

    src_start = 1
    src_end = 1+lang_len
    tgt_start = 1+lang_len
    tgt_end = 1+lang_len+lang_trans_len

    # constract the graph
    g=MCFGraph(1+lang_len+lang_trans_len+1)
    for i in range(src_start, src_end):
        g.add_edge(s_point, i, 1, 0)
    for i in range(tgt_start, tgt_end):
        g.add_edge(i, t_point, 1, 0)
    for i in range(src_start, src_end):
        for j in range(tgt_start, tgt_end):
            src_id = i-src_start
            tgt_id = j-tgt_start
            score = 1-f.get((src_id, 0, tgt_id, 0), 0)
            score = int(score*1000)
            g.add_edge(i, j, 1, score)
    # mincostmaxflow
    g.flow(s_point, t_point)

    # recover the result
    predict_ids = []
    for e in g.edges():
        if (e.src>=src_start and e.src<src_end and e.dst>=tgt_start and e.dst<tgt_end and e.flow==1):
            i = e.src-src_start
            j = e.dst-tgt_start
            predict_ids.append(([i], [j]))
    return predict_ids

def greedy(f, lang_len, lang_trans_len, config):
    match = [-1 for i in range(lang_len)]
    predict_ids = []
    for i in range(lang_len):
        for j in range(lang_trans_len):
            if (f.get((i, 0, j, 0), 0) > f.get((i, 0, match[i], 0), 0)):
                match[i] = j
        predict_ids.append(([i], [match[i]]))
    return predict_ids

def align_matrix(f, lang_len, lang_trans_len, config):
    if (config['algorithm'] == 'dp'):
        predict_ids = dp(f, lang_len, lang_trans_len, config)
    elif (config['algorithm'] == 'greedy'):
        predict_ids = greedy(f, lang_len, lang_trans_len, config)
    elif (config['algorithm'] == 'maxmatch'):
        predict_ids = maxcostflow(f, lang_len, lang_trans_len, config)
    return predict_ids


# part of alignment using cosine similarity
def get_similarity_matrix_1way(lang_embeddings, lang_trans_embeddings, config):
    lang_len = len(lang_embeddings)
    lang_trans_len = len(lang_trans_embeddings)
    n = int(config['max_combine'])
    f = {}

    avg_src_embeddings = {}
    avg_src_trans_embeddings = {}

    for i in range(lang_len):
        combined_src_embeddings = lang_embeddings[i]
        for j in range(n):
            if (i+j>=lang_len): break
            if (j!=0):
                combined_src_embeddings += lang_embeddings[i+j]
            avg_src_embeddings[(i, j)] = get_average_vec(combined_src_embeddings)

    for i in range(lang_trans_len):
        combined_src_trans_embeddings = lang_trans_embeddings[i]
        for j in range(n):
            if (i+j>=lang_trans_len): break
            if (j!=0):
                combined_src_trans_embeddings += lang_trans_embeddings[i+j]
            avg_src_trans_embeddings[(i, j)] = get_average_vec(combined_src_trans_embeddings)

    #print ('Pre-processing done for avg embeddings')

    for i in range(lang_len):
        for j in range(n):
            if (i+j>=lang_len): break
            for k in range(lang_trans_len):
                for l in range(n):
                    if (k+l>=lang_trans_len): break
                    avg_src_embedding = avg_src_embeddings[(i,j)]
                    avg_src_trans_embedding = avg_src_trans_embeddings[(k,l)]
                    if (config['measure'] == 'cosine_similarity'):
                        f[(i, j, k, l)] = get_cos_similarity(avg_src_embedding, avg_src_trans_embedding)
                    elif (config['measure'] == 'distance'): 
                        f[(i, j, k, l)] =  distance(avg_src_embedding, avg_src_trans_embedding)
    if (config['measure'] == 'distance'): 
        # extract all distances
        distances = list(f.values())
        min_distance = min(distances)
        max_distance = max(distances)
        diff_distance = max_distance-min_distance
        #print (min_distance, max_distance, diff_distance)
        for i in range(lang_len):
            for j in range(n):
                if (i+j>=lang_len): break
                for k in range(lang_trans_len):
                    for l in range(n):
                        if (k+l>=lang_trans_len): break
                        f[(i, j, k, l)] = float(max_distance-f[(i, j, k, l)])/diff_distance
    return f
    
def similarity_matrix_merge(f_src2tgt, f_tgt2src, lang_len, lang_trans_len, config):
    mix_method = config["mix_method"]
    n = int(config['max_combine'])
    f = {}
    for i in range(lang_len):
        for j in range(n):
            if (i+j>=lang_len): break
            for k in range(lang_trans_len):
                for l in range(n):
                    if (k+l>=lang_trans_len): break
                    s1 = f_src2tgt[(i, j, k, l)]
                    s2 = f_tgt2src[(k, l, i, j)] 
                    if (mix_method == "average"):
                        f[(i, j, k, l)] = (s1+s2)/2
                    elif (mix_method == "max"):
                        f[(i, j, k, l)] = max(s1, s2)
    return f

def get_BLEU4(sentence1, sentence2):
    """
    return BLEU4 scores for sentence1 and sentence2
    """
    from nltk.translate.bleu_score import sentence_bleu
    bleu4_score = sentence_bleu([sentence1], sentence2, weights=(0.25, 0.25, 0.25, 0.25))
    return bleu4_score
    
def get_similarity_matrix_BLEU_1way(lang1_lines, lang1_trans_lines, config):
    lang_len = len(lang1_lines)
    lang_trans_len = len(lang1_trans_lines)
    f = {}
    n = int(config['max_combine'])
    for i in range(lang_len):
        for j in range(n):
            if (i+j>=lang_len): break
            for k in range(lang_trans_len):
                for l in range(n):
                    if (k+l>=lang_trans_len): break
                    combined_src_sentence = ' '.join(lang1_lines[i:i+j+1])
                    combined_trans_src_sentence = ' '.join(lang1_trans_lines[k:k+l+1])
                    f[(i, j, k, l)] = get_BLEU4(combined_src_sentence, combined_trans_src_sentence)
                    #print (i, j, k, l)
                    #print (f[(i, j, k, l)])
    return f

def get_similarity_matrix_BERTscore_1way(lang1_lines, lang1_trans_lines, config):
    lang_len = len(lang1_lines)
    lang_trans_len = len(lang1_trans_lines)
    f = {}
    n = int(config['max_combine'])

    top = 0
    map_dict = {}
    combined_src_sentences = []
    combined_trans_src_sentences = []

    for i in range(lang_len):
        for j in range(n):
            if (i+j>=lang_len): break
            for k in range(lang_trans_len):
                for l in range(n):
                    if (k+l>=lang_trans_len): break
                    combined_src_sentence = ' '.join(lang1_lines[i:i+j+1])
                    combined_trans_src_sentence = ' '.join(lang1_trans_lines[k:k+l+1])
                    combined_src_sentences.append(combined_src_sentence)
                    combined_trans_src_sentences.append(combined_trans_src_sentence)
                    map_dict[(i,j,k,l)]=top
                    top+=1
    P, R, F1 =  score(combined_src_sentences, combined_trans_src_sentences, lang='en')
     
    for i in range(lang_len):
        for j in range(n):
            if (i+j>=lang_len): break
            for k in range(lang_trans_len):
                for l in range(n):
                    if (k+l>=lang_trans_len): break
                    f[(i, j, k, l)] = F1[map_dict[(i,j,k,l)]]
    return f

def align(lang1_lines, lang2_lines, lang1_trans_lines, lang2_trans_lines, config):
    lang_len = len(lang1_lines)
    lang_trans_len = len(lang1_trans_lines)
    #print (f'There are {lang_len} and {lang_trans_len} sentences')

    # load embeddings except BLEU
    if (config["model"] not in ["BLEU", "BERTscore"]):
        lang1_embeddings = sentences2vectors(lang1_lines, config['lang1'], config)
        lang2_embeddings = sentences2vectors(lang2_lines, config['lang2'], config)
        lang1_trans_embeddings = sentences2vectors(lang1_trans_lines, config['lang1'], config)
        lang2_trans_embeddings = sentences2vectors(lang2_trans_lines, config['lang2'], config)
        #print ("Calculated all embedding")

    # similarity
    if (config["model"] == "BLEU"):
        f_src2tgt = get_similarity_matrix_BLEU_1way(lang1_lines, lang1_trans_lines, config)
        f_tgt2src = get_similarity_matrix_BLEU_1way(lang2_lines, lang2_trans_lines, config)
        f_matrix = similarity_matrix_merge(f_src2tgt, f_tgt2src, lang_len, lang_trans_len, config)
    elif (config["model"] == "BERTscore"):
        f_matrix = get_similarity_matrix_BERTscore_1way(lang1_lines, lang1_trans_lines, config)
    elif (config["model"] == "word2vec" or config["multilingual"]=="0"):
        f_src2tgt = get_similarity_matrix_1way(lang1_embeddings, lang1_trans_embeddings, config)
        f_tgt2src = get_similarity_matrix_1way(lang2_embeddings, lang2_trans_embeddings, config)
        f_matrix = similarity_matrix_merge(f_src2tgt, f_tgt2src, lang_len, lang_trans_len, config)
    elif (config["model"] in ["laser", "sentbert"]):
        f_matrix = get_similarity_matrix_1way(lang1_embeddings, lang2_embeddings, config)

    predict_ids = align_matrix(f_matrix, lang_len, lang_trans_len, config)
    return predict_ids

In [23]:
# step 1: load config
config_file = '/mnt/zamia/song/lrec/alignment/src/configs/config.yaml'
config = load_config(config_file)
config["model"] = "BERTscore"

lang1_txt = config['lang1_txt']
lang2_txt = config['lang2_txt']
lang1_trans_txt = config['lang1_trans_txt']
lang2_trans_txt = config['lang2_trans_txt']
target_ids_txt = config['target_ids_txt']

model = config['model']

output_file = config['output_file']

word2vec_lang1_model = model_dict[config['lang1']]
word2vec_lang2_model = model_dict[config['lang2']]
laser_model = model_dict['laser']
sentbert_model = model_dict['sentbert']
print (f'model loaded')

# step 2: load data
lang1_lines = read_sentences_from_file(lang1_txt)
lang2_lines = read_sentences_from_file(lang2_txt)
lang1_trans_lines = read_sentences_from_file(lang1_trans_txt)
lang2_trans_lines = read_sentences_from_file(lang2_trans_txt)
target_ids = get_target_ids(target_ids_txt)

config['algorithm'] = 'dp' # [dp, greedy, maxmatch]
# step 3: calculate result using the current config
predict_ids = align(lang1_lines, lang2_lines, lang1_trans_lines, lang2_trans_lines, config)

# step 4: save result
#save_result(exact_rate, partial_rate, output_file, config)
precision, recall, f1 = res_compare_detail(predict_ids, target_ids)

print ('Precision: %f' % precision)
print ('Recall: %f' % recall)
print ('F1: %f' % f1)
#print (predict_ids)
#print (target_ids)

model loaded


Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Precision: 0.904762
Recall: 0.904762
F1: 0.904762


In [29]:
def gene_yaml(file_num):
    yaml_dict = {}
    if (file_num==0):
        yaml_dict['lang1'] = 'en'
        yaml_dict['lang2'] = 'ja'
        yaml_dict['data_folder'] = '/mnt/zamia/song/lrec/alignment/data/7045'

    if (file_num==1):
        yaml_dict['lang1'] = 'en'
        yaml_dict['lang2'] = 'ja'
        yaml_dict['data_folder'] = '/mnt/zamia/song/lrec/alignment/data/5523'

    if (file_num==2):
        yaml_dict['lang1'] = 'en'
        yaml_dict['lang2'] = 'zh'
        yaml_dict['data_folder'] = '/mnt/zamia/song/lrec/alignment/data/762'

    if (file_num==3):
        yaml_dict['lang1'] = 'en'
        yaml_dict['lang2'] = 'zh'
        yaml_dict['data_folder'] = '/mnt/zamia/song/lrec/alignment/data/905'

    yaml_dict['lang1_model'] = f'/mnt/zamia/song/lrec/model/word2vec/{yaml_dict["lang1"]}/model.txt'
    yaml_dict['lang2_model'] = f'/mnt/zamia/song/lrec/model/word2vec/{yaml_dict["lang2"]}/model.txt'
    yaml_dict['lang1_txt'] = f'{yaml_dict["data_folder"]}/{yaml_dict["lang1"]}.txt'
    yaml_dict['lang2_txt'] = f'{yaml_dict["data_folder"]}/{yaml_dict["lang2"]}.txt'
    yaml_dict['lang1_trans_txt'] = f'{yaml_dict["data_folder"]}/{yaml_dict["lang1"]}.trans.txt'
    yaml_dict['lang2_trans_txt'] = f'{yaml_dict["data_folder"]}/{yaml_dict["lang2"]}.trans.txt'
    yaml_dict['target_ids_txt'] = f'{yaml_dict["data_folder"]}/ids.target.txt'
    yaml_dict['model'] = 'sentbert' # [laser, sentbert, word2vec, bleu(not used)]
    yaml_dict['multilingual'] = '1' # 0: do not use corsslingual embeddings 1: use
    yaml_dict['measure'] = 'distance' # [cosine_similarity, distance]
    yaml_dict['algorithm'] = 'greedy' # [dp, greedy, maxmatch]
    yaml_dict['max_combine'] = '6'
    yaml_dict['threshold'] = '0'
    yaml_dict["mix_method"] = 'max'
    yaml_dict['output_file'] = f'{yaml_dict["data_folder"]}/ids.predict.txt'
    return yaml_dict

def grid_search():
    models = ["word2vec", "laser", "sentbert"]
    multilinguals = ["0", "1"]
    measures = ["cosine_similarity", "distance"]
    algorithms = ["greedy", "maxmatch", "dp"]
    max_combines = ["7"]
    
    #models = ["sentbert"]
    #multilinguals = ["0", "1"]
    #measures = ["cosine_similarity", "distance"]
    #algorithms = ["greedy", "maxmatch", "dp"]
    #max_combines = ["7"]

    # best search
    #models = ["laser", "word2vec"]
    #multilinguals = ["1"]
    #measures = ["cosine_similarity", "distance"]
    #algorithms = ["dp"]
    #max_combines = ["7"]

    #models = ["word2vec"]
    #multilinguals = ["1"]
    #measures = ["cosine_similarity"]
    #algorithms = ["dp", "greedy"]
    #max_combines = ["7"]

    # BLEU
    #models = ["BLEU"]
    #multilinguals = ["1"]
    #measures = ["BLEU"]
    #algorithms = ["greedy", "maxmatch", "dp"]
    #max_combines = ["7"]

    models = ["BERTscore"]
    #models = ["word2vec", "laser", "sentbert", "BLEU"]
    multilinguals = ["0", "1"]
    measures = ["cosine_similarity", "distance", "BLEU"]
    algorithms = ["greedy", "maxmatch", "dp"]
    max_combines = ["7"]
    settings = []
    for file_num in range(4):
        for model in models:
            for multilingual in multilinguals:
                for algorithm in algorithms:
                    for measure in measures:
                        for max_combine in max_combines:
                            if (model in ["word2vec", "BLEU", "BERTscore"] and multilingual == "1"):
                                continue
                            if (model in ["word2vec", "laser", "sentbert" ] and measure == "BLEU"):
                                continue
                            if (model in ["BLEU", "BERTscore"] and measure!="BLEU"):
                                continue
                            settings.append((file_num, model, multilingual, algorithm, measure, max_combine))
    print (len(settings))

    for (file_num, model, multilingual, algorithm, measure, max_combine) in settings:
        global config
        global word2vec_lang1_model, word2vec_lang2_model, laser_model, sentbert_model

        config = gene_yaml(file_num)
        lang1_txt = config['lang1_txt']
        lang2_txt = config['lang2_txt']
        lang1_trans_txt = config['lang1_trans_txt']
        lang2_trans_txt = config['lang2_trans_txt']
        target_ids_txt = config['target_ids_txt']
        lang1_lines = read_sentences_from_file(lang1_txt)
        lang2_lines = read_sentences_from_file(lang2_txt)
        lang1_trans_lines = read_sentences_from_file(lang1_trans_txt)
        lang2_trans_lines = read_sentences_from_file(lang2_trans_txt)
        target_ids = get_target_ids(target_ids_txt)

        word2vec_lang1_model = model_dict[config['lang1']]
        word2vec_lang2_model = model_dict[config['lang2']]
        laser_model = model_dict['laser']
        sentbert_model = model_dict['sentbert']

        config['model'] = model
        config['multilingual'] = multilingual
        config['measure'] = measure
        config['algorithm'] = algorithm
        config['max_combine'] = max_combine
        predict_ids = align(lang1_lines, lang2_lines, lang1_trans_lines, lang2_trans_lines, config)

        precision, recall, f1 = res_compare_detail(predict_ids, target_ids)
        #print (len(predict_ids), len(target_ids))
        print (f'file_num={file_num}, model={model}, multilingual={multilingual}, measure={measure}, algo={algorithm}, max_combine={max_combine}')
        print ('Precision: %f' % precision)
        print ('Recall: %f' % recall)
        print ('F1: %f' % f1)
        model_line =f'file_num={file_num}, model={model}, multilingual={multilingual}, measure={measure}, algo={algorithm}, max_combine={max_combine}' 
        #res_line = f'F1: {f1} Precision: {precision} Recall: {recall}'
        res_line = f'{f1:.3f} {precision:.3f} {recall:.3f}'

        with open("grid_search.txt", "a") as f:
            f.write(model_line + '\n')
            f.write(res_line + "\n")

grid_search()

132
76 76
file_num=0, model=word2vec, multilingual=0, measure=cosine_similarity, algo=greedy, max_combine=7
Precision: 0.684211
Recall: 0.684211
F1: 0.684211
76 76
file_num=0, model=word2vec, multilingual=0, measure=distance, algo=greedy, max_combine=7
Precision: 0.671053
Recall: 0.671053
F1: 0.671053
76 76
file_num=0, model=word2vec, multilingual=0, measure=cosine_similarity, algo=maxmatch, max_combine=7
Precision: 0.723684
Recall: 0.723684
F1: 0.723684
76 76
file_num=0, model=word2vec, multilingual=0, measure=distance, algo=maxmatch, max_combine=7
Precision: 0.723684
Recall: 0.723684
F1: 0.723684
76 76
file_num=0, model=word2vec, multilingual=0, measure=cosine_similarity, algo=dp, max_combine=7
Precision: 0.960526
Recall: 0.960526
F1: 0.960526
76 76
file_num=0, model=word2vec, multilingual=0, measure=distance, algo=dp, max_combine=7
Precision: 0.960526
Recall: 0.960526
F1: 0.960526
76 76
file_num=0, model=laser, multilingual=0, measure=cosine_similarity, algo=greedy, max_combine=7
Pr

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
print (lang2_lines[0])
embeddings = laser_model.embed_sentences(lang2_lines[0], lang='ja')
len(embeddings[0])

In [None]:
# to verify that using tokenized sentence or not doesn't affect the result for sentbert model
# for English, totally no matter, for Japanese, it does a little bit, so we use tokenized sentence for laser and sentbert model
def distance(vector1, vector2):
    return np.linalg.norm(vector1-vector2)
en_line = lang1_lines[0]
en_line_untok = detokenize(lang1_lines[0], detokenizer, lang='en')
ja_line = lang2_lines[0]
ja_line_untok = detokenize(lang2_lines[0], detokenizer, lang='ja')
en_embed = sentbert_model.encode(en_line)
en_untok_embed = sentbert_model.encode(en_line_untok) 
ja_embed = sentbert_model.encode(ja_line)
ja_untok_embed = sentbert_model.encode(ja_line_untok) 

en_cosine_similarity = get_cos_similarity(en_embed, en_untok_embed)
ja_cosine_similarity = get_cos_similarity(ja_embed, ja_untok_embed)
en_distance = distance(en_embed, en_untok_embed)
ja_distance = distance(ja_embed, ja_untok_embed)
print (en_cosine_similarity, ja_cosine_similarity, en_distance, ja_distance)

en_embed = laser_model.embed_sentences(en_line, lang='en')
en_untok_embed = laser_model.embed_sentences(en_line_untok, lang='en')
ja_embed = laser_model.embed_sentences(ja_line, lang='ja')
ja_untok_embed = laser_model.embed_sentences(ja_line_untok, lang='ja')

en_cosine_similarity = get_cos_similarity(en_embed, en_untok_embed)
ja_cosine_similarity = get_cos_similarity(ja_embed, ja_untok_embed)
en_distance = distance(en_embed, en_untok_embed)
ja_distance = distance(ja_embed, ja_untok_embed)
print (en_cosine_similarity, ja_cosine_similarity, en_distance, ja_distance)

In [None]:
sentence = lang1_lines[0]
e1 = laser_sentence2vector(sentence, 'en', laser_model)
e2 = sentbert_sentence2vector(sentence, 'en', sentbert_model)
e3 = word2vec_sentence2vector(sentence, 'en', config)
print (len(e1), len(e2), len(e3))
print (len(e1[0]), len(e2[0]), len(e3[0]))

In [3]:
# BERTscore test
import bert_score
from bert_score import score

In [17]:
sent1 = ["I am a student.", "He likes to play basketball.", "Dog is a animal."]
sent2 = ["I am a teacher.", "She likes to play basketball.", "Cat is a animal."]
# reverse sent2
sent2 = sent2[::-1]
P,R,F = score(sent1, sent2, lang="en")
print (F)

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([0.9054, 0.9948, 0.9117])


tensor(0.9813)
