In [1]:
import transformers
import sys
import ipywidgets
import imp
import pandas as pd
import numpy as np
import torch
import googletrans
import os

In [2]:
%load_ext rpy2.ipython
import rpy2.robjects.lib.ggplot2 as ggplot2

In [None]:
from multilingual_scorer import MultilingualScorer

In [None]:
mls = MultilingualScorer(cache_dir="/shared_hd2/huggingface/")

In [None]:
mls.score('We zullen met ze praten.', 'nl')

In [None]:
# English counterpart of the Dutch sentence scored in the previous cell
# ("wil" corrected to "will" so the two sentences are actual translations).
mls.score('We will talk to them.', 'en')

In [None]:
from googletrans import Translator
translator = Translator()

In [None]:
def get_translation_probs(target_sentence):
    """Score an English sentence and its machine translations.

    The sentence is translated from English into German ('de'), Dutch ('nl')
    and French ('fr') with the notebook-level googletrans ``translator``.
    Each translation, plus the untranslated English original, is scored with
    the notebook-level ``mls.score(text, language)``.

    Parameters
    ----------
    target_sentence : str
        English source sentence.

    Returns
    -------
    pandas.DataFrame
        One row per language with columns ``language``, ``translation``,
        ``score`` and ``source_text``, plus ``en_score``: the score of the
        English sentence, cast to float and repeated on every row.

    Notes
    -----
    Relies on the globals ``mls`` and ``translator`` defined in earlier
    cells. The English sentence is scored exactly once and that result is
    used both for the 'en' row and for the ``en_score`` column (assumes
    ``mls.score`` is deterministic for a fixed input -- TODO confirm).
    """
    scores = []
    for language in ['de', 'nl', 'fr', 'en']:
        if language == 'en':
            # English is the source language: nothing to translate.
            translated_text = target_sentence
            # Single scoring pass, shared by the row and the en_score column.
            score = en_score = mls.score(translated_text, language)
        else:
            translated_text = translator.translate(
                target_sentence, dest=language, src='en').text
            score = mls.score(translated_text, language)
        scores.append({'language': language,
                       'translation': translated_text,
                       'score': score[0].numpy(),
                       'source_text': target_sentence})

    rdf = pd.DataFrame(scores)
    rdf['en_score'] = en_score[0].numpy().astype('float')
    return rdf

get_translation_probs("What is that?") 

In [None]:
# English source sentences; each one is translated and scored per language
# by the get_translation_probs* helpers in later cells.
sentences = [
    'We will talk to them.',
    'What is that?',
    "Please don't do that.",
    "Where is the dog?",
    "That's a bad idea.",
    "Bring that over here.",
    "Come back over here.",
    "Look at all the stars!",
    "It's time to go to bed.",
    "Give me the cookie.",
    'How do you know?',
    "I'm going to the store",
    'The dog chased the cat.',
    'The cat chased the dog.',
    "I spilled the milk"]

In [None]:
sentence_probs = pd.concat([get_translation_probs(x) for x in sentences]) 

In [None]:
sentence_probs.score = sentence_probs.score.astype('float')

In [None]:
%R -i sentence_probs
sentence_probs

In [None]:
%%R
# Order the sentences on the x-axis by their mean English score (ascending).
sprobs = aggregate(en_score ~ source_text, data = sentence_probs, FUN = mean)
sprobs = sprobs[order(sprobs$en_score), ]
sentence_probs$source_text = factor(
    sentence_probs$source_text,
    levels = sprobs$source_text
)

# One point per (sentence, language).
# NOTE(review): score is negated here but not in the later MLM plots --
# presumably mls.score returns a negative log probability / loss; confirm.
ggplot(sentence_probs) +
    geom_point(
        aes(x = source_text, y = -1 * score, color = language, shape = language),
        size = 6
    ) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, vjust = 0.5)) +
    xlab('Target Sentence (English translation)') +
    ylab('Log Probability')

# Compare probability of grammatical and ungrammatical variants

In [None]:
print(mls.score('The cats are on the bed.', 'en'))
print(mls.score('The cat are on the bed.', 'en'))

In [None]:
print(mls.score('Two little kitty cats.', 'en'))
print(mls.score('Two little kitty cat.', 'en'))

# Masked Language Modeling

In [3]:
sys.path.append('/home/stephan/python/mlm-scoring/src/')
from mlm.scorers import MLMScorer, MLMScorerPT, LMScorer
from mlm.models import get_pretrained
import mxnet as mx
ctxs = [mx.cpu()]

  Optimizer.opt_registry[name].__name__))


In [None]:
model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-en-cased')
scorer = MLMScorer(model, vocab, tokenizer, ctxs)
print(scorer.score_sentences(["Hello world!"]))

In [None]:
scorer.score_sentences(["There are two cat.", "There are two cats."])

In [None]:
test = ["There are two cat.",
        "There are two cats.",
        "Los dos perros son feos.",
        "Los dos perro son feos."]

In [None]:
import mlm.models
mlm.models.SUPPORTED_MLMS

In [None]:
scorer.score_sentences(test)

In [None]:
def get_translation_probs_mlm(target_sentence, scorer, lower=False):
    """Score an English sentence and its machine translations with an MLM scorer.

    The sentence is translated from English into German ('de'), Dutch ('nl')
    and French ('fr') with the notebook-level googletrans ``translator``.
    Each translation, plus the untranslated English original, is scored with
    ``scorer.score_sentences``.

    Parameters
    ----------
    target_sentence : str
        English source sentence.
    scorer : object
        Must provide ``score_sentences(list_of_str)`` returning an indexable
        result with one entry per input sentence (e.g. the ``MLMScorer``
        built in this notebook).
    lower : bool, default False
        If True, every text is lower-cased before scoring (used with the
        uncased model).

    Returns
    -------
    pandas.DataFrame
        One row per language with columns ``language``, ``translation``,
        ``score`` and ``source_text``, plus ``en_score``: the score of the
        English sentence repeated on every row.

    Notes
    -----
    Relies on the global ``translator`` defined in an earlier cell.
    ``en_score`` is taken from the same scoring pass as the 'en' row, i.e.
    after the optional lower-casing, so the two values always agree and the
    English sentence is scored only once (assumes ``score_sentences`` is
    deterministic for a fixed input -- TODO confirm).
    """
    scores = []
    for language in ['de', 'nl', 'fr', 'en']:
        if language == 'en':
            # English is the source language: nothing to translate.
            translated_text = target_sentence
        else:
            translated_text = translator.translate(
                target_sentence, dest=language, src='en').text
        if lower:
            translated_text = translated_text.lower()
        score = scorer.score_sentences([translated_text])[0]
        if language == 'en':
            # Reuse the row's score so en_score reflects the text actually scored.
            en_score = score
        scores.append({'language': language,
                       'translation': translated_text,
                       'score': score,
                       'source_text': target_sentence})

    rdf = pd.DataFrame(scores)
    rdf['en_score'] = en_score
    return rdf

# Lower Case Multilingual Model

In [None]:
model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-multi-uncased')
scorer = MLMScorer(model, vocab, tokenizer, ctxs)
print(scorer.score_sentences(["hello world!"]))

In [None]:
lower_case_sentence_probs = pd.concat([get_translation_probs_mlm(x, scorer, lower=True) for x in sentences]) 

In [None]:
%R -i lower_case_sentence_probs

In [None]:
%%R
# Order the sentences on the x-axis by their mean English score (ascending).
lower_sprobs = aggregate(en_score ~ source_text, lower_case_sentence_probs, mean)
lower_sprobs = lower_sprobs[order(lower_sprobs$en_score),]
# Build the factor from this data frame's own source_text column (not from
# sentence_probs, which belongs to a different analysis and may not be
# defined or row-aligned when this cell runs).
lower_case_sentence_probs$source_text = factor(lower_case_sentence_probs$source_text, levels = lower_sprobs$source_text)

# One point per (sentence, language).
ggplot(lower_case_sentence_probs) + geom_point(aes(x=source_text, y=score, color=language, shape=language)
, size=6 ) + theme_classic() + theme(axis.text.x = element_text(angle = 45, hjust=1,
vjust = 1)) + xlab('Target Sentence (English translation)') + ylab('Log Probability')

# Cased (Upper Case) Multilingual Model

In [None]:
model, vocab, tokenizer = get_pretrained(ctxs, 'bert-base-multi-cased')
scorer = MLMScorer(model, vocab, tokenizer, ctxs)

In [None]:
upper_sentence_probs = pd.concat([get_translation_probs_mlm(x, scorer) for x in sentences]) 

In [None]:
%R -i upper_sentence_probs

In [None]:
%%R
# Order the sentences on the x-axis by their mean English score (ascending).
upper_sprobs = aggregate(en_score ~ source_text, data = upper_sentence_probs, FUN = mean)
upper_sprobs = upper_sprobs[order(upper_sprobs$en_score), ]
upper_sentence_probs$source_text = factor(
    upper_sentence_probs$source_text,
    levels = upper_sprobs$source_text
)

# One point per (sentence, language).
ggplot(upper_sentence_probs) +
    geom_point(
        aes(x = source_text, y = score, color = language, shape = language),
        size = 6
    ) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) +
    xlab('Target Sentence (English translation)') +
    ylab('Log Probability')