In [None]:
import pandas as pd
import numpy as np
import sys
# Add '../rtsvg/' to the module search path (at position 1) before importing rtsvg.
sys.path.insert(1, '../rtsvg/') # base location of the .py classes
from rtsvg import *
# Single RACETrack instance (presumably supplied by the star import above);
# every later cell reaches the text/BERT helper methods through `rt`.
rt = RACETrack()

In [None]:
#
# Source:  https://en.wikipedia.org/wiki/Apollo_13
#

_text = """Apollo 13 (April 11–17, 1970) was the seventh crewed mission in the Apollo space program and the third meant to land on the Moon. The craft was launched from Kennedy Space Center on April 11, 1970, but the lunar landing was aborted after an oxygen tank in the service module (SM) failed two days into the mission. The crew instead looped around the Moon in a circumlunar trajectory and returned safely to Earth on April 17. The mission was commanded by Jim Lovell, with Jack Swigert as command module (CM) pilot and Fred Haise as Lunar Module (LM) pilot. Swigert was a late replacement for Ken Mattingly, who was grounded after exposure to rubella.
A routine stir of an oxygen tank ignited damaged wire insulation inside it, causing an explosion that vented the contents of both of the SM's oxygen tanks to space. Without oxygen, needed for breathing and for generating electric power, the SM's propulsion and life support systems could not operate. The CM's systems had to be shut down to conserve its remaining resources for reentry, forcing the crew to transfer to the LM as a lifeboat. With the lunar landing canceled, mission controllers worked to bring the crew home alive.
Although the LM was designed to support two men on the lunar surface for two days, Mission Control in Houston improvised new procedures so it could support three men for four days. The crew experienced great hardship, caused by limited power, a chilly and wet cabin and a shortage of potable water. There was a critical need to adapt the CM's cartridges for the carbon dioxide scrubber system to work in the LM; the crew and mission controllers were successful in improvising a solution. The astronauts' peril briefly renewed public interest in the Apollo program; tens of millions watched the splashdown in the South Pacific Ocean on television.
An investigative review board found fault with preflight testing of the oxygen tank and Teflon being placed inside it. The board recommended changes, including minimizing the use of potentially combustible items inside the tank; this was done for Apollo 14. The story of Apollo 13 has been dramatized several times, most notably in the 1995 film Apollo 13 based on Lost Moon, the 1994 memoir co-authored by Lovell – and an episode of the 1998 miniseries From the Earth to the Moon."""

# Epoch count shared by every training call in this notebook.
_epochs = 100

# Sentence handed to the per-word probability helper once training is done.
_sample = 'Apollo 13 was the seventh crewed mission in the Apollo space program.'

# Train on the Apollo 13 excerpt with mask_perc=0.15; the helper hands back a
# (model, tokenizer, device) triple that later cells reuse by these names.
model15, tokenizer15, device15 = rt.__textTrainBertModel__(
    _text,
    mask_perc=0.15,
    epochs=_epochs,
)

# Bare final expression: the helper's return value becomes the cell output.
rt.__textBertWordProbabilities__(_sample, model15, tokenizer15, device15)

In [None]:
# Train two more models that differ only in the value passed as mask_perc.
# Each banner is printed immediately before its training call starts.
_trained_by_rate = {}
for _banner, _rate in (('0.50 Mask Rate', 0.50), ('0.85 Mask Rate', 0.85)):
    print(_banner)
    _trained_by_rate[_rate] = rt.__textTrainBertModel__(_text, mask_perc=_rate, epochs=_epochs)

# Publish the results under the names the later cells expect.
model50, tokenizer50, device50 = _trained_by_rate[0.50]
model85, tokenizer85, device85 = _trained_by_rate[0.85]

In [None]:
# Sentence evaluated by every model in this cell.
_sample = 'Apollo 13 was the seventh crewed mission in the Apollo space program.'

# NOTE(review): TFBertForMaskedLM, AdamW and `tf` are not referenced in the cells
# visible here -- confirm whether these imports are still needed.
from transformers import BertTokenizer, BertForMaskedLM, TFBertForMaskedLM, AdamW
import tensorflow as tf
import torch

# Baseline ("default") model: the stock 'bert-base-cased' checkpoint loaded as-is,
# placed on the GPU when CUDA is available and on the CPU otherwise.
device_def    = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer_def = BertTokenizer.  from_pretrained('bert-base-cased') # 'bert-base-cased' vs 'bert-large-cased'
model_def     = BertForMaskedLM.from_pretrained('bert-base-cased') # 'bert-base-cased' vs 'bert-large-cased'
model_def.to(device_def)

# One (model, tokenizer, device) triple per contender.  The list order fixes the
# column order of the table printed below: 0.15, 0.50, 0.85 mask rate, default BERT.
_models = [
    (model15,   tokenizer15,   device15),
    (model50,   tokenizer50,   device50),
    (model85,   tokenizer85,   device85),
    (model_def, tokenizer_def, device_def),
]

# Lookups keyed by the triple itself, filled from the first three values the helper returns.
pred_lu, prob_lu, toks_lu = {}, {}, {}
for _model in _models:
    _mdl, _tok, _dev = _model
    _helper_out = rt.__textBertWordProbabilities__(_sample, _mdl, _tok, _dev)
    pred_lu[_model], prob_lu[_model], toks_lu[_model], _predictions, _token_inputs = _helper_out

# One row per token of the first model's tokenization: the quoted token padded to
# 20 characters, followed by each model's prediction padded to 16 characters.
toks = toks_lu[_models[0]]
for i, x in enumerate(toks):
    s = f'"{x}"'
    o = f'{s:20}' + ''.join(f'{pred_lu[_model][i]:16}' for _model in _models)
    print(o)

In [None]:
# Candidate sentences, each hiding a different word behind [MASK].
_mask_samples = [
    'Apollo [MASK] was the seventh crewed mission in the Apollo space program.',
    'Apollo 13 was the seventh crewed mission in the [MASK] space program.',
    '[MASK] 13 was the seventh crewed mission in the Apollo space program.',
    'Apollo 13 was the [MASK] crewed mission in the Apollo space program.',
]
# Only the final candidate is evaluated; select another entry to probe a different word.
_sample = _mask_samples[-1]

# Request 10 predictions from every model and print each result on its own line.
for _model in _models:
    _mdl, _tok, _dev = _model
    _results = rt.__textBertTopKPredictions__(_sample, 10, _mdl, _tok, _dev)
    print(_results)

In [None]:
# Mask each word of one sentence in turn and record, per model, where the true
# word ranks among the model's predictions.  Lower totals are better.
#
# The sentence lives in its own variable so that the training corpus held in
# `_text` (read by the training cells) is not overwritten when this cell runs.
_sentence     = 'Apollo 13 was the seventh crewed mission in the Apollo space program.'
_parts        = _sentence.split(' ')
_top_k        = 10   # number of predictions requested for each masked position
_miss_penalty = 20   # added to a model's total when the true word is not predicted
_score_sum    = {_model: 0 for _model in _models}

for i, _word in enumerate(_parts):
    # Rebuild the sentence with word i replaced by [MASK]; strip() removes the
    # stray space left when the first or last word is the one being masked.
    _sample = ' '.join(_parts[:i]) + ' [MASK] ' + ' '.join(_parts[(i+1):])
    _sample = _sample.strip()
    # Masking the last word also removes the sentence's period -- put it back.
    if not _sample.endswith('.'):
        _sample += '.'
    _output = f'"{_sample}"'
    _output = f'{_output:90}'
    for _model in _models:
        _results = rt.__textBertTopKPredictions__(_sample, _top_k, _model[0], _model[1], _model[2])
        _answers = _results.split(' ')
        # 0-based position of the first prediction equal to the true word; the
        # "+ '.'" variant lets a prediction match the final word "program.".
        _rank = next((j for j, _answer in enumerate(_answers)
                      if _answer == _word or (_answer + '.') == _word), None)
        if _rank is None:
            _output += 'x'
            _score_sum[_model] += _miss_penalty
        else:
            _output += str(_rank)
            _score_sum[_model] += _rank
        _output += ' | '
    print(_output)

# Totals, one per line, in the same order as `_models`.
print()
for _model in _models:
    print(_score_sum[_model])

In [None]:
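# Recorded output of the masked-word ranking cell: base model "bert-base-cased", 100 epochs.
# Columns follow `_models`: 0.15 mask | 0.50 mask | 0.85 mask | default BERT.
# Each entry is the 0-based rank of the true word among the 10 predictions; "x" = not found (scored as 20).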
_epochs_100_ = '''
"[MASK] 13 was the seventh crewed mission in the Apollo space program."                   x | 6 | 1 | 0 | 
"Apollo [MASK] was the seventh crewed mission in the Apollo space program."               2 | 0 | 2 | 3 | 
"Apollo 13 [MASK] the seventh crewed mission in the Apollo space program."                2 | 2 | 0 | 0 | 
"Apollo 13 was [MASK] seventh crewed mission in the Apollo space program."                2 | 0 | 0 | 0 | 
"Apollo 13 was the [MASK] crewed mission in the Apollo space program."                    1 | 1 | 0 | 7 | 
"Apollo 13 was the seventh [MASK] mission in the Apollo space program."                   x | x | x | x | 
"Apollo 13 was the seventh crewed [MASK] in the Apollo space program."                    1 | 2 | 0 | 0 | 
"Apollo 13 was the seventh crewed mission [MASK] the Apollo space program."               7 | 3 | 1 | 1 | 
"Apollo 13 was the seventh crewed mission in [MASK] Apollo space program."                1 | 1 | 0 | 0 | 
"Apollo 13 was the seventh crewed mission in the [MASK] space program."                   2 | 1 | 0 | 0 | 
"Apollo 13 was the seventh crewed mission in the Apollo [MASK] program."                  1 | 1 | x | x | 
"Apollo 13 was the seventh crewed mission in the Apollo space [MASK]."                    1 | x | 0 | 0 | 

Run 1                                   Run 2
60  // 0.15 Masking                     113
57  // 0.50 Masking                     75
44  // 0.85 Masking <=== BEST SCORE     72
51  // Default BERT                     51
'''

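# Recorded output of the masked-word ranking cell: base model "bert-base-cased", 200 epochs.
# Columns follow `_models`: 0.15 mask | 0.50 mask | 0.85 mask | default BERT.
# Each entry is the 0-based rank of the true word among the 10 predictions; "x" = not found (scored as 20).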
_epochs_200_ = '''
"[MASK] 13 was the seventh crewed mission in the Apollo space program."                   x | x | 2 | 0 | 
"Apollo [MASK] was the seventh crewed mission in the Apollo space program."               2 | x | 0 | 3 | 
"Apollo 13 [MASK] the seventh crewed mission in the Apollo space program."                6 | 1 | 0 | 0 | 
"Apollo 13 was [MASK] seventh crewed mission in the Apollo space program."                5 | 8 | 0 | 0 | 
"Apollo 13 was the [MASK] crewed mission in the Apollo space program."                    1 | 1 | 0 | 7 | 
"Apollo 13 was the seventh [MASK] mission in the Apollo space program."                   x | x | x | x | 
"Apollo 13 was the seventh crewed [MASK] in the Apollo space program."                    3 | 1 | 7 | 0 | 
"Apollo 13 was the seventh crewed mission [MASK] the Apollo space program."               9 | 6 | 4 | 1 | 
"Apollo 13 was the seventh crewed mission in [MASK] Apollo space program."                5 | 2 | 0 | 0 | 
"Apollo 13 was the seventh crewed mission in the [MASK] space program."                   2 | 1 | 0 | 0 | 
"Apollo 13 was the seventh crewed mission in the Apollo [MASK] program."                  x | x | x | x | 
"Apollo 13 was the seventh crewed mission in the Apollo space [MASK]."                    x | x | 0 | 0 | 

113   // 0.15 Masking
120   // 0.50 Masking
53    // 0.85 Masking
51    // Default BERT <=== BEST SCORE
'''

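# Recorded output of the masked-word ranking cell: base model "bert-large-cased", 100 epochs.
# Only two columns here: 0.15 mask | default BERT.
# Each entry is the 0-based rank of the true word among the 10 predictions; "x" = not found (scored as 20).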
_epochs_100_LARGE_ = '''
"[MASK] 13 was the seventh crewed mission in the Apollo space program."                   1 | 0 | 
"Apollo [MASK] was the seventh crewed mission in the Apollo space program."               4 | 6 | 
"Apollo 13 [MASK] the seventh crewed mission in the Apollo space program."                1 | 0 | 
"Apollo 13 was [MASK] seventh crewed mission in the Apollo space program."                2 | 0 | 
"Apollo 13 was the [MASK] crewed mission in the Apollo space program."                    0 | 2 | 
"Apollo 13 was the seventh [MASK] mission in the Apollo space program."                   x | x | 
"Apollo 13 was the seventh crewed [MASK] in the Apollo space program."                    0 | 1 | 
"Apollo 13 was the seventh crewed mission [MASK] the Apollo space program."               2 | 1 | 
"Apollo 13 was the seventh crewed mission in [MASK] Apollo space program."                2 | 0 | 
"Apollo 13 was the seventh crewed mission in the [MASK] space program."                   1 | 0 | 
"Apollo 13 was the seventh crewed mission in the Apollo [MASK] program."                  5 | 0 | 
"Apollo 13 was the seventh crewed mission in the Apollo space [MASK]."                    0 | 0 | 

38    // 0.15 Masking
30    // Default BERT <=== BEST SCORE
'''


In [None]:
from transformers import RobertaTokenizer, RobertaModel, RobertaForMaskedLM
roberta_tokenizer_def = RobertaTokenizer.from_pretrained('roberta-base')
# Load the checkpoint with its masked-LM head, matching the BertForMaskedLM
# models passed to the same helper elsewhere in this notebook.  The bare
# RobertaModel encoder returns hidden states only, with no per-vocabulary-word
# scores from which word probabilities could be read.
roberta_model_def     = RobertaForMaskedLM.from_pretrained('roberta-base')
roberta_model_def.to(device_def)
# NOTE(review): the helper's internals are not visible here; confirm it does not
# assume BERT-specific special tokens when given a RoBERTa tokenizer.
_sample = 'Apollo 13 was the seventh crewed mission in the Apollo space program.'
_pred,_prob,_toks,_preds,_token_inputs = rt.__textBertWordProbabilities__(_sample, roberta_model_def, roberta_tokenizer_def, device_def)
# Bare final expression: the second returned value is shown as the cell output.
_prob

In [None]:
# Same sentence as the previous cell but with the mission number changed to 11,
# run through the RoBERTa model and tokenizer loaded there.
_sample = 'Apollo 11 was the seventh crewed mission in the Apollo space program.'
_roberta_out = rt.__textBertWordProbabilities__(
    _sample,
    roberta_model_def,
    roberta_tokenizer_def,
    device_def,
)
_pred, _prob, _toks, _preds, _token_inputs = _roberta_out
# Bare final expression: the second returned value is shown as the cell output.
_prob