In [35]:
import os, pickle, torch
import ipywidgets as widgets
from ipywidgets import Dropdown
import pandas as pd
import numpy as np
import re
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from transformers import PreTrainedTokenizerFast, GPT2LMHeadModel, GPT2TokenizerFast, GPT2Tokenizer
import torch.nn as nn


In [36]:
trained_model_folder = "./out"
language_pair_staging_folder = "../data/magic_token_folder/"

use_side_frozen_tokens = True
if use_side_frozen_tokens:
    num_magic_tokens = 1
    trained_embeddings_pickle_file = os.path.join( language_pair_staging_folder, f"magic_tokens_size_{num_magic_tokens}.pickle" )

In [37]:
if os.path.exists(trained_model_folder) and os.path.isdir(trained_model_folder):
    # Get a list of all subfolders in language_pair_staging_folder
    subfolders = [folder for folder in os.listdir(trained_model_folder) if os.path.isdir(os.path.join(trained_model_folder, folder))]

    # Filter subfolders that end with "_model"
    model_folders = [folder for folder in subfolders if folder.endswith("_model") or folder.endswith( "_model_step" ) ]

    # Print or use the list of model folders
    print("Folders ending with '_model':", model_folders)
else:
    print(f"The folder '{trained_model_folder}' does not exist or is not a directory.")

Folders ending with '_model': ['hebrew_model_step', 'bsb_model_step', 'greek_model', 'greek_model_step', 'target_model_step', 'bsb_model', 'target_model', 'hebrew_model']


# Inference
copying code from https://www.kaggle.com/code/changyeop/how-to-fine-tune-gpt-2-for-beginners/notebook#Step-3.-Inference

In [38]:
def load_model(model_path):
    model = GPT2LMHeadModel.from_pretrained(model_path)
    return model


def load_tokenizer(tokenizer_path):
    tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_path)
    return tokenizer

if use_side_frozen_tokens:
    learned_magic_tokens = {}
    if os.path.exists(trained_embeddings_pickle_file):
        with open(trained_embeddings_pickle_file, 'rb') as file:
            learned_magic_tokens = pickle.load(file)
            print( f"Loaded previouse training from {trained_embeddings_pickle_file}" )
    else:
        print( f"Previouse training not found at {trained_embeddings_pickle_file}" )
    
    class MagicTokensEmbeddingPatch(nn.Module):
        def __init__(self, tokenizer, other_embeddings ):
            super().__init__()
            self.tokenizer = tokenizer
            self.other_embeddings = other_embeddings
    
        def forward( self, x ):
            #print( f"Called forward with x {x}" )
            batch_result_list = []
            for batch in x:
                #print( f"batch is {batch}" )
                inputs_embeds_list = []
                for id in batch:
                    #print( f"id is {id}" )
                    token_string = self.tokenizer.decode( [id] )
                    #print( f"token_string is {token_string}" )
                    if token_string.startswith( '[' ) and token_string.endswith( ']' ):
                        if token_string in learned_magic_tokens:
                            inputs_embeds_list.append( learned_magic_tokens[token_string].detach() )
                        else:
                            print( f"Don't have embedding for {token_string}" )
                            
                    else:
                        inputs_embeds_list.append( self.other_embeddings( torch.LongTensor([id]).to(x.device) )[0] )
                        
                inputs_embeds = torch.stack(inputs_embeds_list, dim=0)
                batch_result_list.append(inputs_embeds)
            return torch.stack(batch_result_list, dim=0)

def generate_text(sequence, max_length, model_path, top_k=1, tokenizer=None ): #top_k was 50
    model = load_model(model_path)

    if tokenizer is None:
        tokenizer = load_tokenizer(model_path)
    if use_side_frozen_tokens:
        model.set_input_embeddings( MagicTokensEmbeddingPatch(tokenizer, model.get_input_embeddings()) )
    
    ids = tokenizer.encode(f'{sequence}', return_tensors='pt')
    print( f"Sequence ids {ids}" )
    final_outputs = model.generate(
        ids,
        do_sample=True,
        max_length=max_length,
        pad_token_id=model.config.eos_token_id,
        top_k=top_k,
        top_p=0.95,
    )
    print(tokenizer.decode(final_outputs[0], skip_special_tokens=False))



Loaded previouse training from ../data/magic_token_folder/magic_tokens_size_1.pickle


In [39]:
selected_model_dropdown = Dropdown(options=model_folders)
print( "Select which model to test" )
display(selected_model_dropdown)

Select which model to test


Dropdown(options=('hebrew_model_step', 'bsb_model_step', 'greek_model', 'greek_model_step', 'target_model_step…

In [40]:
tokenizer = load_tokenizer( os.path.join( trained_model_folder, selected_model_dropdown.value ) )

In [41]:
tokenizer.decode( [50257] )

'[GEN 1:1_a]'

In [43]:
#generate_text( "[GEN 1:1_a]", 100, os.path.join( trained_model_folder, selected_model_dropdown.value ), tokenizer=tokenizer )
generate_text( "[GEN 1:2_a]", 100, "gpt2", tokenizer=tokenizer )

Sequence ids tensor([[50259]])
[GEN 1:2_a]  Now the earth was formless and void, and darkness was over the earth. The earth was filled with darkness and darkness was over the earth. The earth was filled with darkness and darkness was over the earth. The earth was filled with darkness and darkness was over the earth. The earth was filled with darkness and darkness was over the earth. The earth was filled with darkness and darkness was over the earth. The earth was filled with darkness and darkness was over the earth. The earth was filled with


In [28]:
generate_text( "[MAT 1:1_a][MAT 1:1_b]", 100, os.path.join( trained_model_folder, selected_model_dropdown.value ) )

Sequence ids tensor([[96547, 96548]])
[MAT 1:1_a] [MAT 1:1_b] He also sent his daughter, a beautiful virgin from Jerusalem, to the city where the king had sent him. She was twenty-three years old, and the daughter of King Darius, the daughter of Zadok the Jezreelite. She stayed there with the servants of King Darius, the master of the temple. When Queen Esther saw that his daughter was with the king, she was greatly alarmed. And the maidservant who stayed with the king said to the king,


In [13]:
generate_text( "[MAT 1:1_a][MAT 1:1_b]", 100, os.path.join( trained_model_folder, selected_model_dropdown.value ) )

Sequence ids tensor([[96547, 96548]])
[MAT 1:1_a] [MAT 1:1_b] The Spirit of truth resides in Christ, which is to say nothing of falsehood. Who then is able to deliver the souls of wicked men? Let him who is rich be commended by God and commended by the saints. [GAL 6:5_a] [ACT 13:13_b] By this we understand that we are justified through Christ, that we are He who has been weighed out of the furnace, not in terms of flesh and blood, but with Christ, the power of life. [ROM 8:12_a] [ROM 7:7_b] By grace you have been strengthened in the


In [18]:
generate_text( "[LUK 1:1_a][LUK 1:1_b]", 500, os.path.join( trained_model_folder, selected_model_dropdown.value ) )

Sequence ids tensor([[100045, 100046]])
[LUK 1:1_a] [LUK 1:1_b] So Gideon sent messengers to the king of Israel at Ramoth-gilead: “We have found favor in your sight, for your servant is a man of valor from Kadesh-barnea.” “Go and look for him,” Elisha replied. [1SA 30:11_a] [NEH 6:11_b] So Gideon went to meet Elisha, who asked, “How long until the one I seek?” “In the past,” Gideon replied. [JHN 7:48_a] [JDG 19:17_b] “I have not had anyone like him present when I fought Aram-naharaim. What can I say?” Elisha asked. “That is enough,” replied Elisha. [GEN 19:28_a] [1SA 3:6_b] Then Gideon replied, “Your servant did not intend to fight, but to bring down the kingdom of Israel. What you are seeing is enough, for I have defeated them all.” [ISA 39:5_a] [2KI 8:14_b] So the king of Israel put him to the sword and said, “Let me see Gideon again tomorrow and take him down.” But Elisha said, “Later, if you will return, then tomorrow I will return and blow him down.” [JDG 10:11_a] [EST 7:1_b] So G

In [19]:
generate_text( "[LUK 1:1_a][LUK 1:1_b]", 500, os.path.join( trained_model_folder, selected_model_dropdown.value ) )

Sequence ids tensor([[100045, 100046]])
[LUK 1:1_a] [LUK 1:1_b] They shall not drink the cup offered by anyone during their Sabbaths, nor may their children drink it. [REV 20:4_a] [REV 2:9_b] In the Year of Jubilee, the priest must make atonement on behalf of the people for any of their transgressions. [REV 21:21_a] [REV 2:9_b] They are to take a large amount of money and give it to their relatives to be used for grain offerings or burnt offerings, as the LORD their God has commanded.’” [REV 2:18_a] [REV 21:27_b] On that same day the LORD said to Moses, [REV 9:5_b] [REV 20:3_b] “This is what the LORD of Hosts, the God of Israel, says: I will put an end to all indecency, both young and old, in the land of Egypt, because of the indecency of the daughters who were brought up with Me, and they were unfaithful to Me in all the abominations I had committed there. [REV 15:2_a] [REV 19:10_a] So I will make an end of them as the LORD has commanded; the Egyptians who were born in Egypt will come

In [23]:
generate_text( "For God so loved the world", 200, os.path.join( trained_model_folder, selected_model_dropdown.value ) )

Sequence ids tensor([[1890, 1793,  523, 6151,  262,  995]])
For God so loved the worldὁ  δὲ  ἀρχιερεὺς  καὶ  οἰκοδεσπότερον  καὶ  μόνον  ἀπὸ  τῆς  σκεύης  τοῦ  Θεοῦ. Ἰουδαίους  γὰρ  καὶ  οἱ  Ἰουδαίους  καὶ  οὐκ  ἠκολούθεια  καὶ  οἀκούθει
