In [1]:
import torch

# Smoke test: confirm that PyTorch can see a CUDA device and run work on it.
if not torch.cuda.is_available():
    print("GPU acceleration is not available on this system.")
else:
    # Allocate a small random tensor and move it onto the GPU.
    device = torch.device("cuda")
    x = torch.randn(3, 3).to(device)

    # Do a trivial computation on the GPU, then copy the result back to the CPU.
    y = (x + x).to("cpu")
    print("GPU acceleration is available and working.")

GPU acceleration is available and working.


In [2]:
!pip install transformers
!pip install datasets
!pip install transformers[torch]



In [3]:
import pandas as pd
import json, os
from datasets import Dataset
from transformers import Trainer, TrainingArguments
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [4]:
# Number of magic tokens generated per verse reference (suffixed _a, _b, ...).
num_magic_tokens = 2
# JSON file the verse translations are loaded from.
language_pair_data_file = "../data/amo.json"
# Folder that receives the per-column train_<column>.txt files.
language_pair_staging_folder = "../data/magic_token_folder/"
# Root folder under which each <column>_model folder is saved.
trained_model_folder = "./out"

# Load the JSON file with the Greek, English, and Amo

In [21]:
def load_content_pairs( filename ):
    """Load a verse JSON file into a DataFrame with one row per verse.

    The file must contain a JSON list of verse objects. Each object's "vref"
    value is copied as-is; every other key must map to an object whose
    "content" value becomes the cell for that column.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        pandas.DataFrame with one column per key found in the verse objects.

    Raises:
        KeyError / TypeError: if a non-"vref" entry has no "content" field.
    """
    # Read as UTF-8 explicitly: the content is non-ASCII (Greek/Hebrew) and the
    # platform default encoding is not UTF-8 everywhere.
    with open( filename, "rt", encoding="utf-8" ) as fin:
        verses = json.load(fin)

    processed_verses = [
        {
            entry_key: translation_entry if entry_key == "vref" else translation_entry["content"]
            for entry_key, translation_entry in verse.items()
        }
        for verse in verses
    ]
    return pd.DataFrame( processed_verses )

# Read the configured verse file into a DataFrame (one row per verse).
data = load_content_pairs( language_pair_data_file )

In [22]:
# Show the DataFrame's column labels.
data.keys()

Index(['vref', 'bsb', 'macula', 'target'], dtype='object')

# Modify data before it is written back out.

Split Greek and Hebrew (I know there is more than that; this is what we are doing for now).

In [23]:
# Index label of the first row whose vref is MAT 1:1; rows from here on are
# treated as the Greek portion, rows before it as the Hebrew portion.
starts_at_matthew = data["vref"] == "MAT 1:1"
nt_first_index = data.index[starts_at_matthew][0]
nt_first_index

23145

In [26]:
# Split the combined 'macula' column into 'hebrew' (rows before MAT 1:1) and
# 'greek' (rows from MAT 1:1 onward); the side that does not apply gets "".
#
# The guard makes this cell safe to re-run: once 'macula' has been dropped the
# split has already happened, so a second run is a no-op instead of a KeyError.
if "macula" in data.columns:
    before_nt = data.index < nt_first_index
    data = data.assign(
        # Keep the macula text where the row is before the NT start, else "".
        hebrew=data["macula"].where( before_nt, "" ),
        # Keep the macula text where the row is at/after the NT start, else "".
        greek=data["macula"].where( ~before_nt, "" ),
    ).drop( columns=["macula"] )

For testing, withhold Revelation (REV) from the English version so that we can test projecting into it from the embeddings learned from Amo and Greek.

In [10]:
# Locate the first row whose vref is REV 1:1 ...
starts_at_revelation = data['vref'] == "REV 1:1"
rev_first_index = data.index[starts_at_revelation][0]

# ... and blank out the 'bsb' text for that row and every row after it.
from_revelation_on = data.index >= rev_first_index
data.loc[from_revelation_on, 'bsb'] = ""

In [11]:
# Show the bsb text of the row just before REV 1:1, which was not blanked out.
data.loc[rev_first_index-1, 'bsb']

'to the only God our Savior be glory, majesty, dominion, and authority through Jesus Christ our Lord before all time, and now, and for all eternity. Amen.'

# Process verses into individual input files

In [12]:
# Write one training file per text column (every column except 'vref').
# Each non-empty verse becomes one line: the verse's magic tokens
# ("[<vref>_a][<vref>_b]...") followed by a space and the verse text.
os.makedirs( language_pair_staging_folder, exist_ok=True )
for key in data.keys():
    if key == "vref":
        continue
    staging_path = os.path.join( language_pair_staging_folder, f"train_{key}.txt" )
    # Write UTF-8 explicitly: the text is non-ASCII and the platform default
    # encoding is not UTF-8 everywhere.
    with open( staging_path, "wt", encoding="utf-8" ) as fout:
        for vref, text in zip( data["vref"], data[key] ):
            # Skip missing values as well as blank strings; str(NaN) would
            # otherwise be written out as the literal verse text "nan".
            if pd.isna( text ) or len(str(text).strip()) == 0:
                continue
            magic_prefix = "".join( f"[{vref}_{chr(i + ord('a'))}]" for i in range(num_magic_tokens) )
            fout.write( f"{magic_prefix} {text}\n" )
            

# Modify gpt2 tokenizer and model for each language pair



In [13]:
def add_special_tokens_to_tokenizer( vrefs, tokenizer, magic_tokens_per_vref=None ):
    """Register the per-verse magic tokens on ``tokenizer``.

    For every verse reference, tokens of the form "[<vref>_a]", "[<vref>_b]",
    ... are generated and handed to ``tokenizer.add_special_tokens`` in a
    single call under the "additional_special_tokens" key.

    Args:
        vrefs: Iterable of verse reference strings.
        tokenizer: Object exposing ``add_special_tokens(dict)``.
        magic_tokens_per_vref: How many tokens to generate per verse. When
            None (the default), the notebook-level ``num_magic_tokens``
            setting is used, which matches the previous behavior.

    Returns:
        The list of generated token strings, in vref order.
    """
    if magic_tokens_per_vref is None:
        # Fall back to the notebook-wide configuration value.
        magic_tokens_per_vref = num_magic_tokens

    special_tokens_to_add = [
        f"[{vref}_{chr(i + ord('a'))}]"
        for vref in vrefs
        for i in range(magic_tokens_per_vref)
    ]
    tokenizer.add_special_tokens( {"additional_special_tokens": special_tokens_to_add } )
    return special_tokens_to_add
# Start from the pretrained "gpt2" tokenizer and register the magic tokens
# for every vref in the data.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
added_tokens = add_special_tokens_to_tokenizer( data["vref"], tokenizer )

In [14]:
# Save the extended tokenizer into every per-column model folder
# (<trained_model_folder>/<column>_model); 'vref' is not a text column.
for key in data.keys():
    if key == "vref":
        continue
    model_folder = os.path.join( trained_model_folder, f"{key}_model" )
    tokenizer.save_pretrained( model_folder )

In [15]:
# Load pretrained GPT-2 and resize its token embeddings to the extended
# tokenizer's length so the added magic tokens have embedding rows.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# Save the same resized starting model into every per-column model folder.
for key in data.keys():
    if key == "vref":
        continue
    model_folder = os.path.join( trained_model_folder, f"{key}_model" )
    model.save_pretrained( model_folder )

You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding dimension will be 112461. This might induce some performance reduction as *Tensor Cores* will not be available. For more details about this, or help on choosing the correct value for resizing, refer to this guide: https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc


# Check what the initial state is of one of the magic tokens

In [16]:
import contextlib

def write_help(func, out_file):
    """Write the text that ``help(func)`` prints into the file ``out_file``.

    The file is opened in 'w' mode, so existing content is replaced.
    """
    # Send stdout to the file for the duration of the help() call.
    with open(out_file, 'w') as sink, contextlib.redirect_stdout(sink):
        help(func)

# Dump the help() text for the model object to SS.txt for offline reading.
write_help(model, 'SS.txt')

In [17]:
# Fetch the model's input embedding module.
input_embeddings = model.get_input_embeddings()

In [18]:
# Dump the help() text for the embedding module to input_embeddings.txt.
write_help( input_embeddings, "input_embeddings.txt" )

In [19]:
# Tokenize one magic-token string and show the resulting token ids.
tokens_to_test = list(tokenizer("[GEN 1:1_a]").input_ids)
tokens_to_test

[50257]

In [20]:
# Look up the current embedding row(s) for those token ids and display them.
current_embedding = input_embeddings(torch.LongTensor(tokens_to_test))
current_embedding

tensor([[ 1.8218e-02, -3.3257e-02, -1.7171e-02, -1.7965e-02,  2.0816e-02,
         -2.6498e-02, -2.2455e-02,  3.0211e-02,  1.7868e-02, -5.7020e-03,
          4.1241e-03,  4.5361e-04, -6.3447e-02, -4.5307e-03,  4.5426e-03,
          2.4871e-02,  1.5913e-02, -1.6129e-02,  2.9891e-03, -2.5205e-02,
          6.6512e-03, -4.9822e-03, -5.4929e-03, -2.7331e-02, -4.5498e-02,
          2.2512e-02, -2.6214e-02,  3.4922e-02,  2.7797e-02,  1.3524e-02,
          2.7797e-02,  1.6790e-02,  2.0141e-02, -3.7667e-02,  1.6595e-04,
          8.4818e-03, -1.7257e-02,  1.4155e-02, -7.8517e-03,  9.6883e-03,
          2.0339e-02, -2.4115e-02, -8.4154e-03, -1.5157e-02,  1.8431e-02,
          4.1104e-02, -4.9657e-02,  2.0277e-02,  2.0856e-02, -3.6539e-02,
         -2.6159e-02,  5.0826e-03, -1.6454e-02,  1.7480e-03, -6.6160e-03,
         -2.1845e-02,  2.6541e-02,  1.1458e-02,  5.5602e-03, -2.7316e-02,
         -1.4009e-02,  6.5182e-03, -9.8880e-03,  6.2848e-03, -1.7856e-02,
         -2.9930e-02,  2.2912e-02,  1.