<a href="https://colab.research.google.com/github/ryderwishart/biblical-machine-learning/blob/main/macula_gpt_insights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Imports

In [74]:
!pip install openai transformers torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [81]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import pandas as pd

## Build Corpus

In [2]:
if 'macula-greek.tsv' not in [path for path in os.listdir()]:
    !wget -q 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/Nestle1904/TSV/macula-greek.tsv'
if 'marble-domain-label-mapping.json' not in [path for path in os.listdir()]:
    !wget -q 'https://raw.githubusercontent.com/Clear-Bible/macula-greek/main/sources/MARBLE/SDBG/marble-domain-label-mapping.json'

In [3]:
# Show the working directory contents to confirm both downloads are present
os.listdir()

['.config',
 'marble-domain-label-mapping.json',
 'macula-greek.tsv',
 'sample_data']

In [27]:
# Import Macula Greek data.
# pandas matches `converters` keys against column names/positions, so the earlier
# `converters={'*': str}` matched no column and was silently ignored (`strong`
# still loaded as int64). The semantic-domain code columns are pinned to `str`
# explicitly instead, so codes such as '033005' always keep their leading zeros.
# Empty cells become the literal string 'missing'.
mg = pd.read_csv(
    'macula-greek.tsv',
    index_col='xml:id',
    sep='\t',
    header=0,
    dtype={'domain': str, 'ln': str},
).fillna('missing')

# Extract book, chapter, and verse into separate columns
# (e.g. 'MAT 1:1!2' -> 'MAT', '1', '1'; the pattern allows one leading digit in the book code)
mg[['book', 'chapter', 'verse']] = mg['ref'].str.extract(r'(\d?[A-Z]+)\s(\d+):(\d+)')

# Add columns for book + chapter, and book + chapter + verse for easier grouping
mg['book_chapter'] = mg['book'] + ' ' + mg['chapter'].astype(str)
mg['book_chapter_verse'] = mg['book_chapter'] + ':' + mg['verse'].astype(str)

# Display the updated data frame
mg.head()

Unnamed: 0_level_0,ref,role,class,type,gloss,text,after,lemma,normalized,strong,...,domain,ln,frame,subjref,referent,book,chapter,verse,book_chapter,book_chapter_verse
xml:id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n40001001001,MAT 1:1!1,missing,noun,common,[The] book,Βίβλος,,βίβλος,Βίβλος,976,...,033005,33.38,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1
n40001001002,MAT 1:1!2,missing,noun,common,of [the] genealogy,γενέσεως,,γένεσις,γενέσεως,1078,...,010002 033003,10.24 33.19,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1
n40001001003,MAT 1:1!3,missing,noun,proper,of Jesus,Ἰησοῦ,,Ἰησοῦς,Ἰησοῦ,2424,...,093001,93.169a,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1
n40001001004,MAT 1:1!4,missing,noun,proper,Christ,Χριστοῦ,,Χριστός,Χριστοῦ,5547,...,093001,93.387,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1
n40001001005,MAT 1:1!5,missing,noun,common,son,υἱοῦ,,υἱός,υἱοῦ,5207,...,010002,10.30,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1


In [28]:
# Check which dtype pandas assigned to each column
mg.dtypes

ref                   object
role                  object
class                 object
type                  object
gloss                 object
text                  object
after                 object
lemma                 object
normalized            object
strong                 int64
morph                 object
person                object
number                object
gender                object
case                  object
tense                 object
voice                 object
mood                  object
degree                object
domain                object
ln                    object
frame                 object
subjref               object
referent              object
book                  object
chapter               object
verse                 object
book_chapter          object
book_chapter_verse    object
dtype: object

In [29]:
# Import domain-label mapping
import json
from itertools import islice

# Name the encoding explicitly so decoding does not depend on the platform
# default (JSON text is conventionally UTF-8).
with open('marble-domain-label-mapping.json', 'r', encoding='utf-8') as f:
    # Load the contents of the file as a dictionary: domain code -> label
    domain_labels = json.load(f)

# Fallback labels for words that carry no domain code
# ('missing' is the fill value used when loading `mg`)
domain_labels['missing'] = 'no domain'
domain_labels['nan'] = 'no domain'

# Display the first seven entries of the resulting dictionary
for d, l in islice(domain_labels.items(), 7):
    print(d, l)

001 Geographical Objects and Features
001001 Universe, Creation
001002 Regions Above the Earth
001003 Regions Below the Surface of the Earth
001004 Heavenly Bodies
001005 Atmospheric Objects
001006 The Earth's Surface


In [30]:
def get_domain_label(domain_string_number):
    """Translate a whitespace-separated string of domain codes into their labels.

    Args:
        domain_string_number: One or more domain codes separated by whitespace,
            e.g. ``'010002 033003'``, or the fill value ``'missing'``.

    Returns:
        A list with one label per code, in the order the codes appear. Codes that
        are absent from ``domain_labels`` map to ``'no domain'``, the same fallback
        label the notebook registers for ``'missing'``.
    """
    # split() without an argument ignores repeated/leading/trailing whitespace,
    # so no empty "code" is ever looked up.
    labels = [domain_labels.get(code, 'no domain') for code in domain_string_number.split()]
    return labels

# Attach the list of human-readable labels for each word's domain code(s)
mg['domain_label'] = mg['domain'].map(get_domain_label)

mg.head()

Unnamed: 0_level_0,ref,role,class,type,gloss,text,after,lemma,normalized,strong,...,ln,frame,subjref,referent,book,chapter,verse,book_chapter,book_chapter_verse,domain_label
xml:id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
n40001001001,MAT 1:1!1,missing,noun,common,[The] book,Βίβλος,,βίβλος,Βίβλος,976,...,33.38,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Written Language]
n40001001002,MAT 1:1!2,missing,noun,common,of [the] genealogy,γενέσεως,,γένεσις,γενέσεως,1078,...,10.24 33.19,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Kinship Relations Involving Successive Genera...
n40001001003,MAT 1:1!3,missing,noun,proper,of Jesus,Ἰησοῦ,,Ἰησοῦς,Ἰησοῦ,2424,...,93.169a,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Persons]
n40001001004,MAT 1:1!4,missing,noun,proper,Christ,Χριστοῦ,,Χριστός,Χριστοῦ,5547,...,93.387,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Persons]
n40001001005,MAT 1:1!5,missing,noun,common,son,υἱοῦ,,υἱός,υἱοῦ,5207,...,10.30,missing,missing,missing,MAT,1,1,MAT 1,MAT 1:1,[Kinship Relations Involving Successive Genera...


Let's learn some things about some of the words. First, we need to connect to the OpenAI API (the service behind ChatGPT) using a secret API key. If you don't have one, you will need to sign up for one. Alternatively, you could paste the same prompts into ChatGPT for free.

In [32]:
import getpass
# Prompt for the key interactively so it is never typed into the notebook source
secret_key = getpass.getpass('Enter OpenAI secret key: ')

Enter OpenAI secret key: ··········


In [33]:
# Publish the key as the OPENAI_API_KEY environment variable
# (presumably read by the `openai` package imported in a later cell; verify)
os.environ['OPENAI_API_KEY'] = secret_key

In [55]:
# Inspect every field of a single word (looked up by its xml:id index label)
mg.loc['n40001001002'].to_dict()

{'ref': 'MAT 1:1!2',
 'role': 'missing',
 'class': 'noun',
 'type': 'common',
 'gloss': 'of [the] genealogy',
 'text': 'γενέσεως',
 'after': ' ',
 'lemma': 'γένεσις',
 'normalized': 'γενέσεως',
 'strong': 1078,
 'morph': 'N-GSF',
 'person': 'missing',
 'number': 'singular',
 'gender': 'feminine',
 'case': 'genitive',
 'tense': 'missing',
 'voice': 'missing',
 'mood': 'missing',
 'degree': 'missing',
 'domain': '010002 033003',
 'ln': '10.24 33.19',
 'frame': 'missing',
 'subjref': 'missing',
 'referent': 'missing',
 'book': 'MAT',
 'chapter': '1',
 'verse': '1',
 'book_chapter': 'MAT 1',
 'book_chapter_verse': 'MAT 1:1',
 'domain_label': ['Kinship Relations Involving Successive Generations',
  'Discourse Types']}

In [54]:
# Peek at the raw domain codes; a word may carry several space-separated codes
mg['domain'].head()

xml:id
n40001001001           033005
n40001001002    010002 033003
n40001001003           093001
n40001001004           093001
n40001001005           010002
Name: domain, dtype: object

In [63]:
import openai

selected_fields = ['domain_label', 'gloss', 'class', 'type']  # columns used to describe a word

def expand(word):
    """Build a one-line description of a single word for the language-model prompt.

    Args:
        word: ``xml:id`` index label of the word's row in ``mg``.

    Returns:
        A tuple ``(prompt, lemma_coreferences)``. ``prompt`` is the word's lemma
        followed by ``"<field> is <value>"`` pairs for every column listed in
        ``selected_fields``, in the row's own column order.
        ``lemma_coreferences`` is currently always an empty list.

    Side effects:
        Prints the generated prompt.
    """
    row = mg.loc[word]
    described = [
        f"{field} is {content}"
        for field, content in row.to_dict().items()
        if field in selected_fields
    ]
    prompt = row['lemma'] + "'s " + ", ".join(described)
    print('Prompt: ', prompt)

    coreferences = []  # placeholder: nothing is collected yet
    return prompt, coreferences



def generate_prompt(expanded_word_prompt):
    """Wrap a word description in the few-shot prompt sent to the completion model.

    The template holds two worked examples followed by the word to explain. Every
    example uses the same ``Word:`` / ``Explanation:`` labels as the final query
    (the first example was previously labelled ``Prompt:``, which broke the pattern).

    Args:
        expanded_word_prompt: Description of the word, inserted after the last ``Word:``.

    Returns:
        The complete prompt string, ending with ``Explanation: `` for the model to continue.
    """
    return '''Explain what this word means based on the supplied values:

Word: "γένεσις, a common noun, '[the] genealogy', ['Kinship Relations Involving Successive Generations', 'Discourse Types']"
Explanation: The lemma γένεσις is a common noun that could be glossed 'genealogy', or 'the genealogy' in this context. Semantically, this word has to do with kinship relations across successive generations, and it refers to a particular kind of discourse or genre (i.e., a genealogical record).
Word: "Ἰησοῦς, a proper noun, 'Jesus', ['Persons']"
Explanation: The lemma Ἰησοῦς is a proper noun that could be glossed 'Jesus', the son of Mary and adopted son of Joseph. This word's semantics indicates a person is in view.
Word: {0}
Explanation: '''.format(expanded_word_prompt)

In [101]:
expanded_word = expand('n40001001001')
input_prompt = expanded_word[0]  # expand() returns (prompt, lemma_coreferences)

# NOTE(review): `openai.Completion` is the pre-1.0 SDK interface and
# "text-davinci-003" is a legacy completions model; confirm both are still
# available before re-running this cell.
completion = openai.Completion.create(
  model="text-davinci-003",
  prompt=generate_prompt(input_prompt),
  temperature=0.6,  # non-zero temperature: the reply can differ between runs
  max_tokens=50,  # short cap, so the reply may stop mid-sentence
  top_p=1,
  frequency_penalty=0.0,
  presence_penalty=0.6,
)

# `text_before_final_period` is defined in its own cell further down; the unused
# duplicate definition that used to sit here was removed.
print(completion.choices[0].text)

Prompt:  βίβλος's class is noun, type is common, gloss is [The] book, domain_label is ['Written Language']

The lemma βίβλος is a common noun that could be glossed 'book', or 'the book' in this context. Semantically, this word refers to a written language object and could refer to a physical book


In [102]:
def text_before_final_period(text):
    """Return ``text`` up to and including its last '.', dropping any trailing content.

    Returns an empty string when ``text`` contains no '.'.
    """
    head, period, _trailing = text.rpartition('.')
    return head + period

In [103]:
# Load the tokenizer and the seq2seq model from the same checkpoint
nllb_checkpoint = "facebook/nllb-200-distilled-600M"
tokenizer = AutoTokenizer.from_pretrained(nllb_checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(nllb_checkpoint)

In [104]:
# Source and target language codes
src_lang = "eng_Latn"
tgt_langs = [
    "deu_Latn",  # German
    "spa_Latn",  # Spanish
    "fra_Latn",  # French
]

# Text to translate: the model's reply, cut after its last full stop
translation_prompt = text_before_final_period(completion.choices[0].text)

# Maps placeholder ids ("0_0", "1_1", "2_2", ...) back to the Greek/Hebrew tokens they stand for
special_id2token = {}

def is_special_token(token):
    '''
    Any token that contains a Greek or Hebrew unicode character is a special token.

    Code-point ranges are checked inclusively and cover:
      - Greek and Coptic   U+0370-U+03FF
      - Greek Extended     U+1F00-U+1FFF (precomposed polytonic letters, e.g. U+1F41,
        so a token made only of such letters is still recognised)
      - Hebrew             U+0590-U+05FF

    Returns a bool; an empty token is not special.
    '''
    special_ranges = ((0x0370, 0x03FF), (0x1F00, 0x1FFF), (0x0590, 0x05FF))
    return any(
        low <= ord(char) <= high
        for char in token
        for low, high in special_ranges
    )

def encode_non_english_tokens(original_prompt):
    """Replace every Greek/Hebrew token in the prompt with a numbered placeholder.

    Tokens are the whitespace-separated pieces of ``original_prompt``. Each
    distinct special token gets the next id ("0_0", "1_1", ...), and all of its
    occurrences are replaced by that placeholder.

    Args:
        original_prompt: Text that may contain Greek or Hebrew tokens.

    Returns:
        The prompt with special tokens replaced by placeholders.

    Side effects:
        Records ``placeholder -> token`` in the module-level ``special_id2token``.
        Ids restart at 0 on every call, so earlier entries with the same id are
        overwritten.
    """
    prompt = original_prompt
    count = 0
    for token in original_prompt.split():
        if not is_special_token(token):
            continue
        # An earlier replacement may already have consumed this token (a repeat,
        # or a token containing an earlier one). Skipping it avoids burning an id
        # on a placeholder that never appears in the text.
        if token not in prompt:
            continue
        placeholder = f"{count}_{count}"
        prompt = prompt.replace(token, placeholder)
        special_id2token[placeholder] = token
        count += 1
    return prompt

# Swap the Greek/Hebrew tokens for placeholders before translating, then show the result
encoded_prompt = encode_non_english_tokens(translation_prompt)
encoded_prompt

"\nThe lemma 0_0 is a common noun that could be glossed 'book', or 'the book' in this context."

In [105]:
# Instantiate each target language translator
def _make_translator(tgt_lang):
    """Return an English -> ``tgt_lang`` translation pipeline that reuses the loaded model and tokenizer."""
    return pipeline('translation', model=model, tokenizer=tokenizer, src_lang="eng_Latn", tgt_lang=tgt_lang)

deu_translator = _make_translator('deu_Latn')
spa_translator = _make_translator('spa_Latn')
fra_translator = _make_translator('fra_Latn')

In [106]:

# Decode the special tokens
def decode_special_tokens(result):
    """Restore the original Greek/Hebrew tokens in a translation result.

    Args:
        result: Pipeline output; the text is read from ``result[0]['translation_text']``.

    Returns:
        The translated text with every placeholder recorded in ``special_id2token``
        replaced by the token it stands for.
    """
    decoded_result = result[0]['translation_text']
    # Longest ids first: "1_1" is a substring of "11_11", so replacing the short
    # id first would corrupt the longer placeholder.
    for placeholder in sorted(special_id2token, key=len, reverse=True):
        decoded_result = decoded_result.replace(placeholder, special_id2token[placeholder])
    return decoded_result

In [107]:
# Pass prompt through the pipelines
deu_result = deu_translator(encoded_prompt)
spa_result = spa_translator(encoded_prompt)
fra_result = fra_translator(encoded_prompt)

# Raw translations; these still contain the placeholder ids
deu_text, spa_text, fra_text = (
    output[0]['translation_text'] for output in (deu_result, spa_result, fra_result)
)

# Print each translation with the original Greek/Hebrew tokens restored
for label, result in (('German: ', deu_result), ('Spanish: ', spa_result), ('French: ', fra_result)):
    print(label, decode_special_tokens(result))

German:  Das Lemma βίβλος ist ein allgemeines Nomen, das in diesem Zusammenhang "Buch" oder "das Buch" genannt werden könnte.
Spanish:  El lema βίβλος es un sustantivo común que podría ser glosado 'libro', o 'el libro' en este contexto.
French:  Le lemma βίβλος est un nom commun qui pourrait être glossé "livre", ou "le livre" dans ce contexte.
