In [None]:
# Load all bibles
# Takes about 3 minutes

from os import listdir
from os.path import isfile, join
from tqdm.notebook import tqdm
from iso639 import Lang

# Directory holding one text file per Bible translation.
path = "/Users/laura/llmResearch/scripture/"

# Keep regular files only (sub-directories are ignored).
files = list(filter(lambda name: isfile(join(path, name)), listdir(path)))
print('Before processing - number of Bibles:', len(files))

# Parallel accumulators filled by the loading loop: entry k of each list
# describes the same Bible (verse lines, ISO 639-3 code, source file name).
bibles = []
languages = []
filepaths = []

# Don't include the languages we want to test on
test_languages = ['bft','bcw','bap','ksr','gbj','acw','kje','kfc','kxv','kwf','lmp','mgz','nxq','rjs','kan','tdd','rro','cja']

# Cache previously seen iso-3 language codes (speeds it up considerably).
# Keys are the file-name prefixes; values are ISO 639-3 language codes.
# The cache is pre-seeded with prefixes that need a manual mapping.
language_codes = {
  'pou': 'poc',
  'sgjj': 'sgj',  # typo
  'in': 'ind',    # best guess - indonesian
  'wra': 'wra',
  'pltA': 'plt',
  'pltB': 'plt',
  'thfL': 'thf',
  'bapL': 'bap',
  'dud': 'dud',
  # These two previously mapped to 'Latn', which is an ISO 15924 *script*
  # code, not a language code; every value in this map must be ISO 639-3.
  'eng': 'eng',
  'deu': 'deu',
}

# Read every file, keep only complete Bibles whose language is not held out
# for testing, and record verse lines / language code / file name in parallel.
for file_name in tqdm(files):
  with open(join(path, file_name), "r", encoding="utf-8") as file:
    # Strip only the line terminator. Slicing off the last character would
    # also eat a real character when the final line has no trailing newline.
    lines = [line.rstrip('\n') for line in file]

  if not any(lines): # skip empty bibles
    continue

  if len(lines) != 41899: # only use full bibles (previous calculations show only about 7% of bibles aren't full)
    continue

  # Find iso-3 language code for each language.
  # The language prefix is everything before the first '-'; partition() leaves
  # the name intact when there is no '-' instead of chopping its last character.
  language = file_name.partition('-')[0]
  if language in language_codes:
    iso_code = language_codes[language]
  else:
    try:
      iso_code = Lang(language).pt3
    except Exception: # can't find language code (but let Ctrl-C through)
      iso_code = ''
    language_codes[language] = iso_code

  if iso_code not in test_languages:
    languages.append(iso_code)
    bibles.append(lines)
    filepaths.append(file_name)

print('After processing - final number of Bibles:',len(bibles))
print('Number of unknown languages:',languages.count('')) # number of unknown language codes

In [None]:
# Only needs to be run if you want to automatically generate script codes for the script_codes dictionary below - otherwise skip
# Before running, run the "Find the script for each Bible" cell below once so that the list not_in_dict exists

import pandas as pd

# Table of script names; the 'Alias' column is matched against detected script
# names and the 'Code' column supplies the code that gets printed.
iso_scripts = pd.read_csv('data/iso_scripts.csv')

# Print one ready-to-paste `script_codes[...] = ...` line per unmapped script.
# Sorted so the output order is the same on every run.
for script in sorted(set(not_in_dict)):
  codes = list(iso_scripts[iso_scripts['Alias']==script]['Code'])
  if not codes:
    # No alias matched: report it instead of crashing with IndexError.
    print("# no code found in iso_scripts.csv for script '" + script + "'")
    continue
  print("script_codes['"+script.upper()+"'] = '" + codes[0] + "'")

In [None]:
# Find the script for each Bible
# Takes about 2.5 minutes

import sys
import os
sys.path.append(os.path.abspath("/Users/laura/silnlp/silnlp/common"))
from script_utils import get_script

#https://en.wikipedia.org/wiki/ISO_15924
# also can generate this automatically using code above
# Maps an upper-cased script name to its four-letter ISO 15924 code.
script_codes = {
  'CYRILLIC': 'Cyrl',
  'LATIN': 'Latn',
  'KANNADA': 'Knda',
  'GUJARATI': 'Gujr',
  'ARABIC': 'Arab',
  'BENGALI': 'Beng',
  'DEVANAGARI': 'Deva',
  'ETHIOPIC': 'Ethi',
  'GEORGIA': 'Geor',  # NOTE(review): looks like a typo of GEORGIAN (also listed); kept so existing lookups still work
  'GREEK': 'Grek',
  'GURMUKHI': 'Guru',
  'HANGUL': 'Hang',
  'HEBREW': 'Hebr',  # was 'Hira', which is the code for Hiragana
  'KHMER': 'Khmr',
  'LAO': 'Laoo',
  'MALAYALAM': 'Mlym',
  'MYANMAR': 'Mymr',
  'ORIYA': 'Orya',
  'SINHALA': 'Sinh',
  'TAMIL': 'Taml',
  'TELUGU': 'Telu',
  'THAI': 'Thai',
  'TIBETAN': 'Tibt',
  'VAI': 'Vaii',
  'TAI_THAM': 'Lana',
  'TIFINAGH': 'Tfng',
  'GEORGIAN': 'Geor',
  'LISU': 'Lisu',
  'HIRAGANA': 'Hira',
  'SYRIAC': 'Syrc',
  'COMMON': 'Zyyy',
  'MONGOLIAN': 'Mong',
  'CANADIAN_ABORIGINAL': 'Cans',
  'KAYAH_LI': 'Kali',
  'LIMBU': 'Limb',
  'HAN': 'Hani',
}

# Build one "<language code>_<script code>" tag per Bible, in the same order as
# `bibles`. Script names missing from script_codes are collected in not_in_dict
# and the Bible is tagged with the placeholder script 'Othr'.
llm_tags = []
not_in_dict = []
for idx in tqdm(range(len(bibles))):
  # The first 10 non-empty verses are the sample used to determine the script.
  sample_verses = [verse_text for verse_text in bibles[idx] if verse_text != ''][:10]
  script = get_script(''.join(sample_verses))
  script_key = script.upper()
  if script_key not in script_codes:
    not_in_dict.append(script)
    llm_tags.append(languages[idx] + '_Othr')
    continue
  llm_tags.append(languages[idx] + '_' + script_codes[script_key])

print('scripts not in dictionary',set(not_in_dict))

In [None]:
# Count up how many versions of each verse there are
# Takes about 22 minutes

# num_verses[v] = number of loaded Bibles whose line v is non-empty.
num_verses = [
  sum(1 for bible in bibles if bible[verse] != '')
  for verse in tqdm(range(41899))
]

In [None]:
# Plot the number of versions of each verse there are

import seaborn as sns
import matplotlib.pyplot as plt

# Explicit figure/axes with labels so the chart is readable on its own;
# plt.show() also keeps the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(num_verses)
ax.set_xlabel('Verse index')
ax.set_ylabel('Number of Bibles containing the verse')
ax.set_title('Available versions per verse')
plt.show()

In [None]:
# If we were to consider every pair of verses in every language, how many pairs of verses would we have?

# A verse available in n versions yields n*(n-1) ordered (source, target) pairs.
total = sum(n * n - n for n in num_verses)
print(total)

In [None]:
# Count how many Bibles are in each script

from collections import Counter

# Tags have the form "<language>_<script>"; tally the script part of each one.
counter = Counter(tag.split('_')[1] for tag in tqdm(llm_tags))

In [None]:
# Display the number of Bibles per script code tallied in `counter`.
counter

In [None]:
# pivot sources calculated in Select_pivot_languages.ipynb
# File names (with extension) of the ten pivot translations.
pivot_sources = [
  stem + '.txt'
  for stem in (
    'en-NIV11R',
    'es-DHHE',
    'ko-RNKSV',
    'fr-LBS21',
    'cmn-CU2010S',
    'ru-NRT23',
    'swh-MFT_2023_11_11',
    'hi-HINOVBSI',
    'npi-SNHB',
    'id-BIMK',
  )
]

# Position of each pivot file in `filepaths` (and therefore in `bibles` and
# `llm_tags`); raises ValueError if a pivot file was not loaded.
pivot_indices = list(map(filepaths.index, pivot_sources))

In [None]:
# Display the positions of the pivot files within `filepaths`.
pivot_indices

In [None]:
# how many pairs of verses are there with ten pivot languages?

# For each verse, count the pivot Bibles that actually contain it (p) and pair
# each of them with every non-pivot Bible that contains it (n - p, where n is
# the total number of versions). The previous formula (n - 10) * 10 assumed all
# ten pivots contain every verse and went negative for verses with n < 10.
pivot_counts = [
  sum(1 for j in pivot_indices if bibles[j][verse] != '')
  for verse in range(len(num_verses))
]
total = sum(p * (n - p) for p, n in zip(pivot_counts, num_verses))
print(total)

In [None]:
# Every Bible position that is not one of the pivot positions, in ascending order.
non_pivot_indices = [idx for idx, _ in enumerate(bibles) if idx not in frozenset(pivot_indices)]

In [None]:
# Write one line per available (verse, Bible) combination in the form
# "<verse index> <llm tag>: <text>". Within each verse the pivot Bibles come
# first, followed by all other Bibles.
# NOTE(review): the file has a .jsonl extension but the lines are plain text,
# not JSON - confirm what the downstream reader expects.
ordered_indices = pivot_indices + non_pivot_indices

# Explicit UTF-8 so non-Latin scripts are written the same way on every
# platform (the input files are read as UTF-8 too).
with open('data/all_scripture_llm_input.jsonl', 'w', encoding='utf-8') as output_file:
  for verse in tqdm(range(41899)):
    for j in ordered_indices:
      verse_text = bibles[j][verse]
      if verse_text == '':
        continue
      # Terminate every record with a newline; joining each verse group without
      # a trailing newline fused the last record of one verse with the first
      # record of the next.
      output_file.write(str(verse) + " " + llm_tags[j] + ": " + verse_text.strip() + "\n")