In [None]:
from os import listdir
from os.path import isfile, join
from tqdm.notebook import tqdm
from iso639 import Lang

import sys
import os
sys.path.append(os.path.abspath("/Users/laura/silnlp/silnlp/common")) # path to silnlp common folder
from script_utils import get_script # use get_script function from silnlp

path = "/Users/laura/llmResearch/scripture/" # path to all Scripture data (downloaded from s3 bucket); keep the trailing "/" - file names are appended with `path + file_name`
# listdir() returns names in arbitrary order. Sorting makes the position of each
# Bible in the lists built from `files` identical from run to run.
files = sorted(f for f in listdir(path) if isfile(join(path, f)))
print('Before processing - number of Bibles:',len(files))

In [None]:
# Load all bibles, assign language code to each Bible
# Takes about 3 minutes

NUM_CANON_VERSES = 31170 # lines kept per Bible; everything after this is discarded as apocrypha

# Parallel lists: position k in each list describes the same Bible
bibles = []
languages = []
filepaths = []
num_verses = []
scripts = []
llm_tags = []

# Don't include the languages we want to test on
test_languages = ['bft','bcw','bap','ksr','gbj','acw','kje','kfc','kxv','kwf','lmp','mgz','nxq','rjs','kan','tdd','rro','cja']

# Cache previously seen iso-3 language codes (speeds it up considerably)
language_codes = {}
# Exceptions
language_codes['pou'] = 'poc'
language_codes['sgjj'] = 'sgj' # typo
language_codes['in'] = 'ind' # best guess - indonesian
language_codes['wra'] = 'wra'
language_codes['pltA'] = 'plt'
language_codes['pltB'] = 'plt'
language_codes['thfL'] = 'thf'
language_codes['bapL'] = 'bap'
language_codes['dud'] = 'dud'

for file_name in tqdm(files):
  with open(path + file_name,"r",encoding="utf-8") as file:
    # Strip only the line terminator. Slicing with [:-1] would also cut the last
    # character off a final line that has no trailing newline.
    lines = [line.rstrip('\n') for line in file]

  # '' and '...' both count as "no verse on this line"
  if not any(line != '' and line != '...' for line in lines): # skip empty bibles
    continue

  if len(lines) < NUM_CANON_VERSES: # only use full bibles (previous calculations show only about 7% of bibles aren't full)
    continue

  lines = lines[:NUM_CANON_VERSES] # get rid of apocrypha

  # Find iso-3 language code for each language
  # The prefix before the first '-' is the language. For a name without '-', find()
  # returns -1 and the slice only drops the last character; such names (e.g. vref.txt)
  # presumably fail the lookup below and are recorded with the unknown code ''.
  language = file_name[:file_name.find('-')]
  if language in language_codes:
    iso_code = language_codes[language]
  else:
    try:
      iso_code = Lang(language).pt3
    except Exception: # can't find language code (not a bare except, so a kernel interrupt still stops the loop)
      iso_code = ''
    language_codes[language] = iso_code

  if iso_code not in test_languages:
    languages.append(iso_code)
    bibles.append(lines)
    filepaths.append(file_name)
    num_verses.append(sum(1 for line in lines if line != '' and line != '...'))

print('After processing - final number of Bibles:',len(bibles))
print('Number of unknown languages:',languages.count('')) # number of unknown language codes

In [None]:
# Put all data in a dataframe to make it easier to sort / manipulate

import pandas as pd

# One row per loaded file, built in a single step from the parallel lists
df = pd.DataFrame({
    'language': languages,
    'filepath': filepaths,
    'num_verses': num_verses,
    'bibles_index': range(len(bibles)), # index into bibles list
})

In [None]:
# vref.txt was loaded like every other file; find its slot in `bibles` through the frame
vref_slot = df.loc[df.filepath=='vref.txt', 'bibles_index'].tolist()[0]
vref = bibles[vref_slot]
vref_noVerses = [reference.partition(':')[0] for reference in vref] # just leave the book and chapter

In [None]:
# Files in the data folder that are not real Bible translations
non_bible_files = [
    'vref.txt', # just verse names
    'sux-TEST.txt',
    'ms-MultiCCAligned_id_ms.clean.100K.txt',
    'id-MultiCCAligned_id_ms.clean.100K.txt',
]
df = df.drop(index=df.index[df.filepath.isin(non_bible_files)])

In [None]:
# Keep correct English pivot translation - remove other English translations
is_english = df.language=='eng'
keep_index = df.index[is_english & (df.filepath=="en-NIV11R.txt")]
# every English row label except the pivot's
remove_indices = [label for label in df.index[is_english] if label != keep_index[0]]
df = df.drop(index=remove_indices)

In [None]:
# Drop unknown languages (rows whose language code is the empty string)
unknown_rows = df.index[df.language==""]
df = df.drop(index=unknown_rows)

In [None]:
# Choose one Bible translation per language (the one with the most verses)
print('Before',len(df))
# Largest verse count first, so keep='first' retains the fullest Bible of each language
df = (
    df.sort_values('num_verses',ascending=False)
      .drop_duplicates(subset='language',keep='first')
)
print('After',len(df))

In [None]:
# Only need to run if you want to automatically generate script codes for below - otherwise skip
# Before running, need to have run previous code to get a list of scripts not_in_dict

import pandas as pd

# `not_in_dict` is created by the script-detection cell further down. On a fresh
# kernel (e.g. Restart & Run All) it does not exist yet, so skip instead of raising NameError.
try:
  pending_scripts = sorted(set(not_in_dict))
except NameError:
  pending_scripts = []

if not pending_scripts:
  print('No unmapped scripts recorded - run the script-detection cell first if you have not')
else:
  iso_scripts = pd.read_csv('data/iso_scripts.csv')
  for script in pending_scripts:
    codes = list(iso_scripts[iso_scripts['Alias']==script]['Code'])
    if codes:
      # Prints a line that can be pasted into the script_codes table
      print("script_codes['"+script.upper()+"'] = '" + codes[0] + "'")
    else:
      # Alias missing from the CSV: report it instead of failing on [0]
      print("# no 'Alias' row in data/iso_scripts.csv for", script)

In [None]:
# Find script for each Bible we're keeping
# reset_index() renumbers the rows 0..n-1 and keeps the old row labels in a new 'index' column
df = df.reset_index()

# Filled row by row by the loop at the end of this cell
scripts, llm_tags, not_in_dict = [], [], []

#https://en.wikipedia.org/wiki/ISO_15924
# also can generate this automatically using code above
# Maps an upper-cased script name to its four-letter ISO 15924 code
script_codes = {
    'CYRILLIC': 'Cyrl',
    'LATIN': 'Latn',
    'KANNADA': 'Knda',
    'GUJARATI': 'Gujr',
    'ARABIC': 'Arab',
    'BENGALI': 'Beng',
    'DEVANAGARI': 'Deva',
    'ETHIOPIC': 'Ethi',
    'GEORGIA': 'Geor', # kept alongside 'GEORGIAN' below so existing lookups keep working
    'GREEK': 'Grek',
    'GURMUKHI': 'Guru',
    'HANGUL': 'Hang',
    'HEBREW': 'Hebr', # was 'Hira', which is the code for Hiragana
    'KHMER': 'Khmr',
    'LAO': 'Laoo',
    'MALAYALAM': 'Mlym',
    'MYANMAR': 'Mymr',
    'ORIYA': 'Orya',
    'SINHALA': 'Sinh',
    'TAMIL': 'Taml',
    'TELUGU': 'Telu',
    'THAI': 'Thai',
    'TIBETAN': 'Tibt',
    'VAI': 'Vaii',
    'TAI_THAM': 'Lana',
    'TIFINAGH': 'Tfng',
    'GEORGIAN': 'Geor',
    'LISU': 'Lisu',
    'HIRAGANA': 'Hira',
    'SYRIAC': 'Syrc',
    'COMMON': 'Zyyy',
    'MONGOLIAN': 'Mong',
    'CANADIAN_ABORIGINAL': 'Cans',
    'KAYAH_LI': 'Kali',
    'LIMBU': 'Limb',
    'HAN': 'Hani',
}

# For every kept Bible: detect its script and build the tag '<language>_<script code>'
for index, row in df.iterrows():
  # Sample only real verse text. '' is a missing verse and '...' is counted as a
  # non-verse line by the loading cell, so neither says anything about the script.
  verse_texts = [k for k in bibles[row['bibles_index']] if k != '' and k != '...']
  script = get_script(''.join(verse_texts[:10])) # look at 10 verses to determine script
  scripts.append(script.capitalize())

  script_code = script_codes.get(script.upper())
  if script_code is None:
    # Unmapped script: remember it for the report below and use the fallback code
    not_in_dict.append(script)
    script_code = 'Othr'
  llm_tags.append(row['language'] + '_' + script_code)

print('scripts not in dictionary',set(not_in_dict))
df['script'] = scripts
df['llm_tag'] = llm_tags

In [None]:
# Verse count of the English pivot Bible (first row whose language is 'eng')
english_verses = df.loc[df.language=='eng', 'num_verses'].tolist()[0]

In [None]:
# full bibles: at least 80% of English NIV
min_verses = 0.8*english_verses
full_bibles = df[df.num_verses >= min_verses]

In [None]:
# Save full Bibles so we don't need to re-run everything to this point
# The row labels are written too (no index=False); the read-back cell drops them again as "Unnamed: 0"
full_bibles.to_csv("data/full_bibles.csv")

In [None]:
# Read in full Bibles - if you previously saved them and are picking up here
import pandas as pd

# Only the empty string is read as missing. With pandas' default NA list the
# ISO 639-3 language code 'nan' (Min Nan Chinese) would be parsed as a missing value.
full_bibles = pd.read_csv("data/full_bibles.csv", keep_default_na=False, na_values=[""])
full_bibles = full_bibles.drop(columns=["Unnamed: 0","index"])

path = "/Users/laura/llmResearch/scripture/" # path to all Scripture data (downloaded from s3 bucket)

def _read_verse_lines(file_name):
  """Return the first 31170 lines of `path + file_name`, without their newlines.

  Only the line terminator is stripped; slicing with [:-1] would also cut the
  last character off a final line that has no trailing newline.
  """
  with open(path + file_name,"r",encoding="utf-8") as file:
    lines = [line.rstrip('\n') for line in file]
  return lines[:31170] # get rid of apocrypha

# Reload the Bibles in frame order, so bibles[k] belongs to the k-th row
bibles = [_read_verse_lines(file_name) for file_name in full_bibles.filepath]

full_bibles['bibles_index'] = range(len(bibles))

# Read in vref
vref = _read_verse_lines("vref.txt")
vref_noVerses = [reference.split(':')[0] for reference in vref] # just leave the book and chapter

In [None]:
# How many of each script are in our set of Bibles?
from collections import Counter
# Tally the `script` column: one count per Bible
counter = Counter(full_bibles.script)

In [None]:
# Display all scripts and counts
# (a bare expression on the last line of a cell is shown as that cell's output)
counter

In [None]:
# Read in some of the Ethnologue data
# The merge cell below uses its LanguageCode and LanguageFamily columns
ethnologue = pd.read_excel("data/Ethnologue/LanguageEthnologAdditionalData.xlsx")

In [None]:
# Get language family information into full_bibles dataframe
# merge() defaults to an inner join, so only Bibles whose language matches a LanguageCode are kept
full_bibles = (
    pd.merge(full_bibles, ethnologue[["LanguageCode","LanguageFamily"]], left_on="language", right_on="LanguageCode")
      .assign(language_family=lambda merged: merged["LanguageFamily"])
      .drop(columns=["LanguageCode","LanguageFamily"])
)

In [None]:
# Curate set of languages that we want to work with
is_latin = full_bibles.script=="Latin"

# For Latin languages, take one Bible from each language family (the one with the most verses)
latin_per_family = (
    full_bibles[is_latin]
    .sort_values("num_verses",ascending=False)
    .drop_duplicates(subset="language_family",keep="first")
)

# Latin picks first, then all non-Latin languages, then our English pivot translation
# NOTE(review): if the English pivot is also its family's Latin pick it appears twice here - confirm that is harmless
highest_df = pd.concat([
    latin_per_family,
    full_bibles[~is_latin],
    full_bibles[full_bibles.language=="eng"],
])

In [None]:
# Save these languages in case this crashes
import pickle

# File names of the curated Bibles, in highest_df order
savepaths = highest_df.filepath.tolist()

with open('data/103_filepaths.pkl','wb') as pickleFile:
  pickle.dump(savepaths,pickleFile)

In [None]:
# Using English as a pivot language, generate inputs, outputs, and instructions for LLM
# (the original took about 13 minutes; most of that was the per-chunk DataFrame
# lookups, which are now done once before the loop)

pivotLanguage = "eng"
pivotIndex = list(highest_df[highest_df.language==pivotLanguage].bibles_index)[0]
pivotTag = "eng_Latn"
# Order of first appearance in highest_df (not set order), so every run generates the data in the same order
nonPivotLanguages = list(dict.fromkeys(highest_df[highest_df.language != pivotLanguage].language))
startingVerse = 0
numVerses = 10
verseToken = "</VERSE>"
totalVerses = 31170
verseSeparator = ' ' + verseToken + ' '

# Resolve each language's Bible and tag once, from the first matching row
firstRows = highest_df.drop_duplicates(subset="language", keep="first")
bibleByLanguage = {lang: bibles[idx] for lang, idx in zip(firstRows.language, firstRows.bibles_index)}
tagByLanguage = dict(zip(firstRows.language, firstRows.llm_tag))

inputs = []
outputs = []
instructions = []

count = 0
while startingVerse < totalVerses:
    if count % 100 == 0:
        print(startingVerse) # so you can see how fast it's progressing
    count += 1

    # A chunk holds at most numVerses verses and stops at the first change of book/chapter
    endingVerse = min(startingVerse + numVerses, totalVerses)
    chapter = vref_noVerses[startingVerse]
    for i in range(startingVerse+1, endingVerse):
        if vref_noVerses[i] != chapter:
            endingVerse = i
            break

    pivotRelevant = bibles[pivotIndex][startingVerse:endingVerse]

    for language in nonPivotLanguages:
        relevant = bibleByLanguage[language][startingVerse:endingVerse]
        # Skip chunks where the target has no verse text. This must be checked on the
        # verses themselves: the joined string always ends in verseToken and is never blank.
        if not any(verse.strip() for verse in relevant): # no verses here
            continue

        # only include verses in target translation
        pivotKept = [pivotVerse for pivotVerse, verse in zip(pivotRelevant, relevant) if verse.strip() != '']
        if not any(pivotVerse.strip() for pivotVerse in pivotKept): # no verses here
            continue

        # NOTE(review): the output keeps a slot for every verse of the chunk (empty ones
        # included) while the input drops them, so the two can contain different numbers
        # of verseToken markers - confirm this is intended.
        # NOTE(review): '...' lines count as verse text here although the loading cell
        # does not count them as verses - confirm.
        translation = verseSeparator.join(relevant) + ' ' + verseToken
        pivotTranslation = verseSeparator.join(pivotKept) + ' ' + verseToken

        inputs.append(pivotTranslation)
        outputs.append(translation)
        instructions.append("Translate from " + pivotTag + " to " + tagByLanguage[language])

    startingVerse = endingVerse

In [None]:
# Save LLM data as a JSON file
# Takes less than a minute

import json

# Mode 'w' truncates: an existing data/103languages.jsonl is overwritten, not appended to
with open('data/103languages.jsonl', 'w') as output_file:
  # One JSON object per line. The loop names avoid `input`, which would replace
  # the builtin for the rest of the kernel session.
  for source_text, target_text, instruction_text in zip(inputs, outputs, instructions):
      record = {
          "input": source_text.strip(),
          "output": target_text.strip(),
          "instruction": instruction_text.strip()
      }
      json.dump(record, output_file)
      output_file.write("\n")

In [None]:
# Figure out which tokens to add to the tokenizers
tokens = set()
for language, llm_tag in zip(highest_df.language, highest_df.llm_tag):
  # '<language>_' prefix, plus whatever follows the first four characters of the tag (script abbreviation)
  tokens.update([language+'_', llm_tag[4:]])

print(list(tokens))