# Transliteration via Large Language Models (LLMs) 
---
This notebook provides code that transliterates English text using large language models (LLMs), specifically OpenAI's GPT models. To run this code, you need access to the OpenAI API. Visit [OpenAI's website](https://openai.com/index/openai-api/) to set up API access and purchase the quota you need. Once you have your API credentials, enter them in the following cell: your API key (`api_key`) and the API base URL (`api_base`).

---

In [None]:
#####################################################
############### OpenAI API credentials ##############
#####################################################

# Secret key; passed to the OpenAI client as `api_key`.
api_key = "sk_..."
# API endpoint; passed to the OpenAI client as `base_url`.
api_base = ""

#####################################################
#####################################################
#####################################################

import warnings
warnings.filterwarnings("ignore")

from phonemizer import phonemize
import pandas as pd
import collections
import numpy as np
from tqdm import tqdm
import glob
import os

import sys
sys.path.append('../sho_util/pyfiles/')
from gpt import gpt_api_no_stream, get_json_result

import re
from whisper.normalizers.english import EnglishNumberNormalizer, EnglishSpellingNormalizer, remove_symbols_and_diacritics

# keep numbers, -, and '
class EnglishTextNormalizer:
    """Lower-case and clean English text before it is split into words.

    Calling an instance removes bracketed/parenthesised spans and filler
    words, expands contractions and title abbreviations through the ordered
    regex table in ``self.replacers``, strips symbols (keeping ``.%$¢€£-'``)
    and applies the spelling normalizer. Number standardization is
    constructed but not applied inside ``__call__`` (the call is commented
    out), so digits are left as written.
    """

    def __init__(self):
        """Build the regex tables and the number/spelling normalizers."""
        # Filler words that are deleted outright.
        self.ignore_patterns = r"\b(hmm|mm|mhm|mmm|uh|um)\b"
        # Applied in insertion order by __call__, so specific patterns
        # (e.g. "won't") must stay ahead of the general ones (e.g. "n't").
        self.replacers = {
            # common contractions
            r"\bwon't\b": "will not",
            r"\bcan't\b": "can not",
            r"\blet's\b": "let us",
            r"\bain't\b": "aint",
            r"\by'all\b": "you all",
            r"\bwanna\b": "want to",
            r"\bgotta\b": "got to",
            r"\bgonna\b": "going to",
            r"\bi'ma\b": "i am going to",
            r"\bimma\b": "i am going to",
            r"\bwoulda\b": "would have",
            r"\bcoulda\b": "could have",
            r"\bshoulda\b": "should have",
            r"\bma'am\b": "madam",
            # contractions in titles/prefixes
            r"\bmr\b": "mister ",
            r"\bmrs\b": "missus ",
            r"\bst\b": "saint ",
            r"\bdr\b": "doctor ",
            r"\bprof\b": "professor ",
            r"\bcapt\b": "captain ",
            r"\bgov\b": "governor ",
            r"\bald\b": "alderman ",
            r"\bgen\b": "general ",
            r"\bsen\b": "senator ",
            r"\brep\b": "representative ",
            r"\bpres\b": "president ",
            r"\brev\b": "reverend ",
            r"\bhon\b": "honorable ",
            r"\basst\b": "assistant ",
            r"\bassoc\b": "associate ",
            r"\blt\b": "lieutenant ",
            r"\bcol\b": "colonel ",
            r"\bjr\b": "junior ",
            r"\bsr\b": "senior ",
            r"\besq\b": "esquire ",
            # perfect tenses, ideally it should be any past participles, but it's harder..
            r"'d been\b": " had been",
            r"'s been\b": " has been",
            r"'d gone\b": " had gone",
            r"'s gone\b": " has gone",
            r"'d done\b": " had done",  # "'s done" is ambiguous
            r"'s got\b": " has got",
            # general contractions
            r"n't\b": " not",
            r"'re\b": " are",
            # r"'s\b": " is",
            r"'d\b": " would",
            r"'ll\b": " will",
            r"'t\b": " not",
            r"'ve\b": " have",
            r"'m\b": " am",
        }
        # Not used by __call__ below, but kept as an attribute; other code in
        # this file calls `normalizer.standardize_numbers(word)` directly.
        self.standardize_numbers = EnglishNumberNormalizer()
        self.standardize_spellings = EnglishSpellingNormalizer()

    def __call__(self, s: str):
        """Return the normalized form of ``s`` (a lower-cased string)."""
        s = s.lower()

        s = re.sub(r"[<\[][^>\]]*[>\]]", "", s)  # remove words between brackets
        s = re.sub(r"\(([^)]+?)\)", "", s)  # remove words between parenthesis
        s = re.sub(self.ignore_patterns, "", s)
        # s = re.sub(r"\s+'", "'", s)  # when there's a space before an apostrophe

        # Order-dependent: patterns are applied in dict insertion order.
        for pattern, replacement in self.replacers.items():
            s = re.sub(pattern, replacement, s)

        s = re.sub(r"(\d),(\d)", r"\1\2", s)  # remove commas between digits
        s = re.sub(r"\.([^0-9]|$)", r" \1", s)  # remove periods not followed by numbers
        s = remove_symbols_and_diacritics(s, keep=".%$¢€£-'")  # keep numeric symbols

        # s = self.standardize_numbers(s)
        s = self.standardize_spellings(s)

        # now remove prefix/suffix symbols that are not preceded/followed by numbers
        s = re.sub(r"[.$¢€£]([^0-9])", r" \1", s)
        s = re.sub(r"([^0-9])%", r"\1 ", s)

        s = re.sub(r"\s+", " ", s)  # replace any successive whitespaces with a space

        return s
# Module-level instance shared by the rest of the notebook.
normalizer = EnglishTextNormalizer()


from openai import OpenAI
# Shared OpenAI client. An empty `api_base` is passed as None so the client
# falls back to its default endpoint instead of an empty base URL.
client = OpenAI(api_key=api_key, base_url=api_base or None)

def gpt_api_no_stream(prompt: str,
                      model="gpt-4o",
                      reset_messages: bool = True,
                      response_only: bool = True
                      ):
    """Send one user prompt to the chat-completions endpoint (no streaming).

    Uses the module-level ``client``. Errors raised while calling the API or
    reading the reply are not propagated; they are reported through the
    return value.

    Args:
        prompt: Text sent as the single user message.
        model: Model alias or name. Names containing "gpt-3.5", "gpt-4omini",
            "gpt-4o" or "gpt-o1mini" are mapped to pinned snapshots; any other
            value is passed through unchanged.
        reset_messages: When True the local message list is emptied after the
            call; when False the assistant reply is appended to it.
        response_only: When True the second return element is the reply text;
            otherwise it is the message list (which is empty when
            ``reset_messages`` is True).

    Returns:
        ``(True, text_or_messages)`` on success, or
        ``(False, "OpenAI API error: ...")`` on failure.

    ------------
    Examples
    ------------

    try:
        response = gpt_api_no_stream(prompt, model=model)[1]
    except AuthenticationError:
        continue
    if "OpenAI API error" in response:
        print(f"{response}")
    else:
        np.save(savepath, response)
    """

    # Order matters: "gpt-4omini" has to be tested before its prefix "gpt-4o".
    if "gpt-3.5" in model:
        model = "gpt-3.5-turbo-1106"
    elif "gpt-4omini" in model:
        model = "gpt-4o-mini-2024-07-18"
    elif "gpt-4o" in model:
        model = "gpt-4o-2024-11-20"
    elif "gpt-o1mini" in model:
        model = "o1-mini-2024-09-12"
    messages = [{'role': 'user', 'content': prompt}]

    try:
        completion = client.chat.completions.create(
            model=model,
            messages=messages,
        )
        choices = dict(completion).get('choices', None)
        if not choices:
            # Previously this path read `.message` from a dict, which always
            # raised; report the empty reply explicitly instead.
            return (False, 'OpenAI API error: the response contained no choices')
        reply = choices[0].message
        msg = reply.content
        reply_role = reply.role
    except Exception as err:
        return (False, f'OpenAI API error: {err}')

    if reset_messages:
        messages.pop(-1)
    else:
        # add text of response to messages
        messages.append({
            'role': reply_role,
            'content': msg
        })

    if response_only:
        return True, msg
    return True, messages

def get_json_result(response):
    """Extract and parse the last ``{...}`` object found in an LLM reply.

    The text after the final ``}`` is dropped, then the matching opening
    brace is found by scanning backwards and counting braces (braces inside
    string values are counted too). Before parsing, ``// ...`` comments are
    removed, lines mentioning "Target Text" or "Backchannel" are dropped
    (plus the following line when such a line ends with a colon), and
    ``null`` is replaced by ``"neutral"``.

    Args:
        response: Raw reply text.

    Returns:
        ``(True, parsed_object)`` on success, ``(False, None)`` when no
        parseable object is found.
    """
    # Local import keeps this function self-contained.
    import ast

    if not isinstance(response, str):
        return False, None

    try:
        # Keep everything up to and including the last closing brace.
        remaining = response[:response.rindex("}") + 1]

        # Walk backwards over "{" characters until every "}" seen so far has
        # been matched; `unmatched` counts closing braces still waiting.
        block = ""
        unmatched = 1
        while unmatched > 0:
            unmatched -= 1
            open_at = remaining.rindex("{")
            piece = remaining[open_at:]
            unmatched += piece[1:-1].count("}")
            block = piece + block
            remaining = remaining[:open_at]

        # ## Preprocessing
        block = re.sub(r"//.*?\n", "", block)  # delete comment-outs
        kept = []  # Delete "Target Text" and "Backchannel or not"
        skip_next = False
        for line in block.split("\n"):
            if skip_next:
                skip_next = False
                continue
            if "Target Text" in line or "Backchannel" in line:
                # A trailing colon means the value sits on the next line.
                skip_next = ":" in line[-2:]
                continue
            kept.append(line)
        block = "\n".join(kept)

        block = block.replace("null", '"neutral"')
        # SECURITY: the reply is untrusted model output. This used to be
        # eval(); literal_eval accepts the same dict/list/str/number literals
        # (including trailing commas) without executing arbitrary code.
        parsed = ast.literal_eval(block)

    except (ValueError, SyntaxError, TypeError, RecursionError):
        return False, None

    return True, parsed

def GetLLMPrompt(sentence, language, phonemized=None):
    """Build the transliteration prompt for one whitespace-separated sentence.

    Args:
        sentence: Text whose words are to be transliterated.
        language: Target language name, interpolated into the prompt.
        phonemized: Optional list with one phoneme string per word. Anything
            that is not exactly a ``list`` triggers phonemization of each
            word with the espeak en-us backend.

    Returns:
        The prompt string: instructions, a Hindi example, the
        ``word: phonemes`` listing between triple backticks, and a JSON
        skeleton for the model to fill in.
    """
    words = sentence.split()
    if type(phonemized) is not list:
        phonemized = [
            phonemize(token, language='en-us', backend='espeak', with_stress=True).split()[0]
            for token in words
        ]

    # Fixed instructions plus the worked example; the trailing "" puts a
    # newline after the opening code fence.
    header_lines = [
        f"Can you provide me with three {language} words to represent the phoneme sequences delimited by triple backticks. ",
        'For example, in Japanese, "Trail (tɹˈeɪl)" is expected to have Japanese representation of "トレイル"; where "\'" in phonemes represents the stress point of the word. ',
        f"Here, your task is to provide me with three {language} words that can replace the phoneme senquences, delimited by triple backticks.",
        "Please focus on phonetically similar characters instead of similar characters in terms of the meaning.",
        "The expected output should be in JSON format. ",
        "You can first list three possible choices of the words and then re-order them in order of the similarity of the pronunciation. ",
        "The following is the example in Hindi language.",
        "{",
        '  "I": {',
        '    "phonemes": "ˈaɪ",',
        '    "choices": ["आई", "ऐ", "आई"],',
        '    "similarity order": ["आई", "ऐ", "आई"]',
        "  },",
        '  "love": {',
        '    "phonemes": "lˈʌv",',
        '    "choices": ["लव", "लव", "लव"],',
        '    "similarity order": ["लव", "लव", "लव"]',
        "  },",
        '  "you": {',
        '    "phonemes": "juː",',
        '    "choices": ["यू", "यू", "यू"],',
        '    "similarity order": ["यू", "यू", "यू"]',
        "  },",
        "}",
        "```",
        "",
    ]
    prompt = "\n".join(header_lines)

    # One "word: phonemes" line per phoneme entry, then drop the final newline.
    listing = "".join(f"{words[idx]}: {ipa}\n" for idx, ipa in enumerate(phonemized))
    prompt = (prompt + listing)[:-1]

    prompt += (
        "\n```\n"
        "Again, the responses should be in a JSON format and sort them in order of the similarity to each phoneme sequence.\n"
        "{\n"
    )

    def _skeleton(token, ipa):
        # JSON stub the model is asked to complete for a single word.
        return (
            f'  "{token}": {{\n'
            f'"phonemes": "{ipa}",\n'
            f'"choices": [`1st choices of {language} characters`, `2nd choices of {language} characters`, `3rd choices of {language} characters`],\n'
            f'"similarity order": [`1st most similar {language} characters`, `2nd most similar {language} characters`, `3rd most similar {language} characters`],\n'
            '},\n'
        )

    stubs = [_skeleton(words[idx], ipa) for idx, ipa in enumerate(phonemized)]
    return prompt + "".join(stubs) + "}"

# Special-cased function words. Each entry maps a tag to
# [English phrase, list of phoneme strings for the words of that phrase].
# Only the first word of each phrase ("the" / "a") gets a fixed
# transliteration; the remaining words just provide pronunciation context.
adds = {
    "zhi": ["the", ["ðɪ"]],
    "za": ["the pineapple", ["ðə", "pˈaɪnæpəl"]],
    "ah": ["a little awkward", ["ɐ","lˈɪɾəl","ˈɔːkwɚd"]],
}

# Evaluate each word
# postprocessing[tag][language] -> most frequent transliteration of the first
# word of the phrase, aggregated over every saved response file.
# NOTE(review): only "Hindi" and "Korean" are filled in by the loop below.
postprocessing = {a: {} for a in adds}
for addname in adds:
# for addname in ["zhi"]:
    sentence, phonemized = adds[addname]
    # for language in ["Hindi", "Korean", "Japanese", "Russian"]:
    for language in ["Hindi", "Korean"]:
    # for language in ["Hindi", "Korean", "Japanese"]:
        # filelists = glob.glob(f"./LLM_responses/08-A/{language}/postprocessing_{addname}_*.npy")
        filelists = glob.glob(f"../../../seq2seq-vc/datasetgeneration/LLM_responses/08-A/{language}/postprocessing_{addname}_*.npy")
        a_list = []
        for path in filelists:
            response = np.load(path).item()
            try:
                # Evaluate the text from the first "{" through the last "}".
                # The two branches exist because a "-0" slice bound would be
                # empty when the response already ends with "}".
                # NOTE(review): eval() runs on file contents; only safe if
                # these files are trusted.
                a_list += [eval(response[response.index("{"):-1*response[::-1].index("}")]) if response[-1]!="}" else eval(response[response.index("{"):])]
            except ValueError:
                # Responses without braces are skipped.
                pass
        # print(f"{len(a_list)} / {len(filelists)}")
        # print("Normalized    :", sentence)
        # Keep only the entries for the words of the phrase.
        dirs = []
        for a in a_list:
            a = {key: a[key] for key in sentence.split()}
            dirs += [a]
        # Rank weighting: the j-th candidate is repeated (3 - j) times.
        for i in range(len(dirs)):
            for key in dirs[i]:
                newlist = []
                for j in range(len(dirs[i][key]["similarity order"])):
                    newlist += [dirs[i][key]["similarity order"][j]]*(3-j)
                dirs[i][key]["similarity order"] = newlist
        # NOTE(review): dirs[0] raises IndexError when no response file was
        # found or parsed for this tag/language.
        data = {key: [element for i in range(len(dirs)) for element in dirs[i][key]["similarity order"]] for key in dirs[0]}
        # Get the transliterated sentences
        arrays = []
        counts = []  # collected but not read afterwards in this block
        for word in sentence.split():
            c = collections.Counter(data[word])
            df = pd.DataFrame(c.items(), columns=["phonemes", "count"]).sort_values("count", ascending=False).values
            arrays += [df[0,0]]
            counts += [list(df[:1,1])]

        # Only the first word of the phrase is stored.
        postprocessing[addname][language] = arrays[0]

def CheckResultValidity(a, inputtext):
    """Check that a parsed LLM answer covers every word of ``inputtext``.

    The answer is valid when it is a dict with exactly one entry per distinct
    input word and every word maps to a dict. A word missing from the answer
    is retried under its number-normalized spelling
    (``normalizer.standardize_numbers``); when that key exists, the entry is
    also stored under the original word, so ``a`` may be mutated.

    Args:
        a: Parsed answer, expected to be ``{word: {...}}``.
        inputtext: Normalized sentence; split on whitespace.

    Returns:
        ``(True, a)`` when valid, otherwise ``(False, None)``.
    """
    # A non-dict answer (e.g. a set literal) cannot be indexed by word.
    if not isinstance(a, dict):
        return False, None

    words = inputtext.split()
    # Empty input is treated as invalid, as is any mismatch in entry count.
    if not words or len(a) != len(set(words)):
        return False, None

    for word in words:
        if word not in a:
            normalized_word = normalizer.standardize_numbers(word)
            # Direct membership test: only real keys count, so the lookup
            # on the next line cannot fail.
            if normalized_word not in a:
                return False, None
            a[word] = a[normalized_word]
        ## check the type of data
        if not isinstance(a[word], dict):
            return False, None
    return True, a

def PostprocessTransliteration(sentence, a_list):
    """Combine several parsed LLM answers into one transliterated sentence.

    Reads the module-level names ``normalizer``, ``adds``, ``postprocessing``,
    ``language`` and ``phonemize``.

    Args:
        sentence: Original (un-normalized) English sentence.
        a_list: List of parsed answers; each must contain an entry for every
            word of the normalized sentence, holding a ranked list under
            "similarity order" or "similarity_order".

    Returns:
        The transliterated words joined by spaces, with "." and ","
        re-attached, or None when punctuation placement hits an IndexError.
    """
    inputtext = normalizer(sentence)
    # Keep only the entries for the words of the normalized sentence.
    dirs = []
    for a in a_list:
        a = {key: a[key] for key in inputtext.split()}
        dirs += [a]
    ordernames = []
    for i in range(len(dirs)):
        for key in dirs[i]:
            newlist = []
            # Accept either spelling of the ranking key.
            try:
                ordername = "similarity order"
                dirs[i][key][ordername]
            except KeyError:
                ordername = "similarity_order"
            # delete duplicated words
            # (keeps the first occurrence of each candidate, in ranked order)
            candidates = list(set(dirs[i][key][ordername]))
            newwords = []
            for tword in dirs[i][key][ordername]:
                if tword in candidates:
                    candidates.remove(tword)
                    newwords += [tword]
                if len(candidates)==0:
                    break
            dirs[i][key][ordername] = newwords
            # Rank weighting: the j-th unique candidate is repeated
            # (number of unique candidates - j) times.
            for j in range(len(dirs[i][key][ordername])):
                newlist += [dirs[i][key][ordername][j]]*(len(newwords)-j)
                # newlist += [dirs[i][key][ordername][j]]
            dirs[i][key][ordername] = newlist
        # NOTE(review): records the key name used by the LAST word of this
        # answer; assumes every word in one answer uses the same spelling.
        ordernames += [ordername]
    # Pool the weighted candidates of all answers per word.
    # NOTE(review): dirs[0] raises IndexError when a_list is empty.
    data = {key: [element for i in range(len(dirs)) for element in dirs[i][key][ordernames[i]]] for key in dirs[0]}

    # Get the transliterated sentences
    arrays = []
    words = inputtext.split()
    for w, word in enumerate(words):
        # First words of the `adds` phrases ("the", "a") use the precomputed
        # transliterations instead of the pooled votes.
        if word in set([a[0].split(" ")[0] for a in list(adds.values())]):
            if word=="a":
                # NOTE(review): `language` is the module-level variable, not a parameter.
                arrays += [postprocessing["ah"][language]]
            if word=="the":
                # Phonemize "the" together with the next word and pick the
                # variant whose phonemes match.
                # NOTE(review): words[w+1] raises IndexError (outside the try
                # below) when "the" is the last word.
                pro = phonemize(words[w] + " " + words[w+1], language='en-us', backend='espeak', with_stress=True).split()[0]
                # If neither variant matches, the loop ends with the == "za".
                for the in ["zhi", "za"]:
                    if adds[the][1][0]==pro:
                        break
                arrays += [postprocessing[the][language]]
        else:
            # Most frequent candidate wins.
            c = collections.Counter(data[word])
            df = pd.DataFrame(c.items(), columns=["phonemes", "count"]).sort_values("count", ascending=False).values
            arrays += [df[0,0]]

    # put period and comma
    try:
        targets = [".", ","]
        now = 0  # index into `arrays` of the word currently being aligned
        english_arrays = inputtext.split()  # updated in parallel but not returned
        for word in sentence.split():
        # for word in sentence.split():
            # One original token can normalize to several words; move `now`
            # to the last of them before attaching punctuation.
            normalized = normalizer(word)
            num = len(normalized.split(" "))
            now += (num-1)
            for target in targets:
                if target in word:
                    arrays[now] += target
                    english_arrays[now] += target
            now += 1
    except IndexError:
        return None
    return " ".join(arrays)

def GetResult(prompt, gptmodel, display_print=False, max_trials=None):
    """Query the model until it returns a parseable, valid answer.

    Each trial sends ``prompt`` through ``gpt_api_no_stream``, extracts the
    JSON-like object with ``get_json_result`` and validates it with
    ``CheckResultValidity`` against the module-level ``inputtext``
    (which must be set by the caller before this function runs).

    Args:
        prompt: Prompt text to send.
        gptmodel: Model alias forwarded to ``gpt_api_no_stream``.
        display_print: When True, print the outcome of every trial.
        max_trials: Optional cap on the number of trials. ``None`` (default)
            retries without limit.

    Returns:
        The validated answer dict.

    Raises:
        RuntimeError: If ``max_trials`` is set and no trial succeeded.
    """
    trial = 0
    while max_trials is None or trial < max_trials:
        trial += 1  # counted per attempt so the printed number advances

        reply = gpt_api_no_stream(prompt, model=gptmodel)
        succeeded, response = reply[0], reply[1]
        if not succeeded:
            # `response` holds the API error text; there is nothing to parse.
            if display_print:
                print(f"Trial {trial}: {response}")
            continue

        getresult, a = get_json_result(response)
        if not getresult:
            if display_print:
                print(f"Trial {trial}: Error in Converting Json Format")
            continue

        valid, result = CheckResultValidity(a, inputtext)
        if valid:
            if display_print:
                print(f"Trial {trial}: Success!!!")
            return result
        if display_print:
            print(f"Trial {trial}: The result is not valid")

    raise RuntimeError(f"No valid response obtained after {max_trials} trials")

---
# Trial of Transliteration via LLMs
---

In this example, we will transliterate an English sentence using a GPT model. Please adjust the following variables:

- `sentence`: A string containing the English sentence you wish to transliterate.
- `language`: A string specifying the target language. Supported options are "Hindi", "Korean", and "Japanese".
- `gptmodel`: A string indicating which GPT model to use. Available options include "gpt-3.5", "gpt-4omini", "gpt-4o", and "gpt-o1mini". You can add or modify the list of models by editing the alias mapping in `gpt_api_no_stream`, which is defined in the setup cell above and overrides the version imported from `MacST-project-page/sho_util/pyfiles/gpt.py`.

Feel free to try out the transliteration with one response from the GPT model using these variables.

---

In [None]:
###########################################
########## Adjustable Parameters ##########
###########################################

sentence = "Transliterate English text into Hindi text."
language = "Hindi"
gptmodel = "gpt-3.5"

###########################################
###########################################
###########################################

# `inputtext` must be a module-level name: GetResult reads it for validation.
inputtext = normalizer(sentence)
prompt = GetLLMPrompt(inputtext, language)
result = GetResult(prompt, gptmodel)
transliterated = PostprocessTransliteration(sentence, [result])

print("English       :", sentence)
print("Normalized    :", inputtext)
print("Transliterated:", transliterated)
print("\n----------------------------------------\n----------------------------------------\n----------------------------------------\n")
print("PROMPT:\n")
print(prompt)
print("\n----------------------------------------\n----------------------------------------\n----------------------------------------\n")
print("Response:\n")
# GetResult returns only the parsed answer, so that is what is shown here.
# This cell never defines `response`; printing it raised NameError or showed
# an unrelated value left over from an earlier cell.
print(result)

---
# Transliterate Multiple Texts
---

In this example, we will transliterate multiple English sentences using a GPT model. To improve the reliability of the results, the code generates several transliteration responses for each sentence. Adjust the following variables as needed:

- `sentence_list`: A dictionary where each key is a text name and the corresponding value is the English sentence you want to transliterate.
- `language`: A string specifying the target language for transliteration. The supported options are "Hindi", "Korean", and "Japanese".
- `gptmodel`: A string that indicates which GPT model to use. The available options include "gpt-3.5", "gpt-4omini", "gpt-4o", and "gpt-o1mini". You can add or modify the list of models by editing the alias mapping in `gpt_api_no_stream`, which is defined in the setup cell above and overrides the version imported from `MacST-project-page/sho_util/pyfiles/gpt.py`.
- `savedir` : A string that specifies the directory where all transliteration responses will be saved.
- `repeatnum`: An integer that sets the number of responses (transliterations) to generate for each sentence.
- `reset_response`: A boolean that determines whether to re-generate the transliteration responses, even if previous responses exist in `savedir`.
- `transliterated_results` (output, not a setting): A dictionary produced by the cell, where each key is a text name and the corresponding value is the transliterated text.

---

In [None]:
###########################################
########## Adjustable Parameters ##########
###########################################

sentence_list = {
    "text1": "ICASSP in India.",
    "text2": "I'm Sho Inoue.",
}
language = "Hindi"
gptmodel = "gpt-3.5"
savedir = f"./responses_{language}_{gptmodel}/"
repeatnum = 3 # Increase this number for more reliable transliteration
reset_response = False

###########################################
###########################################
###########################################

# Save the valid responses
# Stage 1: for every text, request `repeatnum` validated answers and store
# each one as "<savedir><key>_<r>.npy".
for key in sentence_list:
    print(key)
    exist_length = len(glob.glob(savedir+f"{key}_*.npy"))
    # Skip texts that already have enough saved answers, unless regenerating.
    if not(reset_response) and exist_length>=repeatnum:
        continue
    sentence = sentence_list[key]
    # `inputtext` must be assigned at module level before GetResult is
    # called: GetResult validates against this global.
    inputtext = normalizer(sentence)
    prompt = GetLLMPrompt(inputtext, language)
    
    for r in tqdm(range(repeatnum)):
        savepath = savedir + f"{key}_{r}.npy"
        # Keep an existing answer for this slot unless regenerating.
        if not(reset_response) and os.path.exists(savepath):
            continue
        result = GetResult(prompt, gptmodel, display_print=False)
        os.makedirs(os.path.dirname(savepath), exist_ok=True)
        # `result` is a dict, so it is loaded back below with
        # allow_pickle=True and .item().
        np.save(savepath, result)

# Stage 2: combine all saved answers of each text into one transliteration.
# NOTE(review): the glob picks up every "<key>_*.npy" in savedir, including
# files with an index >= repeatnum left over from earlier runs.
transliterated_results = {}
for key in sentence_list:
    files = glob.glob(savedir+f"{key}_*.npy")
    transliterated = PostprocessTransliteration(sentence_list[key], [np.load(path, allow_pickle=True).item() for path in files])
    transliterated_results[key] = transliterated
    
# Stage 3: report.
for key in sentence_list:
    print("\n----------------------------------------\n----------------------------------------\n----------------------------------------\n")
    print("Key           :", key)
    print("English       :", sentence_list[key])
    print("Transliterated:", transliterated_results[key])