In [None]:
# Optional one-time setup (uncomment to run): clone the MLRS/malti repository.
#!git clone https://github.com/MLRS/malti.git

In [4]:
import requests
from src.transliterate import transliterate
import src.token_rankers as token_rankers

In [None]:
# Resources handed to `transliterate` by the cells below: the token-mapping
# files and the ranker objects built from the word- and character-level
# .arpa files.
#
# The ranker classes are imported directly here instead of being reached
# through the `token_rankers` module alias. The list below intentionally keeps
# the name `token_rankers` (later cells pass it to `transliterate`), which
# rebinds that alias; going through the alias would make this cell fail with
# AttributeError the second time it is run in the same kernel.
from src.token_rankers import CharacterModelScoreRanker, WordModelScoreRanker

token_mappings = [
    "./src/token_mappings/small_closed_class.map",
    "./src/token_mappings/additional_closed_class.map",
]
token_rankers = [
    WordModelScoreRanker("./language_models/aggregated_country/lm/word/tn-maghreb.arpa"),
    CharacterModelScoreRanker("./language_models/aggregated_country/lm/char/tn-maghreb.arpa"),
]

In [6]:
# Cache of parsed mapping files, keyed by file path, so that each file is read
# and parsed at most once per kernel session.
TOKEN_MAPPINGS = {}


def get_token_mappings_aeb(path: str) -> dict[str, str]:
    """Load a tab-separated token mapping file and cache the result.

    Every non-blank line must contain exactly three tab-separated fields:
    the source token, an English field (ignored), and the target mapping.
    Leading/trailing whitespace of each line is stripped before splitting.
    Blank or whitespace-only lines are skipped. If a token occurs more than
    once, the last occurrence wins.

    Args:
        path: Path of the UTF-8 encoded mapping file.

    Returns:
        A dict from source token to its mapping. The same dict object is
        returned on subsequent calls with the same ``path``.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line does not have exactly three fields.
            Nothing is cached for ``path`` in that case.
    """
    if path not in TOKEN_MAPPINGS:
        mappings = {}
        with open(path, "r", encoding="utf-8") as file:
            for line_number, line in enumerate(file, start=1):
                stripped = line.strip()
                if not stripped:
                    # Tolerate empty lines (e.g. a trailing newline at end of file).
                    continue
                fields = stripped.split("\t")
                if len(fields) != 3:
                    raise ValueError(
                        f"{path}: line {line_number}: expected 3 tab-separated "
                        f"fields, found {len(fields)}"
                    )
                token, _english, mapping = fields
                mappings[token] = mapping
        # Only publish to the cache after the whole file parsed successfully.
        TOKEN_MAPPINGS[path] = mappings
    return TOKEN_MAPPINGS[path]

In [7]:
# Path of the tab-separated translation table that is consulted first.
mt_en_aeb_translations = "./src/translations/1504_mt_en_aeb.txt"


def translate_token_aeb(token: str) -> str:
    """Translate ``token`` via the translation table, else transliterate it.

    The table at ``mt_en_aeb_translations`` is loaded through
    ``get_token_mappings_aeb``. When ``token`` has an entry there, that entry
    is returned; otherwise the result of ``transliterate`` (called with
    ``return_token_merge=False``) is returned.
    """
    known_translations = get_token_mappings_aeb(mt_en_aeb_translations)
    if token in known_translations:
        return known_translations[token]
    # No table entry for this token: fall back to transliteration.
    return transliterate(token, token_mappings, token_rankers, return_token_merge=False)

In [None]:
import pickle
import sys

# Put ./src on the import path; presumably needed so that modules inside it can
# be resolved by their bare names (e.g. while importing featurise or
# unpickling the model) -- TODO confirm. Guarded so that re-running this cell
# does not keep appending duplicate entries.
if './src' not in sys.path:
    sys.path.append('./src')
from src.etymology_classification import featurise

# SECURITY: pickle.load can execute arbitrary code contained in the file.
# Only load a model file obtained from a trusted source.
with open("./src/etymology_data/model.pickle", "rb") as file:
    model = pickle.load(file)

# Demo: mixed transliteration/translation pipeline for Tunisian Arabic

In [None]:
# Demo sentence fed through the pipeline.
text = "Il-karozza Porsche tal-2022 għandha speed fenomenali!"

# Tokenise with the MLRS web API. The timeout keeps the cell from hanging
# forever if the service is unreachable, and raise_for_status() reports HTTP
# errors directly instead of failing later while decoding the response body.
response = requests.get(
    "https://mlrs.research.um.edu.mt/tools/mlrsapi/tokenise",
    params={"text": text},
    timeout=30,
)
response.raise_for_status()
tokens = response.json()["result"]

# Classify the token sequence; the first (only) prediction row is used
# position-by-position alongside `tokens` in the next cell.
labels = model.predict([featurise(tokens)])[0]
labels.tolist()

In [10]:
# Step 1: pick a strategy per token from its label. Tokens labelled "Arabic"
# or "Symbol" are transliterated; every other label goes through
# translate_token_aeb.
transliteration_translation_ar = [
    transliterate(tokens[position], token_mappings, token_rankers, return_token_merge=False)
    if tag in ("Arabic", "Symbol")
    else translate_token_aeb(tokens[position])
    for position, tag in enumerate(labels)
]

# Step 2: merge adjacent tokens according to Arabic orthographic conventions.
# The merge flag always comes from transliterating the original token, even
# when the output text for that token came from the translation table.
boolean_list = []
for source_token in tokens:
    _unused_text, merge_flag = transliterate(
        source_token, token_mappings, token_rankers, return_token_merge=True
    )
    boolean_list.append(merge_flag)

# Step 3: a token whose flag is truthy is written without a following space;
# otherwise a single space is appended after it.
combined_str = ''.join(
    piece if attach_to_next else piece + ' '
    for piece, attach_to_next in zip(transliteration_translation_ar, boolean_list)
)
print(text)
print(labels)
print(combined_str)

Il-karozza Porsche tal-2022 għandha speed fenomenali!
['Arabic' 'Non-Arabic' 'Name' 'Arabic' 'Symbol' 'Arabic' 'Code-Switching'
 'Non-Arabic' 'Symbol']
الالسيارات برسكهي تاع ال٢٠٢٢ عندها صباد الظواهر ! 
